patch: add MAP_HUGETLB to mmap() where supported (WIP)

Started by Richard Poole over 12 years ago (82 messages)
#1Richard Poole
richard@2ndQuadrant.com
1 attachment(s)

The attached patch adds the MAP_HUGETLB flag to mmap() for shared memory
on systems that support it. It's based on Christian Kruse's patch from
last year, incorporating suggestions from Andres Freund.

On a system with 4GB shared_buffers, doing pgbench runs long enough for
each backend to touch most of the buffers, this patch saves nearly 8MB of
memory per backend and improves performance by just over 2% on average.

It is still WIP as there are a couple of points that Andres has pointed
out to me that haven't been addressed yet; also, the documentation is
incomplete.

Richard

--
Richard Poole http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

Attachments:

hugepages-v1.patch (text/x-diff; charset=us-ascii) Download
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 23ebc11..703b28f 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1052,6 +1052,42 @@ include 'filename'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-huge-tlb-pages" xreflabel="huge_tlb_pages">
+      <term><varname>huge_tlb_pages</varname> (<type>enum</type>)</term>
+      <indexterm>
+       <primary><varname>huge_tlb_pages</> configuration parameter</primary>
+      </indexterm>
+      <listitem>
+       <para>
+        Enables/disables the use of huge tlb pages. Valid values are
+        <literal>on</literal>, <literal>off</literal> and <literal>try</literal>.
+        The default value is <literal>try</literal>.
+       </para>
+
+	   <para>
+	   Use of huge tlb pages reduces the cpu time spent on memory management and
+	   the amount of memory used for page tables and therefore improves performance.
+	   </para>
+
+       <para>
+        With <varname>huge_tlb_pages</varname> set to <literal>on</literal>
+        <symbol>mmap()</symbol> will be called with <symbol>MAP_HUGETLB</symbol>.
+        If the call fails the server will fail fatally.
+       </para>
+
+       <para>
+        With <varname>huge_tlb_pages</varname> set to <literal>off</literal> we
+        will not use <symbol>MAP_HUGETLB</symbol> at all.
+       </para>
+
+       <para>
+        With <varname>huge_tlb_pages</varname> set to <literal>try</literal>
+        we will try to use <symbol>MAP_HUGETLB</symbol> and fall back to
+        <symbol>mmap()</symbol> without <symbol>MAP_HUGETLB</symbol>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-temp-buffers" xreflabel="temp_buffers">
       <term><varname>temp_buffers</varname> (<type>integer</type>)</term>
       <indexterm>
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 20e3c32..57fff35 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -27,10 +27,14 @@
 #ifdef HAVE_SYS_SHM_H
 #include <sys/shm.h>
 #endif
+#ifdef MAP_HUGETLB
+#include <dirent.h>
+#endif
 
 #include "miscadmin.h"
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
+#include "utils/guc.h"
 
 
 typedef key_t IpcMemoryKey;		/* shared memory key passed to shmget(2) */
@@ -61,6 +65,13 @@ typedef int IpcMemoryId;		/* shared memory ID returned by shmget(2) */
 #define MAP_FAILED ((void *) -1)
 #endif
 
+#ifdef MAP_HUGETLB
+#define PG_HUGETLB_BASE_ADDR (void *)(0x0UL)
+#define PG_MAP_HUGETLB MAP_HUGETLB
+#else
+#define PG_MAP_HUGETLB 0
+#endif
+
 
 unsigned long UsedShmemSegID = 0;
 void	   *UsedShmemSegAddr = NULL;
@@ -342,6 +353,161 @@ PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
 }
 
 
+#ifdef MAP_HUGETLB
+#define HUGE_PAGE_INFO_DIR  "/sys/kernel/mm/hugepages"
+
+/*
+ *	static long InternalGetFreeHugepagesCount(const char *name)
+ *
+ * Attempt to read the number of available hugepages from
+ * /sys/kernel/mm/hugepages/hugepages-<size>/free_hugepages
+ * Will fail (return -1) if file could not be opened, 0 if no pages are available
+ * and > 0 if there are free pages
+ *
+ */
+static long
+InternalGetFreeHugepagesCount(const char *name)
+{
+	int fd;
+	char buff[1024];
+	size_t len;
+	long result;
+	char *ptr;
+
+	len = snprintf(buff, 1024, "%s/%s/free_hugepages", HUGE_PAGE_INFO_DIR, name);
+	if (len == 1024) /* I don't think that this will happen ever */
+	{
+		ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+				(errmsg("Filename %s/%s/free_hugepages is too long", HUGE_PAGE_INFO_DIR, name),
+				 errcontext("while checking hugepage size")));
+		return -1;
+	}
+
+	fd = open(buff, O_RDONLY);
+	if (fd <= 0)
+	{
+		ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+				(errmsg("Could not open file %s: %s", buff, strerror(errno)),
+				 errcontext("while checking hugepage size")));
+		return -1;
+	}
+
+	len = read(fd, buff, 1024);
+	if (len <= 0)
+	{
+		ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+				(errmsg("Error reading from file %s: %s", buff, strerror(errno)),
+				 errcontext("while checking hugepage size")));
+		close(fd);
+		return -1;
+	}
+
+	/*
+	 * If the content of free_hugepages is longer than or equal to 1024 bytes
+	 * the rest is irrelevant; we simply want to know if there are any
+	 * hugepages left
+	 */
+	if (len == 1024)
+	{
+		buff[1023] = 0;
+	}
+	else
+	{
+		buff[len] = 0;
+	}
+
+	close(fd);
+
+	result = strtol(buff, &ptr, 10);
+
+	if (ptr == NULL)
+	{
+		ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+				(errmsg("Could not convert contents of file %s/%s/free_hugepages to number", HUGE_PAGE_INFO_DIR, name),
+				 errcontext("while checking hugepage size")));
+		return -1;
+	}
+
+	return result;
+}
+
+/*
+ *	static long InternalGetHugepageSize()
+ *
+ * Attempt to get a valid hugepage size from /sys/kernel/mm/hugepages/ by
+ * reading directory contents
+ * Will fail (return -1) if the directory could not be opened or no valid
+ * page sizes are available. Will return the smallest hugepage size on
+ * success.
+ *
+ */
+static long
+InternalGetHugepageSize()
+{
+	struct dirent *ent;
+	DIR *dir = opendir(HUGE_PAGE_INFO_DIR);
+	long smallest_size = -1, size;
+	bool valid_size_found = false;
+	char *ptr;
+
+	if (dir == NULL)
+	{
+		ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+				(errmsg("Could not open directory %s: %s", HUGE_PAGE_INFO_DIR, strerror(errno)),
+				 errcontext("while checking hugepage size")));
+		return -1;
+	}
+
+	/*
+	 * Linux supports multiple hugepage sizes if the hardware
+	 * supports it; for each possible size there will be a
+	 * directory in /sys/kernel/mm/hugepages consisting of the
+	 * string hugepages- and the size of the page, e.g. on x86_64:
+	 * hugepages-2048kB
+	 */
+	while((ent = readdir(dir)) != NULL)
+	{
+		if (strncmp(ent->d_name, "hugepages-", 10) == 0)
+		{
+			size = strtol(ent->d_name + 10, &ptr, 10);
+			if (ptr == NULL)
+			{
+				continue;
+			}
+
+			if (strcmp(ptr, "kB") == 0)
+			{
+				size *= 1024;
+			}
+
+			if ((smallest_size == -1 || size < smallest_size)) {
+				valid_size_found = true;
+				if(InternalGetFreeHugepagesCount(ent->d_name) > 0)
+					smallest_size = size;
+			}
+		}
+	}
+
+	closedir(dir);
+
+	if (smallest_size == -1)
+	{
+		if(valid_size_found)
+			ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+					(errmsg("No free hugepages"),
+					 errhint("There were no free huge pages of any size")));
+		else
+			ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+				(errmsg("Could not find a valid hugepage size"),
+				 errhint("This error usually means that either CONFIG_HUGETLB_PAGE "
+						 "is not in kernel or that your architecture does not "
+						 "support hugepages or you did not configure hugepages")));
+	}
+
+	return smallest_size;
+}
+#endif
+
 /*
  * PGSharedMemoryCreate
  *
@@ -391,7 +557,17 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 	 */
 #ifndef EXEC_BACKEND
 	{
+#ifdef MAP_HUGETLB
+		long	pagesize = 0;
+
+		if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY)
+			pagesize = InternalGetHugepageSize();
+
+		if (pagesize <= 0)
+			pagesize = sysconf(_SC_PAGE_SIZE);
+#else
 		long		pagesize = sysconf(_SC_PAGE_SIZE);
+#endif
 
 		/*
 		 * Ensure request size is a multiple of pagesize.
@@ -410,8 +586,22 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 		 * out to be false, we might need to add a run-time test here and do
 		 * this only if the running kernel supports it.
 		 */
-		AnonymousShmem = mmap(NULL, size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS,
-							  -1, 0);
+
+		if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY)
+		{
+			AnonymousShmem = mmap(PG_HUGETLB_BASE_ADDR, size, PROT_READ|PROT_WRITE,
+								  PG_MMAP_FLAGS|PG_MAP_HUGETLB, -1, 0);
+
+			elog(DEBUG3, "mmap() tried with MAP_HUGEPAGE: %p", AnonymousShmem);
+		}
+
+		if ((AnonymousShmem == MAP_FAILED && huge_tlb_pages == HUGE_TLB_TRY)
+			|| huge_tlb_pages == HUGE_TLB_OFF)
+		{
+			AnonymousShmem = mmap(NULL, size, PROT_READ|PROT_WRITE, PG_MMAP_FLAGS,
+								  -1, 0);
+		}
+
 		if (AnonymousShmem == MAP_FAILED)
 			ereport(FATAL,
 					(errmsg("could not map anonymous shared memory: %m"),
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 7d297bc..3b26caa 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -22,6 +22,7 @@
 #include <limits.h>
 #include <unistd.h>
 #include <sys/stat.h>
+#include <sys/mman.h>
 #ifdef HAVE_SYSLOG
 #include <syslog.h>
 #endif
@@ -381,6 +382,22 @@ static const struct config_enum_entry synchronous_commit_options[] = {
 };
 
 /*
+ * huge_tlb_pages may be on|off|try, where try is the default
+ * on: try to mmap() with MAP_HUGETLB and fail when mmap() fails
+ * off: do not try to mmap() with MAP_HUGETLB
+ * try: try to mmap() with MAP_HUGETLB and fallback to mmap()
+ *      w/o MAP_HUGETLB
+ */
+static const struct config_enum_entry huge_tlb_options[] = {
+#ifdef MAP_HUGETLB
+	{"on", HUGE_TLB_ON, false},
+	{"try", HUGE_TLB_TRY, false},
+#endif
+	{"off", HUGE_TLB_OFF, false},
+	{NULL, 0, false}
+};
+
+/*
  * Options for enum values stored in other modules
  */
 extern const struct config_enum_entry wal_level_options[];
@@ -439,6 +456,12 @@ int			tcp_keepalives_idle;
 int			tcp_keepalives_interval;
 int			tcp_keepalives_count;
 
+#ifdef MAP_HUGETLB
+int huge_tlb_pages = HUGE_TLB_TRY;
+#else
+int huge_tlb_pages = HUGE_TLB_OFF;
+#endif
+
 /*
  * These variables are all dummies that don't do anything, except in some
  * cases provide the value for SHOW to display.  The real state is elsewhere
@@ -3354,6 +3377,26 @@ static struct config_enum ConfigureNamesEnum[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"huge_tlb_pages",
+#ifdef MAP_HUGETLB
+			PGC_SUSET,
+#else
+			PGC_INTERNAL,
+#endif
+			RESOURCES_MEM,
+			gettext_noop("Enable/disable the use of the hugepages feature"),
+			NULL
+		},
+		&huge_tlb_pages,
+#ifdef MAP_HUGETLB
+		HUGE_TLB_TRY,
+#else
+		HUGE_TLB_OFF,
+#endif
+		huge_tlb_options,
+		NULL, NULL, NULL
+	},
 
 	/* End-of-list marker */
 	{
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index d69a02b..7c826d5 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -113,6 +113,7 @@
 
 #shared_buffers = 32MB			# min 128kB
 					# (change requires restart)
+#huge_tlb_pages = try			# try to map memory with MAP_HUGETLB (on, off, try)
 #temp_buffers = 8MB			# min 800kB
 #max_prepared_transactions = 0		# zero disables the feature
 					# (change requires restart)
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index 99211c1..c2fdba4 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -235,6 +235,24 @@ extern int	tcp_keepalives_idle;
 extern int	tcp_keepalives_interval;
 extern int	tcp_keepalives_count;
 
+
+/*
+ * Possible values for huge_tlb_pages; default is HUGE_TLB_TRY
+ */
+typedef enum
+{
+	HUGE_TLB_OFF,
+	HUGE_TLB_ON,
+	HUGE_TLB_TRY
+} HugeTlbType;
+
+
+/*
+ * configure the use of huge TLB pages
+ */
+extern int huge_tlb_pages;
+
+
 /*
  * Functions exported by guc.c
  */
#2Peter Eisentraut
peter_e@gmx.net
In reply to: Richard Poole (#1)
Re: patch: add MAP_HUGETLB to mmap() where supported (WIP)

On Sat, 2013-09-14 at 00:41 +0100, Richard Poole wrote:

The attached patch adds the MAP_HUGETLB flag to mmap() for shared
memory on systems that support it.

Please fix the tabs in the SGML files.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#3Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Richard Poole (#1)
Re: patch: add MAP_HUGETLB to mmap() where supported (WIP)

On 14.09.2013 02:41, Richard Poole wrote:

The attached patch adds the MAP_HUGETLB flag to mmap() for shared memory
on systems that support it. It's based on Christian Kruse's patch from
last year, incorporating suggestions from Andres Freund.

I don't understand the logic in figuring out the pagesize, and the
smallest supported hugepage size. First of all, even without the patch,
why do we round up the size passed to mmap() to the _SC_PAGE_SIZE?
Surely the kernel will round up the request all by itself. The mmap()
man page doesn't say anything about length having to be a multiple of
pages size.

And with the patch, why do you bother detecting the minimum supported
hugepage size? Surely the kernel will choose the appropriate hugepage
size just fine on its own, no?

It is still WIP as there are a couple of points that Andres has pointed
out to me that haven't been addressed yet;

Which points are those?

I wonder if it would be better to allow setting huge_tlb_pages=try even
on platforms that don't have hugepages. It would simply mean the same as
'off' on such platforms.

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#4Andres Freund
andres@2ndquadrant.com
In reply to: Heikki Linnakangas (#3)
Re: patch: add MAP_HUGETLB to mmap() where supported (WIP)

On 2013-09-16 11:15:28 +0300, Heikki Linnakangas wrote:

On 14.09.2013 02:41, Richard Poole wrote:

The attached patch adds the MAP_HUGETLB flag to mmap() for shared memory
on systems that support it. It's based on Christian Kruse's patch from
last year, incorporating suggestions from Andres Freund.

I don't understand the logic in figuring out the pagesize, and the smallest
supported hugepage size. First of all, even without the patch, why do we
round up the size passed to mmap() to the _SC_PAGE_SIZE? Surely the kernel
will round up the request all by itself. The mmap() man page doesn't say
anything about length having to be a multiple of pages size.

I think it does:
EINVAL We don't like addr, length, or offset (e.g., they are too
large, or not aligned on a page boundary).
and
A file is mapped in multiples of the page size. For a file that is not a multiple
of the page size, the remaining memory is zeroed when mapped, and writes to that
region are not written out to the file. The effect of changing the size of the
underlying file of a mapping on the pages that correspond to added or removed
regions of the file is unspecified.

And no, according to my past experience, the kernel does *not* do any
such rounding up. It will just fail.

And with the patch, why do you bother detecting the minimum supported
hugepage size? Surely the kernel will choose the appropriate hugepage size
just fine on its own, no?

It will fail if it's not a multiple.

It is still WIP as there are a couple of points that Andres has pointed
out to me that haven't been addressed yet;

Which points are those?

I don't know which point Richard already has fixed, so I'll let him
comment on that.

I wonder if it would be better to allow setting huge_tlb_pages=try even on
platforms that don't have hugepages. It would simply mean the same as 'off'
on such platforms.

I wouldn't argue against that.

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#5Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Andres Freund (#4)
1 attachment(s)
Re: patch: add MAP_HUGETLB to mmap() where supported (WIP)

On 16.09.2013 13:15, Andres Freund wrote:

On 2013-09-16 11:15:28 +0300, Heikki Linnakangas wrote:

On 14.09.2013 02:41, Richard Poole wrote:

The attached patch adds the MAP_HUGETLB flag to mmap() for shared memory
on systems that support it. It's based on Christian Kruse's patch from
last year, incorporating suggestions from Andres Freund.

I don't understand the logic in figuring out the pagesize, and the smallest
supported hugepage size. First of all, even without the patch, why do we
round up the size passed to mmap() to the _SC_PAGE_SIZE? Surely the kernel
will round up the request all by itself. The mmap() man page doesn't say
anything about length having to be a multiple of pages size.

I think it does:
EINVAL We don't like addr, length, or offset (e.g., they are too
large, or not aligned on a page boundary).

That doesn't mean that they *all* have to be aligned on a page boundary.
It's understandable that 'addr' and 'offset' have to be, but it doesn't
make much sense for 'length'.

and
A file is mapped in multiples of the page size. For a file that is not a multiple
of the page size, the remaining memory is zeroed when mapped, and writes to that
region are not written out to the file. The effect of changing the size of the
underlying file of a mapping on the pages that correspond to added or removed
regions of the file is unspecified.

And no, according to my past experience, the kernel does *not* do any
such rounding up. It will just fail.

I wrote a little test program to play with different values (attached).
I tried this on my laptop with a 3.2 kernel (uname -r: 3.10-2-amd64), and
on a VM with a fresh Centos 6.4 install with 2.6.32 kernel
(2.6.32-358.18.1.el6.x86_64), and they both work the same:

$ ./mmaptest 100 # mmap 100 bytes

in a different terminal:
$ cat /proc/meminfo | grep HugePages_Rsvd
HugePages_Rsvd: 1

So even a tiny allocation, much smaller than any page size, succeeds,
and it reserves a huge page. I tried the same with larger values; the
kernel always uses huge pages, and rounds up the allocation to a
multiple of the huge page size.

So, let's just get rid of the /sys scanning code.

Robert, do you remember why you put the "pagesize =
sysconf(_SC_PAGE_SIZE);" call in the new mmap() shared memory allocator?

- Heikki

Attachments:

mmaptest.c (text/x-csrc; name=mmaptest.c) Download
#6Andres Freund
andres@2ndquadrant.com
In reply to: Heikki Linnakangas (#5)
Re: patch: add MAP_HUGETLB to mmap() where supported (WIP)

On 2013-09-16 16:13:57 +0300, Heikki Linnakangas wrote:

On 16.09.2013 13:15, Andres Freund wrote:

On 2013-09-16 11:15:28 +0300, Heikki Linnakangas wrote:

On 14.09.2013 02:41, Richard Poole wrote:

The attached patch adds the MAP_HUGETLB flag to mmap() for shared memory
on systems that support it. It's based on Christian Kruse's patch from
last year, incorporating suggestions from Andres Freund.

I don't understand the logic in figuring out the pagesize, and the smallest
supported hugepage size. First of all, even without the patch, why do we
round up the size passed to mmap() to the _SC_PAGE_SIZE? Surely the kernel
will round up the request all by itself. The mmap() man page doesn't say
anything about length having to be a multiple of pages size.

I think it does:
EINVAL We don't like addr, length, or offset (e.g., they are too
large, or not aligned on a page boundary).

That doesn't mean that they *all* have to be aligned on a page boundary.
It's understandable that 'addr' and 'offset' have to be, but it doesn't make
much sense for 'length'.

and
A file is mapped in multiples of the page size. For a file that is not a multiple
of the page size, the remaining memory is zeroed when mapped, and writes to that
region are not written out to the file. The effect of changing the size of the
underlying file of a mapping on the pages that correspond to added or removed
regions of the file is unspecified.

And no, according to my past experience, the kernel does *not* do any
such rounding up. It will just fail.

I wrote a little test program to play with different values (attached). I
tried this on my laptop with a 3.2 kernel (uname -r: 3.10-2-amd64), and on a
VM with a fresh Centos 6.4 install with 2.6.32 kernel
(2.6.32-358.18.1.el6.x86_64), and they both work the same:

$ ./mmaptest 100 # mmap 100 bytes

in a different terminal:
$ cat /proc/meminfo | grep HugePages_Rsvd
HugePages_Rsvd: 1

So even a tiny allocation, much smaller than any page size, succeeds, and it
reserves a huge page. I tried the same with larger values; the kernel always
uses huge pages, and rounds up the allocation to a multiple of the huge page
size.

When developing the prototype I am pretty sure I had to add the rounding
up - but I am not sure why now, because after chatting with Heikki about
it, I've looked around and the initial MAP_HUGETLB support in the kernel
(commit 4e52780d41a741fb4861ae1df2413dd816ec11b1) has support for
rounding up.

So, let's just get rid of the /sys scanning code.

Alternatively we could round up NBuffers to actually use the
additionally allocated space. Not sure if that's worth the amount of
code, but wasting several megabytes - or even gigabytes - of memory
isn't nice either.

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#7Andres Freund
andres@2ndquadrant.com
In reply to: Andres Freund (#6)
Re: patch: add MAP_HUGETLB to mmap() where supported (WIP)

On 2013-09-16 15:18:50 +0200, Andres Freund wrote:

So even a tiny allocation, much smaller than any page size, succeeds, and it
reserves a huge page. I tried the same with larger values; the kernel always
uses huge pages, and rounds up the allocation to a multiple of the huge page
size.

When developing the prototype I am pretty sure I had to add the rounding
up - but I am not sure why now, because after chatting with Heikki about
it, I've looked around and the initial MAP_HUGETLB support in the kernel
(commit 4e52780d41a741fb4861ae1df2413dd816ec11b1) has support for
rounding up.

Ok, the reason for that seems to have been the following bug
https://bugzilla.kernel.org/show_bug.cgi?id=56881

Greetings,

Andres Freund

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#8Robert Haas
robertmhaas@gmail.com
In reply to: Heikki Linnakangas (#5)
Re: patch: add MAP_HUGETLB to mmap() where supported (WIP)

On Mon, Sep 16, 2013 at 9:13 AM, Heikki Linnakangas
<hlinnakangas@vmware.com> wrote:

Robert, do you remember why you put the "pagesize = sysconf(_SC_PAGE_SIZE);"
call in the new mmap() shared memory allocator?

Hmm, no. Unfortunately, I don't. We could try ripping it out and see
if the buildfarm breaks. If it is needed, then the dynamic shared
memory patch I posted probably needs it as well.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#9Abhijit Menon-Sen
ams@2ndquadrant.com
In reply to: Richard Poole (#1)
1 attachment(s)
[PATCH] Use MAP_HUGETLB where supported (v3)

Hi.

This is a slightly reworked version of the patch submitted by Richard
Poole last month, which was based on Christian Kruse's earlier patch.

Apart from doing various minor cleanups and documentation fixes, I also
tested this patch against HEAD on a machine with 256GB of RAM. Here's an
overview of the results.

I set nr_hugepages to 32768 (== 64GB), which (took a very long time and)
allowed me to set shared_buffers to 60GB. I then ran pgbench -s 1000 -i,
and did some runs of "pgbench -c 100 -j 10 -t 1000" with huge_tlb_pages
set to off and on respectively.

With huge_tlb_pages=off, this is the best result I got:

tps = 8680.771068 (including connections establishing)
tps = 8721.504838 (excluding connections establishing)

With huge_tlb_pages=on, this is the best result I got:

tps = 9932.245203 (including connections establishing)
tps = 9983.190304 (excluding connections establishing)

(Even the worst result I got in the latter case was a smidgen faster
than the best with huge_tlb_pages=off: 8796.344078 vs. 8721.504838.)

From /proc/$pid/status, VmPTE was 2880kb with huge_tlb_pages=off, and
56kb with it turned on.

One open question is what to do about rounding up the size. It should
not be necessary, except for the fairly recent kernel bug described at
the link in the comment (https://bugzilla.kernel.org/show_bug.cgi?id=56881).
I tried it without the rounding-up, and it fails on Ubuntu's 3.5.0-28
kernel (mmap returns EINVAL).

Any thoughts?

-- Abhijit

Attachments:

hugepages-v3.patch (text/x-diff; charset=us-ascii) Download
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 77a9303..e4ded7a 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1052,6 +1052,49 @@ include 'filename'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-huge-tlb-pages" xreflabel="huge_tlb_pages">
+      <term><varname>huge_tlb_pages</varname> (<type>enum</type>)</term>
+      <indexterm>
+       <primary><varname>huge_tlb_pages</> configuration parameter</primary>
+      </indexterm>
+      <listitem>
+       <para>
+        Enables/disables the use of huge TLB pages. Valid values are
+        <literal>try</literal> (the default), <literal>on</literal>,
+        and <literal>off</literal>.
+       </para>
+
+       <para>
+        At present, this feature is supported only on Linux. The setting
+        is ignored on other systems.
+       </para>
+
+       <para>
+        The use of huge TLB pages results in smaller page tables and
+        less CPU time spent on memory management. For more details, see
+        <ulink url="https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt">hugetlbpage.txt
+        </ulink> in the Linux kernel documentation.
+       </para>
+
+       <para>
+        With <varname>huge_tlb_pages</varname> set to <literal>try</literal>,
+        the server will try to use huge pages, but fall back to using
+        normal allocation if the first attempt fails.
+       </para>
+
+       <para>
+        With <varname>huge_tlb_pages</varname> set to <literal>on</literal>,
+        the server will try to use huge pages, and treat failure as a
+        FATAL error.
+       </para>
+
+       <para>
+        With <varname>huge_tlb_pages</varname> set to <literal>off</literal>,
+        the server will not try to use huge pages.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-temp-buffers" xreflabel="temp_buffers">
       <term><varname>temp_buffers</varname> (<type>integer</type>)</term>
       <indexterm>

diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 707edf1..7d9d0a8 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -113,6 +113,7 @@
 
 #shared_buffers = 32MB			# min 128kB
 					# (change requires restart)
+#huge_tlb_pages = try			# try to map memory with MAP_HUGETLB (on, off, try)
 #temp_buffers = 8MB			# min 800kB
 #max_prepared_transactions = 0		# zero disables the feature
 					# (change requires restart)

diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index b604407..34937b2 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -27,11 +27,15 @@
 #ifdef HAVE_SYS_SHM_H
 #include <sys/shm.h>
 #endif
+#ifdef MAP_HUGETLB
+#include <dirent.h>
+#endif
 
 #include "miscadmin.h"
 #include "portability/mem.h"
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
+#include "utils/guc.h"
 
 
 typedef key_t IpcMemoryKey;		/* shared memory key passed to shmget(2) */
@@ -318,6 +322,151 @@ PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
 }
 
 
+#ifdef MAP_HUGETLB
+#define HUGE_PAGE_INFO_DIR  "/sys/kernel/mm/hugepages"
+
+/*
+ * long InternalGetFreeHugepagesCount(const char *name)
+ *
+ * Returns the number of free hugepages of a given size, as reported by
+ * /sys/kernel/mm/hugepages/<name>/free_hugepages. Will fail (return -1)
+ * if the file could not be opened or 0 if no free pages are available.
+ */
+static long
+InternalGetFreeHugepagesCount(const char *name)
+{
+	int fd;
+	char buff[1024];
+	size_t len;
+	long result;
+	char *ptr;
+
+	len = snprintf(buff, 1024, "%s/%s/free_hugepages", HUGE_PAGE_INFO_DIR, name);
+	if (len == 1024)
+	{
+		ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+				(errmsg("Filename %s/%s/free_hugepages is too long", HUGE_PAGE_INFO_DIR, name),
+				 errcontext("while checking hugepage size")));
+		return -1;
+	}
+
+	fd = open(buff, O_RDONLY);
+	if (fd <= 0)
+	{
+		ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+				(errmsg("Could not open file %s: %s", buff, strerror(errno)),
+				 errcontext("while checking hugepage size")));
+		return -1;
+	}
+
+	len = read(fd, buff, 1024);
+	if (len <= 0)
+	{
+		ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+				(errmsg("Error reading from file %s: %s", buff, strerror(errno)),
+				 errcontext("while checking hugepage size")));
+		close(fd);
+		return -1;
+	}
+	close(fd);
+
+	/*
+	 * free_hugepages should contain the number of free hugepages of a
+	 * given size. If we somehow read 1024 bytes from it above (which
+	 * should never happen), we check 1023 bytes and ignore the rest.
+	 */
+	if (len == 1024)
+		len = 1023;
+
+	buff[len] = 0;
+
+	result = strtol(buff, &ptr, 10);
+
+	if (ptr == NULL)
+	{
+		ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+				(errmsg("Could not convert contents of file %s/%s/free_hugepages to number", HUGE_PAGE_INFO_DIR, name),
+				 errcontext("while checking hugepage size")));
+		return -1;
+	}
+
+	return result;
+}
+
+/*
+ * long InternalGetHugepageSize()
+ *
+ * Returns the smallest valid hugepage size by reading the contents of
+ * the /sys/kernel/mm/hugepages directory. Will fail (return -1) if the
+ * directory could not be opened or no valid page sizes are available.
+ */
+static long
+InternalGetHugepageSize()
+{
+	struct dirent *ent;
+	DIR *dir = opendir(HUGE_PAGE_INFO_DIR);
+	long smallest_size = -1, size;
+	bool valid_size_found = false;
+	char *ptr;
+
+	if (dir == NULL)
+	{
+		ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+				(errmsg("Could not open directory %s: %s", HUGE_PAGE_INFO_DIR, strerror(errno)),
+				 errcontext("while checking hugepage size")));
+		return -1;
+	}
+
+	/*
+	 * Linux supports multiple hugepage sizes if the hardware
+	 * supports it; for each possible size there will be a
+	 * directory in /sys/kernel/mm/hugepages consisting of the
+	 * string hugepages- and the size of the page, e.g. on x86_64:
+	 * hugepages-2048kB
+	 */
+	while((ent = readdir(dir)) != NULL)
+	{
+		if (strncmp(ent->d_name, "hugepages-", 10) == 0)
+		{
+			size = strtol(ent->d_name + 10, &ptr, 10);
+			if (ptr == NULL)
+			{
+				continue;
+			}
+
+			if (strcmp(ptr, "kB") == 0)
+			{
+				size *= 1024;
+			}
+
+			if ((smallest_size == -1 || size < smallest_size)) {
+				valid_size_found = true;
+				if(InternalGetFreeHugepagesCount(ent->d_name) > 0)
+					smallest_size = size;
+			}
+		}
+	}
+
+	closedir(dir);
+
+	if (smallest_size == -1)
+	{
+		if(valid_size_found)
+			ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+					(errmsg("No free hugepages"),
+					 errhint("There were no free huge pages of any size")));
+		else
+			ereport(huge_tlb_pages == HUGE_TLB_TRY ? DEBUG1 : WARNING,
+				(errmsg("Could not find a valid hugepage size"),
+				 errhint("This error usually means that either CONFIG_HUGETLB_PAGE "
+						 "is not in kernel or that your architecture does not "
+						 "support hugepages or you did not configure hugepages")));
+	}
+
+	return smallest_size;
+}
+#endif
+
 /*
  * PGSharedMemoryCreate
  *
@@ -367,7 +516,19 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 	 */
 #ifndef EXEC_BACKEND
 	{
-		long		pagesize = sysconf(_SC_PAGE_SIZE);
+		long		pagesize = 0;
+		int			flags = PG_MMAP_FLAGS;
+
+#ifdef MAP_HUGETLB
+		if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY)
+		{
+			flags |= MAP_HUGETLB;
+			pagesize = InternalGetHugepageSize();
+		}
+#endif
+
+		if (pagesize <= 0)
+			pagesize = sysconf(_SC_PAGE_SIZE);
 
 		/*
 		 * Ensure request size is a multiple of pagesize.
@@ -375,6 +536,10 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 		 * pagesize will, for practical purposes, always be a power of two.
 		 * But just in case it isn't, we do it this way instead of using
 		 * TYPEALIGN().
+		 *
+		 * The kernel should really remove the need to worry about this,
+		 * but see https://bugzilla.kernel.org/show_bug.cgi?id=56881 for
+		 * recent situations in which this did not work.
 		 */
 		if (pagesize > 0 && size % pagesize != 0)
 			size += pagesize - (size % pagesize);
@@ -386,8 +551,22 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 		 * out to be false, we might need to add a run-time test here and do
 		 * this only if the running kernel supports it.
 		 */
-		AnonymousShmem = mmap(NULL, size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS,
-							  -1, 0);
+
+		AnonymousShmem = mmap(NULL, size, PROT_READ|PROT_WRITE, flags, -1, 0);
+
+#ifdef MAP_HUGETLB
+		/*
+		 * If huge_tlb_pages="try" and the allocation fails, we retry
+		 * without the MAP_HUGETLB flag.
+		 */
+
+		if (AnonymousShmem == MAP_FAILED && huge_tlb_pages == HUGE_TLB_TRY)
+		{
+			AnonymousShmem = mmap(NULL, size, PROT_READ|PROT_WRITE,
+								  PG_MMAP_FLAGS, -1, 0);
+		}
+#endif
+
 		if (AnonymousShmem == MAP_FAILED)
 			ereport(FATAL,
 					(errmsg("could not map anonymous shared memory: %m"),

diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index 3e981b3..8011e88 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -235,6 +235,24 @@ extern int	tcp_keepalives_idle;
 extern int	tcp_keepalives_interval;
 extern int	tcp_keepalives_count;
 
+
+/*
+ * Possible values for huge_tlb_pages; default is HUGE_TLB_TRY
+ */
+typedef enum
+{
+	HUGE_TLB_OFF,	/* do not try to mmap() with MAP_HUGETLB */
+	HUGE_TLB_ON,	/* mmap() with MAP_HUGETLB; fail when mmap() fails */
+	HUGE_TLB_TRY	/* mmap() with MAP_HUGETLB; fall back to mmap() without it */
+} HugeTlbType;
+
+
+/*
+ * configure the use of huge TLB pages
+ */
+extern int huge_tlb_pages;
+
+
 /*
  * Functions exported by guc.c
  */

diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index dfc6704..fe09396 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -22,6 +22,9 @@
 #include <limits.h>
 #include <unistd.h>
 #include <sys/stat.h>
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
 #ifdef HAVE_SYSLOG
 #include <syslog.h>
 #endif
@@ -381,6 +384,26 @@ static const struct config_enum_entry synchronous_commit_options[] = {
 };
 
 /*
+ * huge_tlb_pages may be on|off|try, where try is the default
+ * on: try to mmap() with MAP_HUGETLB and fail when mmap() fails
+ * off: do not try to mmap() with MAP_HUGETLB
+ * try: try to mmap() with MAP_HUGETLB and fallback to mmap()
+ *      w/o MAP_HUGETLB
+ */
+static const struct config_enum_entry huge_tlb_options[] = {
+	{"off", HUGE_TLB_OFF, false},
+	{"on", HUGE_TLB_ON, false},
+	{"try", HUGE_TLB_TRY, false},
+	{"true", HUGE_TLB_ON, true},	/* boolean-style aliases for on/off follow */
+	{"false", HUGE_TLB_OFF, true},
+	{"yes", HUGE_TLB_ON, true},
+	{"no", HUGE_TLB_OFF, true},
+	{"1", HUGE_TLB_ON, true},
+	{"0", HUGE_TLB_OFF, true},	/* NOTE(review): third field is true only for the aliases -- presumably "hidden"; confirm */
+	{NULL, 0, false}	/* NULL name; presumably the end-of-list marker -- confirm */
+};
+
+/*
  * Options for enum values stored in other modules
  */
 extern const struct config_enum_entry wal_level_options[];
@@ -440,6 +463,8 @@ int			tcp_keepalives_idle;
 int			tcp_keepalives_interval;
 int			tcp_keepalives_count;
 
+int huge_tlb_pages = HUGE_TLB_TRY;
+
 /*
  * These variables are all dummies that don't do anything, except in some
  * cases provide the value for SHOW to display.  The real state is elsewhere
@@ -3377,6 +3402,18 @@ static struct config_enum ConfigureNamesEnum[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"huge_tlb_pages",
+			PGC_POSTMASTER,
+			RESOURCES_MEM,
+			gettext_noop("Enable/disable the use of huge TLB pages on Linux"),
+			NULL
+		},
+		&huge_tlb_pages,
+		HUGE_TLB_TRY,
+		huge_tlb_options,
+		NULL, NULL, NULL
+	},
 
 	/* End-of-list marker */
 	{

diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 0250e39..051b6cf 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -532,6 +532,9 @@
 /* Define to 1 if you have the <sys/ipc.h> header file. */
 #undef HAVE_SYS_IPC_H
 
+/* Define to 1 if you have the <sys/mman.h> header file. */
+#undef HAVE_SYS_MMAN_H
+
 /* Define to 1 if you have the <sys/poll.h> header file. */
 #undef HAVE_SYS_POLL_H
 
diff --git a/configure.in b/configure.in
index d2bab32..b755202 100644
--- a/configure.in
+++ b/configure.in
@@ -982,7 +982,7 @@ AC_SUBST(OSSP_UUID_LIBS)
 ##
 
 dnl sys/socket.h is required by AC_FUNC_ACCEPT_ARGTYPES
-AC_CHECK_HEADERS([crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
+AC_CHECK_HEADERS([crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/mman.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
 
 # On BSD, test for net/if.h will fail unless sys/socket.h
 # is included first.

diff --git a/configure b/configure
index c20afde..67bc57f 100755
--- a/configure
+++ b/configure
@@ -10524,7 +10524,7 @@ done
 
 
 
-for ac_header in crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
+for ac_header in crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/mman.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
 do
 as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
 if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then

#10Abhijit Menon-Sen
ams@2ndquadrant.com
In reply to: Abhijit Menon-Sen (#9)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

At 2013-10-24 11:33:13 +0530, ams@2ndquadrant.com wrote:

From /proc/$pid/status, VmPTE was 2880kb with huge_tlb_pages=off, and

56kb with it turned on.

(VmPTE is the size of the process's page tables.)

-- Abhijit

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#11Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Abhijit Menon-Sen (#9)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On 24.10.2013 09:03, Abhijit Menon-Sen wrote:

This is a slightly reworked version of the patch submitted by Richard
Poole last month, which was based on Christian Kruse's earlier patch.

Thanks.

With huge_tlb_pages=off, this is the best result I got:

tps = 8680.771068 (including connections establishing)
tps = 8721.504838 (excluding connections establishing)

With huge_tlb_pages=on, this is the best result I got:

tps = 9932.245203 (including connections establishing)
tps = 9983.190304 (excluding connections establishing)

(Even the worst result I got in the latter case was a smidgen faster
than the best with huge_tlb_pages=off: 8796.344078 vs. 8721.504838.)

That's really impressive.

One open question is what to do about rounding up the size. It should
not be necessary, but for the fairly recent bug described at the link
in the comment (https://bugzilla.kernel.org/show_bug.cgi?id=56881). I
tried it without the rounding-up, and it fails on Ubuntu's 3.5.0-28
kernel (mmap returns EINVAL).

Let's get rid of the rounding. It's clearly a kernel bug, and it
shouldn't be our business to add workarounds for any kernel bug out
there. And the worst that will happen if you're running a buggy kernel
version is that you fall back to not using huge pages (assuming
huge_tlb_pages=try).

Other comments:

* guc.c doesn't actually need sys/mman.h for anything. Getting rid of
the #include also lets you remove the configure test.

* the documentation should perhaps mention that the setting only has an
effect if POSIX shared memory is used. That's the default on Linux, but
we will try to fall back to SystemV shared memory if it fails.

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#12Robert Haas
robertmhaas@gmail.com
In reply to: Heikki Linnakangas (#11)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On Thu, Oct 24, 2013 at 9:06 AM, Heikki Linnakangas
<hlinnakangas@vmware.com> wrote:

* the documentation should perhaps mention that the setting only has an
effect if POSIX shared memory is used. That's the default on Linux, but we
will try to fall back to SystemV shared memory if it fails.

This is true for dynamic shared memory, but not for the main shared
memory segment. The main shared memory segment is always the
combination of a small, fixed-size System V shared memory chunk and an
anonymous shared memory region created by mmap(NULL, ..., MAP_SHARED).
POSIX shared memory is not used.

(Exceptions: Anonymous shared memory isn't used on Windows, which has
its own mechanism, or when compiling with EXEC_BACKEND, when the whole
chunk is allocated as System V shared memory.)

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#13Andres Freund
andres@2ndquadrant.com
In reply to: Heikki Linnakangas (#11)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On 2013-10-24 16:06:19 +0300, Heikki Linnakangas wrote:

On 24.10.2013 09:03, Abhijit Menon-Sen wrote:

One open question is what to do about rounding up the size. It should
not be necessary, but for the fairly recent bug described at the link
in the comment (https://bugzilla.kernel.org/show_bug.cgi?id=56881). I
tried it without the rounding-up, and it fails on Ubuntu's 3.5.0-28
kernel (mmap returns EINVAL).

Let's get rid of the rounding. It's clearly a kernel bug, and it shouldn't
be our business to add workarounds for any kernel bug out there. And the
worst that will happen if you're running a buggy kernel version is that you
fall back to not using huge pages (assuming huge_tlb_pages=try).

But it's a range of relatively popular kernels, that will stay around
for a good while. So I am hesitant to just not do anything about it. The
directory scanning code isn't that bad imo.

Either way:
I think we should log when we tried to use hugepages but fell back to
plain mmap, currently it's hard to see whether they are used.

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#14Robert Haas
robertmhaas@gmail.com
In reply to: Andres Freund (#13)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On Thu, Oct 24, 2013 at 1:00 PM, Andres Freund <andres@2ndquadrant.com> wrote:

On 2013-10-24 16:06:19 +0300, Heikki Linnakangas wrote:

On 24.10.2013 09:03, Abhijit Menon-Sen wrote:

One open question is what to do about rounding up the size. It should
not be necessary, but for the fairly recent bug described at the link
in the comment (https://bugzilla.kernel.org/show_bug.cgi?id=56881). I
tried it without the rounding-up, and it fails on Ubuntu's 3.5.0-28
kernel (mmap returns EINVAL).

Let's get rid of the rounding. It's clearly a kernel bug, and it shouldn't
be our business to add workarounds for any kernel bug out there. And the
worst that will happen if you're running a buggy kernel version is that you
fall back to not using huge pages (assuming huge_tlb_pages=try).

But it's a range of relatively popular kernels, that will stay around
for a good while. So I am hesitant to just not do anything about it. The
directory scanning code isn't that bad imo.

Either way:
I think we should log when we tried to use hugepages but fell back to
plain mmap, currently it's hard to see whether they are used.

Logging it might be a good idea, but suppose the system's been running
for 6 months and you don't have the startup logs. It might be good
to have an easy way to discover later what happened back then.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#15Sergey Konoplev
gray.ru@gmail.com
In reply to: Abhijit Menon-Sen (#9)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Hi,

On Wed, Oct 23, 2013 at 11:03 PM, Abhijit Menon-Sen <ams@2ndquadrant.com> wrote:

This is a slightly reworked version of the patch submitted by Richard
Poole last month, which was based on Christian Kruse's earlier patch.

Is it possible that this patch will be included in a minor version of
9.3? IMHO hugepages is a very important ability that postgres lost in
9.3, and it would be great to have it back ASAP.

Thank you.

--
Kind regards,
Sergey Konoplev
PostgreSQL Consultant and DBA

http://www.linkedin.com/in/grayhemp
+1 (415) 867-9984, +7 (901) 903-0499, +7 (988) 888-1979
gray.ru@gmail.com

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#16Tom Lane
tgl@sss.pgh.pa.us
In reply to: Sergey Konoplev (#15)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Sergey Konoplev <gray.ru@gmail.com> writes:

On Wed, Oct 23, 2013 at 11:03 PM, Abhijit Menon-Sen <ams@2ndquadrant.com> wrote:

This is a slightly reworked version of the patch submitted by Richard
Poole last month, which was based on Christian Kruse's earlier patch.

Is it possible that this patch will be included in a minor version of
9.3? IMHO hugepages is a very important ability that postgres lost in
9.3, and it would be great to have it back ASAP.

Say what? There's never been any hugepages support in Postgres.

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#17Abhijit Menon-Sen
ams@2ndquadrant.com
In reply to: Heikki Linnakangas (#11)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

At 2013-10-24 16:06:19 +0300, hlinnakangas@vmware.com wrote:

Let's get rid of the rounding.

I share Andres's concern that the bug is present in various recent
kernels that are going to stick around for quite some time. Given
the rather significant performance gain, I think it's worth doing
something, though I'm not a big fan of the directory-scanning code
myself.

As a compromise, perhaps we can unconditionally round the size up to be
a multiple of 2MB? That way, we can use huge pages more often, but also
avoid putting in a lot of code and effort into the workaround and waste
only a little space (if any at all).

Other comments:

* guc.c doesn't actually need sys/mman.h for anything. Getting rid
of the #include also lets you remove the configure test.

You're right, guc.c doesn't use it any more; I've removed the #include.

sysv_shmem.c does use it (MAP_*, PROT_*), however, so I've left the test
in configure alone. I see that sys/mman.h is included elsewhere with an
#ifdef WIN32 or HAVE_SHM_OPEN guard, but HAVE_SYS_MMAN_H seems better.

* the documentation should perhaps mention that the setting only has
an effect if POSIX shared memory is used.

As Robert said, this is not correct, so I haven't changed anything.

-- Abhijit

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#18Abhijit Menon-Sen
ams@2ndquadrant.com
In reply to: Andres Freund (#13)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

At 2013-10-24 19:00:28 +0200, andres@2ndquadrant.com wrote:

I think we should log when we tried to use hugepages but fell back to
plain mmap, currently it's hard to see whether they are used.

Good idea, thanks. I'll do this in the next patch I post (which will be
after we reach some consensus about how to handle the rounding problem).

-- Abhijit

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#19Sergey Konoplev
gray.ru@gmail.com
In reply to: Tom Lane (#16)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On Tue, Oct 29, 2013 at 9:31 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Sergey Konoplev <gray.ru@gmail.com> writes:

On Wed, Oct 23, 2013 at 11:03 PM, Abhijit Menon-Sen <ams@2ndquadrant.com> wrote:

This is a slightly reworked version of the patch submitted by Richard
Poole last month, which was based on Christian Kruse's earlier patch.

Is it possible that this patch will be included in a minor version of
9.3? IMHO hugepages is a very important ability that postgres lost in
9.3, and it would be great to have it back ASAP.

Say what? There's never been any hugepages support in Postgres.

There was an ability to back shared memory with hugepages when using
<=9.2. I use it on ~30 servers for several years and it brings 8-17%
of performance depending on the memory size. Here you will find
several paragraphs of the description about how to do it
https://github.com/grayhemp/pgcookbook/blob/master/database_server_configuration.md.
Just search for the 'hugepages' word on the page.

--
Kind regards,
Sergey Konoplev
PostgreSQL Consultant and DBA

http://www.linkedin.com/in/grayhemp
+1 (415) 867-9984, +7 (901) 903-0499, +7 (988) 888-1979
gray.ru@gmail.com

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#20David Fetter
david@fetter.org
In reply to: Sergey Konoplev (#19)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On Tue, Oct 29, 2013 at 11:08:05PM -0700, Sergey Konoplev wrote:

On Tue, Oct 29, 2013 at 9:31 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Sergey Konoplev <gray.ru@gmail.com> writes:

On Wed, Oct 23, 2013 at 11:03 PM, Abhijit Menon-Sen <ams@2ndquadrant.com> wrote:

This is a slightly reworked version of the patch submitted by Richard
Poole last month, which was based on Christian Kruse's earlier patch.

Is it possible that this patch will be included in a minor version of
9.3? IMHO hugepages is a very important ability that postgres lost in
9.3, and it would be great to have it back ASAP.

Say what? There's never been any hugepages support in Postgres.

There was an ability to back shared memory with hugepages when using
<=9.2. I use it on ~30 servers for several years and it brings 8-17%
of performance depending on the memory size. Here you will find
several paragraphs of the description about how to do it
https://github.com/grayhemp/pgcookbook/blob/master/database_server_configuration.md.
Just search for the 'hugepages' word on the page.

For better or worse, we add new features exactly and only in .0
releases. It's what's made it possible for people to plan
deployments, given us a deserved reputation for stability, etc., etc.

I guess what I'm saying here is that awesome as any particular feature
might be to back-patch, that benefit is overwhelmed by the cost of
having unstable releases.

-infinity from me to any proposal that gets us into "are you using
PostgreSQL x.y.z or x.y.w?" when it comes to features.

Cheers,
David.
--
David Fetter <david@fetter.org> http://fetter.org/
Phone: +1 415 235 3778 AIM: dfetter666 Yahoo!: dfetter
Skype: davidfetter XMPP: david.fetter@gmail.com
iCal: webcal://www.tripit.com/feed/ical/people/david74/tripit.ics

Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#21David Fetter
david@fetter.org
In reply to: Abhijit Menon-Sen (#17)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On Wed, Oct 30, 2013 at 10:16:57AM +0530, Abhijit Menon-Sen wrote:

At 2013-10-24 16:06:19 +0300, hlinnakangas@vmware.com wrote:

Let's get rid of the rounding.

I share Andres's concern that the bug is present in various recent
kernels that are going to stick around for quite some time. Given
the rather significant performance gain, I think it's worth doing
something, though I'm not a big fan of the directory-scanning code
myself.

As a compromise, perhaps we can unconditionally round the size up to be
a multiple of 2MB?

How about documenting that 2MB is the quantum (OK, we'll say
"indivisible unit" or "smallest division" or something) and failing
with a message to that effect if someone tries to set it otherwise?

Cheers,
David.
--
David Fetter <david@fetter.org> http://fetter.org/
Phone: +1 415 235 3778 AIM: dfetter666 Yahoo!: dfetter
Skype: davidfetter XMPP: david.fetter@gmail.com
iCal: webcal://www.tripit.com/feed/ical/people/david74/tripit.ics

Remember to vote!
Consider donating to Postgres: http://www.postgresql.org/about/donate

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#22Abhijit Menon-Sen
ams@2ndquadrant.com
In reply to: David Fetter (#21)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

At 2013-10-30 00:10:39 -0700, david@fetter.org wrote:

How about documenting that 2MB is the quantum (OK, we'll say
"indivisible unit" or "smallest division" or something) and failing
with a message to that effect if someone tries to set it otherwise?

I don't think you understand the problem. We're not discussing a user
setting here. The size that is passed to PGSharedMemoryCreate is based
on shared_buffers and our estimates of how much memory we need for other
things like ProcArray (see ipci.c:CreateSharedMemoryAndSemaphores).

If this calculated size is not a multiple of a page size supported by
the hardware (usually 2/4/16MB etc.), the allocation will fail under
some commonly-used kernels. We can either ignore the problem and let
the allocation fail, or try to discover the smallest supported huge
page size (what the patch does now), or assume that 2MB pages can be
used if any huge pages can be used and align accordingly.

We could use a larger size, e.g. if we aligned to 16MB then it would
work on hardware that supported 2/4/8/16MB pages, but we'd waste the
extra memory unless we also increased NBuffers after the rounding up
(which is also something Andres suggested earlier).

I don't have a strong opinion on the available options, other than not
liking the "do nothing" approach.

-- Abhijit

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#23Tom Lane
tgl@sss.pgh.pa.us
In reply to: Abhijit Menon-Sen (#17)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Abhijit Menon-Sen <ams@2ndquadrant.com> writes:

As a compromise, perhaps we can unconditionally round the size up to be
a multiple of 2MB? That way, we can use huge pages more often, but also
avoid putting in a lot of code and effort into the workaround and waste
only a little space (if any at all).

That sounds reasonably painless to me. Note that at least in our main
shmem segment, "extra" space is not useless, because it allows slop for
the main hash tables, notably the locks table.

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#24Tom Lane
tgl@sss.pgh.pa.us
In reply to: Sergey Konoplev (#19)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Sergey Konoplev <gray.ru@gmail.com> writes:

On Tue, Oct 29, 2013 at 9:31 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Say what? There's never been any hugepages support in Postgres.

There was an ability to back shared memory with hugepages when using
<=9.2. I use it on ~30 servers for several years and it brings 8-17%
of performance depending on the memory size. Here you will find
several paragraphs of the description about how to do it
https://github.com/grayhemp/pgcookbook/blob/master/database_server_configuration.md.

What this describes is how to modify Postgres to request huge pages.
That's hardly built-in support.

In any case, as David already explained, we don't do feature additions
in minor releases. We'd be especially unlikely to make an exception
for this, since it has uncertain portability and benefits. Anything
that carries portability risks has got to go through a beta testing
cycle before we'll unleash it on the masses.

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#25Abhijit Menon-Sen
ams@2ndquadrant.com
In reply to: Tom Lane (#23)
1 attachment(s)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

At 2013-10-30 11:04:36 -0400, tgl@sss.pgh.pa.us wrote:

As a compromise, perhaps we can unconditionally round the size up to be
a multiple of 2MB? […]

That sounds reasonably painless to me.

Here's a patch that does that and adds a DEBUG1 log message when we try
with MAP_HUGETLB and fail and fallback to ordinary mmap.

-- Abhijit

Attachments:

hugepages-v4.patchtext/x-diff; charset=us-asciiDownload
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 707edf1..7d9d0a8 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -113,6 +113,7 @@
 
 #shared_buffers = 32MB			# min 128kB
 					# (change requires restart)
+#huge_tlb_pages = try			# try to map memory with MAP_HUGETLB (on, off, try)
 #temp_buffers = 8MB			# min 800kB
 #max_prepared_transactions = 0		# zero disables the feature
 					# (change requires restart)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 77a9303..e4ded7a 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1052,6 +1052,49 @@ include 'filename'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-huge-tlb-pages" xreflabel="huge_tlb_pages">
+      <term><varname>huge_tlb_pages</varname> (<type>enum</type>)</term>
+      <indexterm>
+       <primary><varname>huge_tlb_pages</> configuration parameter</primary>
+      </indexterm>
+      <listitem>
+       <para>
+        Enables/disables the use of huge TLB pages. Valid values are
+        <literal>try</literal> (the default), <literal>on</literal>,
+        and <literal>off</literal>.
+       </para>
+
+       <para>
+        At present, this feature is supported only on Linux. The setting
+        is ignored on other systems.
+       </para>
+
+       <para>
+        The use of huge TLB pages results in smaller page tables and
+        less CPU time spent on memory management. For more details, see
+        <ulink url="https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt">hugetlbpage.txt
+        </ulink> in the Linux kernel documentation.
+       </para>
+
+       <para>
+        With <varname>huge_tlb_pages</varname> set to <literal>try</literal>,
+        the server will try to use huge pages, but fall back to using
+        normal allocation if the first attempt fails.
+       </para>
+
+       <para>
+        With <varname>huge_tlb_pages</varname> set to <literal>on</literal>,
+        the server will try to use huge pages, and treat failure as a
+        FATAL error.
+       </para>
+
+       <para>
+        With <varname>huge_tlb_pages</varname> set to <literal>off</literal>,
+        the server will not try to use huge pages.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-temp-buffers" xreflabel="temp_buffers">
       <term><varname>temp_buffers</varname> (<type>integer</type>)</term>
       <indexterm>

diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index b604407..fc0d74b 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -32,6 +32,7 @@
 #include "portability/mem.h"
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
+#include "utils/guc.h"
 
 
 typedef key_t IpcMemoryKey;		/* shared memory key passed to shmget(2) */
@@ -367,14 +368,31 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 	 */
 #ifndef EXEC_BACKEND
 	{
-		long		pagesize = sysconf(_SC_PAGE_SIZE);
+		int			flags = PG_MMAP_FLAGS;
+		long		pagesize = 2*1024*1024;
+
+#ifdef MAP_HUGETLB
+		if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY)
+		{
+			flags |= MAP_HUGETLB;
+		}
+#endif
 
 		/*
 		 * Ensure request size is a multiple of pagesize.
 		 *
-		 * pagesize will, for practical purposes, always be a power of two.
-		 * But just in case it isn't, we do it this way instead of using
-		 * TYPEALIGN().
+		 * By doing this ourselves, we maximise the chances of being
+		 * able to use huge TLB pages even on kernels that do not round
+		 * up the request size correctly, for example due to this bug:
+		 * https://bugzilla.kernel.org/show_bug.cgi?id=56881
+		 *
+		 * The default value of 2MB for pagesize is chosen based on the
+		 * most common supported huge page size. Rounding up to a larger
+		 * value (e.g. 16MB) would use even larger pages if the hardware
+		 * supported them, but would potentially waste more space.
+		 *
+		 * We round up by hand instead of using TYPEALIGN(), but for all
+		 * practical purposes, pagesize will always be a power of two.
 		 */
 		if (pagesize > 0 && size % pagesize != 0)
 			size += pagesize - (size % pagesize);
@@ -386,8 +404,21 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 		 * out to be false, we might need to add a run-time test here and do
 		 * this only if the running kernel supports it.
 		 */
-		AnonymousShmem = mmap(NULL, size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS,
-							  -1, 0);
+
+		AnonymousShmem = mmap(NULL, size, PROT_READ|PROT_WRITE, flags, -1, 0);
+
+#ifdef MAP_HUGETLB
+		if (huge_tlb_pages == HUGE_TLB_TRY && AnonymousShmem == MAP_FAILED)
+		{
+			elog(DEBUG1, "mmap(%lu) with MAP_HUGETLB failed with errno=%d; "
+				 "trying without", (uint64)size, errno);
+
+			flags &= ~MAP_HUGETLB;
+			AnonymousShmem = mmap(NULL, size, PROT_READ|PROT_WRITE, flags,
+								  -1, 0);
+		}
+#endif
+
 		if (AnonymousShmem == MAP_FAILED)
 			ereport(FATAL,
 					(errmsg("could not map anonymous shared memory: %m"),

diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index dfc6704..8faafb4 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -381,6 +381,26 @@ static const struct config_enum_entry synchronous_commit_options[] = {
 };
 
 /*
+ * huge_tlb_pages may be on|off|try, where try is the default
+ * on: try to mmap() with MAP_HUGETLB and fail when mmap() fails
+ * off: do not try to mmap() with MAP_HUGETLB
+ * try: try to mmap() with MAP_HUGETLB and fallback to mmap()
+ *      w/o MAP_HUGETLB
+ */
+static const struct config_enum_entry huge_tlb_options[] = {
+	{"off", HUGE_TLB_OFF, false},
+	{"on", HUGE_TLB_ON, false},
+	{"try", HUGE_TLB_TRY, false},
+	{"true", HUGE_TLB_ON, true},
+	{"false", HUGE_TLB_OFF, true},
+	{"yes", HUGE_TLB_ON, true},
+	{"no", HUGE_TLB_OFF, true},
+	{"1", HUGE_TLB_ON, true},
+	{"0", HUGE_TLB_OFF, true},
+	{NULL, 0, false}
+};
+
+/*
  * Options for enum values stored in other modules
  */
 extern const struct config_enum_entry wal_level_options[];
@@ -440,6 +460,8 @@ int			tcp_keepalives_idle;
 int			tcp_keepalives_interval;
 int			tcp_keepalives_count;
 
+int huge_tlb_pages = HUGE_TLB_TRY;
+
 /*
  * These variables are all dummies that don't do anything, except in some
  * cases provide the value for SHOW to display.  The real state is elsewhere
@@ -3377,6 +3399,18 @@ static struct config_enum ConfigureNamesEnum[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"huge_tlb_pages",
+			PGC_POSTMASTER,
+			RESOURCES_MEM,
+			gettext_noop("Enable/disable the use of huge TLB pages on Linux"),
+			NULL
+		},
+		&huge_tlb_pages,
+		HUGE_TLB_TRY,
+		huge_tlb_options,
+		NULL, NULL, NULL
+	},
 
 	/* End-of-list marker */
 	{

diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index 3e981b3..8011e88 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -235,6 +235,24 @@ extern int	tcp_keepalives_idle;
 extern int	tcp_keepalives_interval;
 extern int	tcp_keepalives_count;
 
+
+/*
+ * Possible values for huge_tlb_pages; default is HUGE_TLB_TRY
+ */
+typedef enum
+{
+	HUGE_TLB_OFF,
+	HUGE_TLB_ON,
+	HUGE_TLB_TRY
+} HugeTlbType;
+
+
+/*
+ * configure the use of huge TLB pages
+ */
+extern int huge_tlb_pages;
+
+
 /*
  * Functions exported by guc.c
  */

diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 0250e39..051b6cf 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -532,6 +532,9 @@
 /* Define to 1 if you have the <sys/ipc.h> header file. */
 #undef HAVE_SYS_IPC_H
 
+/* Define to 1 if you have the <sys/mman.h> header file. */
+#undef HAVE_SYS_MMAN_H
+
 /* Define to 1 if you have the <sys/poll.h> header file. */
 #undef HAVE_SYS_POLL_H
 
diff --git a/configure b/configure
index c20afde..67bc57f 100755
--- a/configure
+++ b/configure
@@ -10524,7 +10524,7 @@ done
 
 
 
-for ac_header in crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
+for ac_header in crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/mman.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
 do
 as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
 if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then

diff --git a/configure.in b/configure.in
index d2bab32..b755202 100644
--- a/configure.in
+++ b/configure.in
@@ -982,7 +982,7 @@ AC_SUBST(OSSP_UUID_LIBS)
 ##
 
 dnl sys/socket.h is required by AC_FUNC_ACCEPT_ARGTYPES
-AC_CHECK_HEADERS([crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
+AC_CHECK_HEADERS([crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/mman.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
 
 # On BSD, test for net/if.h will fail unless sys/socket.h
 # is included first.
#26Andres Freund
andres@2ndquadrant.com
In reply to: Abhijit Menon-Sen (#25)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On 2013-10-30 22:39:20 +0530, Abhijit Menon-Sen wrote:

At 2013-10-30 11:04:36 -0400, tgl@sss.pgh.pa.us wrote:

As a compromise, perhaps we can unconditionally round the size up to be
a multiple of 2MB? […]

That sounds reasonably painless to me.

Here's a patch that does that and adds a DEBUG1 log message when we try
with MAP_HUGETLB and fail and fallback to ordinary mmap.

But it's in no way guaranteed that the smallest hugepage size is
2MB. It'll be on current x86 hardware, but not on any other platform...

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#27Sergey Konoplev
gray.ru@gmail.com
In reply to: Tom Lane (#24)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On Wed, Oct 30, 2013 at 8:11 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Sergey Konoplev <gray.ru@gmail.com> writes:

On Tue, Oct 29, 2013 at 9:31 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Say what? There's never been any hugepages support in Postgres.

There were an ability to back shared memory with hugepages when using
<=9.2. I use it on ~30 servers for several years and it brings 8-17%
of performance depending on the memory size. Here you will find
several paragraphs of the description about how to do it
https://github.com/grayhemp/pgcookbook/blob/master/database_server_configuration.md.

What this describes is how to modify Postgres to request huge pages.
That's hardly built-in support.

I wasn't talking about a built-in support. It was about an ability (a
way) to back sh_buf with hugepages.

In any case, as David already explained, we don't do feature additions
in minor releases. We'd be especially unlikely to make an exception
for this, since it has uncertain portability and benefits. Anything
that carries portability risks has got to go through a beta testing
cycle before we'll unleash it on the masses.

Yes, I got the idea. Thanks both of you for clarification.

--
Kind regards,
Sergey Konoplev
PostgreSQL Consultant and DBA

http://www.linkedin.com/in/grayhemp
+1 (415) 867-9984, +7 (901) 903-0499, +7 (988) 888-1979
gray.ru@gmail.com

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#28Alvaro Herrera
alvherre@2ndquadrant.com
In reply to: Sergey Konoplev (#27)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Sergey Konoplev escribió:

On Wed, Oct 30, 2013 at 8:11 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Sergey Konoplev <gray.ru@gmail.com> writes:

There were an ability to back shared memory with hugepages when using
<=9.2. I use it on ~30 servers for several years and it brings 8-17%
of performance depending on the memory size. Here you will find
several paragraphs of the description about how to do it
https://github.com/grayhemp/pgcookbook/blob/master/database_server_configuration.md.

What this describes is how to modify Postgres to request huge pages.
That's hardly built-in support.

I wasn't talking about a built-in support. It was about an ability (a
way) to back sh_buf with hugepages.

Then what you need is to set
dynamic_shared_memory_type = sysv
in postgresql.conf.

--
Álvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#29Sergey Konoplev
gray.ru@gmail.com
In reply to: Alvaro Herrera (#28)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On Wed, Oct 30, 2013 at 11:50 AM, Alvaro Herrera
<alvherre@2ndquadrant.com> wrote:

There were an ability to back shared memory with hugepages when using
<=9.2. I use it on ~30 servers for several years and it brings 8-17%
of performance depending on the memory size. Here you will find
several paragraphs of the description about how to do it
https://github.com/grayhemp/pgcookbook/blob/master/database_server_configuration.md.

What this describes is how to modify Postgres to request huge pages.
That's hardly built-in support.

I wasn't talking about a built-in support. It was about an ability (a
way) to back sh_buf with hugepages.

Then what you need is to set
dynamic_shared_memory_type = sysv
in postgresql.conf.

Neither I found this parameter in the docs nor it works when I specify
it in postgresql.conf.

LOG: unrecognized configuration parameter
"dynamic_shared_memory_type" in file
"/etc/postgresql/9.3/main/postgresql.conf" line 114
FATAL: configuration file "/etc/postgresql/9.3/main/postgresql.conf"
contains errors

--
Kind regards,
Sergey Konoplev
PostgreSQL Consultant and DBA

http://www.linkedin.com/in/grayhemp
+1 (415) 867-9984, +7 (901) 903-0499, +7 (988) 888-1979
gray.ru@gmail.com

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#30Alvaro Herrera
alvherre@2ndquadrant.com
In reply to: Alvaro Herrera (#28)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Alvaro Herrera escribió:

Sergey Konoplev escribió:

I wasn't talking about a built-in support. It was about an ability (a
way) to back sh_buf with hugepages.

Then what you need is to set
dynamic_shared_memory_type = sysv
in postgresql.conf.

The above is mistaken -- there's no way to disable the mmap() segment in
9.3, other than recompiling with EXEC_BACKEND which is probably
undesirable for other reasons.

I don't think I had ever heard of that recipe to use huge pages in
previous versions; since the win is probably significant in some
systems, we could have made this configurable.

--
Álvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#31Sergey Konoplev
gray.ru@gmail.com
In reply to: Alvaro Herrera (#30)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On Wed, Oct 30, 2013 at 12:17 PM, Alvaro Herrera
<alvherre@2ndquadrant.com> wrote:

I wasn't talking about a built-in support. It was about an ability (a
way) to back sh_buf with hugepages.

Then what you need is to set
dynamic_shared_memory_type = sysv
in postgresql.conf.

The above is mistaken -- there's no way to disable the mmap() segment in
9.3, other than recompiling with EXEC_BACKEND which is probably
undesirable for other reasons.

Alternatively, I assume it could be linked with libhugetlbfs and you
don't need any source modifications in this case. However I am not
sure it will work with shared memory.

I don't think I had ever heard of that recipe to use huge pages in
previous versions; since the win is probably significant in some
systems, we could have made this configurable.

There are several articles in the web describing how to do this,
except the mine one. And the win becomes mostly significant when you
have 64GB and more on your server.

--
Kind regards,
Sergey Konoplev
PostgreSQL Consultant and DBA

http://www.linkedin.com/in/grayhemp
+1 (415) 867-9984, +7 (901) 903-0499, +7 (988) 888-1979
gray.ru@gmail.com

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#32Sergey Konoplev
gray.ru@gmail.com
In reply to: Sergey Konoplev (#31)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On Wed, Oct 30, 2013 at 12:51 PM, Sergey Konoplev <gray.ru@gmail.com> wrote:

On Wed, Oct 30, 2013 at 12:17 PM, Alvaro Herrera
<alvherre@2ndquadrant.com> wrote:

I wasn't talking about a built-in support. It was about an ability (a
way) to back sh_buf with hugepages.

Then what you need is to set
dynamic_shared_memory_type = sysv
in postgresql.conf.

The above is mistaken -- there's no way to disable the mmap() segment in
9.3, other than recompiling with EXEC_BACKEND which is probably
undesirable for other reasons.

Alternatively, I assume it could be linked with libhugetlbfs and you
don't need any source modifications in this case. However I am not
sure it will work with shared memory.

BTW, I managed to run 9.3 backed with hugepages after I put
HUGETLB_MORECORE (see man libhugetlbfs) to the environment yesterday,
but, after some time of working, it failed with messages showed below.

syslog:

Oct 29 17:53:13 grayhemp kernel: [150579.903875] PID 7584 killed due
to inadequate hugepage pool

postgres:

libhugetlbfslibhugetlbfs2013-10-29 17:53:21 PDT LOG: server process
(PID 7584) was terminated by signal 7: Bus error
2013-10-29 17:53:21 PDT LOG: terminating any other active server processes
2013-10-29 17:53:21 PDT WARNING: terminating connection because of crash of
another server process
2013-10-29 17:53:21 PDT DETAIL: The postmaster has commanded this
server process to roll back the current transaction and exit, because
another server process exited abnormally and possibly corrupted shared
memory.

My theory is that it has happened after the amount of huge pages
(vm.nr_overcommit_hugepages + vm.nr_hugepages) was exceeded, but I
might be wrong.

Does anybody have any thoughts on why it happened and how to work around it?

--
Kind regards,
Sergey Konoplev
PostgreSQL Consultant and DBA

http://www.linkedin.com/in/grayhemp
+1 (415) 867-9984, +7 (901) 903-0499, +7 (988) 888-1979
gray.ru@gmail.com

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#33Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Andres Freund (#26)
1 attachment(s)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On 30.10.2013 19:11, Andres Freund wrote:

On 2013-10-30 22:39:20 +0530, Abhijit Menon-Sen wrote:

At 2013-10-30 11:04:36 -0400, tgl@sss.pgh.pa.us wrote:

As a compromise, perhaps we can unconditionally round the size up to be
a multiple of 2MB? […]

That sounds reasonably painless to me.

Here's a patch that does that and adds a DEBUG1 log message when we try
with MAP_HUGETLB and fail and fallback to ordinary mmap.

But it's in no way guaranteed that the smallest hugepage size is
2MB. It'll be on current x86 hardware, but not on any other platform...

Sure, but there's no big harm done. We're just trying to avoid hitting a
kernel bug, and as a bonus, we avoid wasting some memory that would
otherwise be lost due to the kernel rounding the allocation. If the
smallest hugepage size is smaller than 2MB, we round up the allocation
unnecessarily, but that doesn't seem serious.

I spent some time whacking this around, new patch version attached. I
moved the mmap() code into a new function, that leaves the
PGSharedMemoryCreate more readable.

I modified the patch so that it throws an error if you set
huge_tlb_pages=on, and the platform doesn't support MAP_HUGETLB (ie.
non-Linux, or EXEC_BACKEND). 'try' is the default, so this only affects
you if you explicitly set it to 'on'. I think that's the right behavior;
if you explicitly ask for it, and you don't get it, that should be an
error. But I'm not wedded to the idea if someone objects; a log message
might also be reasonable: "LOG: huge TLB pages are not supported on this
platform, but huge_tlb_pages was 'on'"

The error message on failed allocation, if huge_tlb_pages=on, needs
updating:

$ bin/postmaster -D data
FATAL: could not map anonymous shared memory: Cannot allocate memory
HINT: This error usually means that PostgreSQL's request for a shared
memory segment exceeded available memory or swap space. To reduce the
request size (currently 189390848 bytes), reduce PostgreSQL's shared
memory usage, perhaps by reducing shared_buffers or max_connections.

The reason the allocation failed in this case was that I used
huge_tlb_pages=on, but had not configured the kernel for huge pages. The
hint is quite misleading in that case, it should advise to configure the
kernel, or turn off huge_tlb_pages.

The documentation needs some work. I think it's pretty user-unfriendly
to link to https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt.
It gives a lot of details, and although it explains stuff that is
relevant, like setting the nr_hugepages sysctl, it also contains a lot
of stuff that is not relevant to us, like how to mount hugetlbfs. Can we
do better than that? Is there a better guide somewhere on how to set the
kernel settings. If not, we should include step-by-step instructions in
our manual.

The "Managing Kernel Resources" section in the user manual should also
be updated to mention how to enable huge pages.

Also, now that I changed huge_tlb_pages='on' to fail on platforms where
it's not supported at all, the docs need to be updated to reflect it.

- Heikki

Attachments:

hugepages-v5.patchtext/x-diff; name=hugepages-v5.patchDownload
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 77a9303..7a60ad0 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1052,6 +1052,41 @@ include 'filename'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-huge-tlb-pages" xreflabel="huge_tlb_pages">
+      <term><varname>huge_tlb_pages</varname> (<type>enum</type>)</term>
+      <indexterm>
+       <primary><varname>huge_tlb_pages</> configuration parameter</primary>
+      </indexterm>
+      <listitem>
+       <para>
+        Enables/disables the use of huge TLB pages. Valid values are
+        <literal>try</literal> (the default), <literal>on</literal>,
+        and <literal>off</literal>.
+       </para>
+
+       <para>
+        At present, this feature is supported only on Linux. The setting
+        is ignored on other systems.
+       </para>
+
+       <para>
+        The use of huge TLB pages results in smaller page tables and
+        less CPU time spent on memory management, increasing performance. For
+        more details, see
+        <ulink url="https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt">hugepages.txt</ulink>
+        in the Linux kernel documentation.
+       </para>
+
+       <para>
+        With <varname>huge_tlb_pages</varname> set to <literal>try</literal>,
+        the server will try to use huge pages, but fall back to using
+        normal allocation if that fails. With <literal>on</literal>, failure
+        to use huge pages will prevent the server from starting up. With
+        <literal>off</literal>, huge pages will not be used.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-temp-buffers" xreflabel="temp_buffers">
       <term><varname>temp_buffers</varname> (<type>integer</type>)</term>
       <indexterm>
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index b604407..3ccd2c2 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -32,6 +32,7 @@
 #include "portability/mem.h"
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
+#include "utils/guc.h"
 
 
 typedef key_t IpcMemoryKey;		/* shared memory key passed to shmget(2) */
@@ -41,7 +42,7 @@ typedef int IpcMemoryId;		/* shared memory ID returned by shmget(2) */
 unsigned long UsedShmemSegID = 0;
 void	   *UsedShmemSegAddr = NULL;
 static Size AnonymousShmemSize;
-static void *AnonymousShmem;
+static void *AnonymousShmem = NULL;
 
 static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
 static void IpcMemoryDetach(int status, Datum shmaddr);
@@ -317,6 +318,79 @@ PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
 	return true;
 }
 
+/*
+ * Creates an anonymous mmap()ed shared memory segment.
+ *
+ * Pass the desired size in *size; on return *size holds the size actually
+ * mapped, which may be larger (rounded up to 2MB when huge pages are used).
+ * Returns the mapped address; a failed mapping is reported as FATAL.
+ */
+#ifndef EXEC_BACKEND
+static void *
+CreateAnonymousSegment(Size *size)
+{
+	Size		allocsize;
+	void	   *ptr = MAP_FAILED;
+
+#ifndef MAP_HUGETLB
+	if (huge_tlb_pages == HUGE_TLB_ON)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("huge TLB pages not supported on this platform")));
+#else
+	if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY)
+	{
+		/*
+		 * Round up the request size to a suitable large value.
+		 *
+		 * Some Linux kernel versions are known to have a bug, which
+		 * causes mmap() with MAP_HUGETLB to fail if the request size is
+		 * not a multiple of any supported huge page size. To work around
+		 * that, we round up the request size to nearest 2MB. 2MB is the
+		 * most common huge page size on affected systems.
+		 *
+		 * Aside from that bug, even with a kernel that does the
+		 * allocation correctly, rounding it up ourselves avoids wasting
+		 * memory. Without it, if we for example make an allocation of
+		 * 2MB + 1 bytes, the kernel might decide to use two 2MB huge
+		 * pages for that, and waste 2 MB - 1 of memory. When we do the
+		 * rounding ourselves, we can use that space for allocations.
+		 */
+		int			hugepagesize = 2 * 1024 * 1024;
+
+		allocsize = *size;
+		if (allocsize % hugepagesize != 0)
+			allocsize += hugepagesize - (allocsize % hugepagesize);
+
+		ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
+				   PG_MMAP_FLAGS | MAP_HUGETLB, -1, 0);
+		if (huge_tlb_pages == HUGE_TLB_TRY && ptr == MAP_FAILED)
+			elog(DEBUG1, "mmap with MAP_HUGETLB failed, huge pages disabled: %m");
+	}
+#endif
+	/* Plain mmap: always for "off"; for "try" only if the huge-page attempt failed. */
+	if (huge_tlb_pages == HUGE_TLB_OFF || (huge_tlb_pages == HUGE_TLB_TRY && ptr == MAP_FAILED))
+	{
+		allocsize = *size;
+		ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS, -1, 0);
+	}
+
+	if (ptr == MAP_FAILED)
+		ereport(FATAL,
+				(errmsg("could not map anonymous shared memory: %m"),
+				 (errno == ENOMEM) ?
+				 errhint("This error usually means that PostgreSQL's request "
+						"for a shared memory segment exceeded available memory "
+						 "or swap space. To reduce the request size (currently "
+						 "%lu bytes), reduce PostgreSQL's shared memory usage, "
+						 "perhaps by reducing shared_buffers or "
+						 "max_connections.",
+						 (unsigned long) *size) : 0));
+
+	*size = allocsize;
+	return ptr;
+}
+#endif
 
 /*
  * PGSharedMemoryCreate
@@ -344,7 +418,14 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 	PGShmemHeader *hdr;
 	IpcMemoryId shmid;
 	struct stat statbuf;
-	Size		sysvsize = size;
+	Size		sysvsize;
+
+#if defined(EXEC_BACKEND) || !defined(MAP_HUGETLB)
+	if (huge_tlb_pages == HUGE_TLB_ON)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("huge TLB pages not supported on this platform")));
+#endif
 
 	/* Room for a header? */
 	Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
@@ -359,6 +440,12 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 	 * to run many copies of PostgreSQL without needing to adjust system
 	 * settings.
 	 *
+	 * We assume that no one will attempt to run PostgreSQL 9.3 or later
+	 * on systems that are ancient enough that anonymous shared memory is
+	 * not supported, such as pre-2.4 versions of Linux.  If that turns
+	 * out to be false, we might need to add a run-time test here and do
+	 * this only if the running kernel supports it.
+	 *
 	 * However, we disable this logic in the EXEC_BACKEND case, and fall back
 	 * to the old method of allocating the entire segment using System V
 	 * shared memory, because there's no way to attach an mmap'd segment to a
@@ -366,44 +453,13 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 	 * developer use, this shouldn't be a big problem.
 	 */
 #ifndef EXEC_BACKEND
-	{
-		long		pagesize = sysconf(_SC_PAGE_SIZE);
-
-		/*
-		 * Ensure request size is a multiple of pagesize.
-		 *
-		 * pagesize will, for practical purposes, always be a power of two.
-		 * But just in case it isn't, we do it this way instead of using
-		 * TYPEALIGN().
-		 */
-		if (pagesize > 0 && size % pagesize != 0)
-			size += pagesize - (size % pagesize);
+	AnonymousShmem = CreateAnonymousSegment(&size);
+	AnonymousShmemSize = size;
 
-		/*
-		 * We assume that no one will attempt to run PostgreSQL 9.3 or later
-		 * on systems that are ancient enough that anonymous shared memory is
-		 * not supported, such as pre-2.4 versions of Linux.  If that turns
-		 * out to be false, we might need to add a run-time test here and do
-		 * this only if the running kernel supports it.
-		 */
-		AnonymousShmem = mmap(NULL, size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS,
-							  -1, 0);
-		if (AnonymousShmem == MAP_FAILED)
-			ereport(FATAL,
-					(errmsg("could not map anonymous shared memory: %m"),
-					 (errno == ENOMEM) ?
-				errhint("This error usually means that PostgreSQL's request "
-					 "for a shared memory segment exceeded available memory "
-					  "or swap space. To reduce the request size (currently "
-					  "%lu bytes), reduce PostgreSQL's shared memory usage, "
-						"perhaps by reducing shared_buffers or "
-						"max_connections.",
-						(unsigned long) size) : 0));
-		AnonymousShmemSize = size;
-
-		/* Now we need only allocate a minimal-sized SysV shmem block. */
-		sysvsize = sizeof(PGShmemHeader);
-	}
+	/* Now we need only allocate a minimal-sized SysV shmem block. */
+	sysvsize = sizeof(PGShmemHeader);
+#else
+	sysvsize = size;
 #endif
 
 	/* Make sure PGSharedMemoryAttach doesn't fail without need */
diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c
index 0db8e8f..38efe37 100644
--- a/src/backend/port/win32_shmem.c
+++ b/src/backend/port/win32_shmem.c
@@ -128,6 +128,11 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 	DWORD		size_high;
 	DWORD		size_low;
 
+	if (huge_tlb_pages == HUGE_TLB_ON)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("huge TLB pages not supported on this platform")));
+
 	/* Room for a header? */
 	Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
 
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 54d8078..71b30a2 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -64,6 +64,7 @@
 #include "storage/dsm_impl.h"
 #include "storage/standby.h"
 #include "storage/fd.h"
+#include "storage/pg_shmem.h"
 #include "storage/proc.h"
 #include "storage/predicate.h"
 #include "tcop/tcopprot.h"
@@ -381,6 +382,23 @@ static const struct config_enum_entry synchronous_commit_options[] = {
 };
 
 /*
+ * Although only "on", "off", "try" are documented, we accept all the likely
+ * variants of "on" and "off".
+ */
+static const struct config_enum_entry huge_tlb_options[] = {
+	{"off", HUGE_TLB_OFF, false},
+	{"on", HUGE_TLB_ON, false},
+	{"try", HUGE_TLB_TRY, false},
+	{"true", HUGE_TLB_ON, true},
+	{"false", HUGE_TLB_OFF, true},
+	{"yes", HUGE_TLB_ON, true},
+	{"no", HUGE_TLB_OFF, true},
+	{"1", HUGE_TLB_ON, true},
+	{"0", HUGE_TLB_OFF, true},
+	{NULL, 0, false}
+};
+
+/*
  * Options for enum values stored in other modules
  */
 extern const struct config_enum_entry wal_level_options[];
@@ -441,6 +459,12 @@ int			tcp_keepalives_interval;
 int			tcp_keepalives_count;
 
 /*
+ * This really belongs in pg_shmem.c, but is defined here so that it doesn't
+ * need to be duplicated in all the different implementations of pg_shmem.c.
+ */
+int			huge_tlb_pages;
+
+/*
  * These variables are all dummies that don't do anything, except in some
  * cases provide the value for SHOW to display.  The real state is elsewhere
  * and is kept in sync by assign_hooks.
@@ -3377,6 +3401,15 @@ static struct config_enum ConfigureNamesEnum[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"huge_tlb_pages", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Use of huge TLB pages on Linux"),
+			NULL
+		},
+		&huge_tlb_pages,
+		HUGE_TLB_TRY, huge_tlb_options,
+		NULL, NULL, NULL
+	},
 
 	/* End-of-list marker */
 	{
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 34a2d05..ed9573a 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -113,6 +113,8 @@
 
 #shared_buffers = 32MB			# min 128kB
 					# (change requires restart)
+#huge_tlb_pages = try			# on, off, or try
+					# (change requires restart)
 #temp_buffers = 8MB			# min 800kB
 #max_prepared_transactions = 0		# zero disables the feature
 					# (change requires restart)
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 251fbdf..3a6cbf5 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -38,6 +38,16 @@ typedef struct PGShmemHeader	/* standard header for all Postgres shmem */
 #endif
 } PGShmemHeader;
 
+/* GUC variable */
+extern int huge_tlb_pages;
+
+/* Possible values for huge_tlb_pages */
+typedef enum
+{
+	HUGE_TLB_OFF,
+	HUGE_TLB_ON,
+	HUGE_TLB_TRY
+} HugeTlbType;
 
 #ifdef EXEC_BACKEND
 #ifndef WIN32
#34Sameer Kumar
sameer.kumar@ashnik.com
In reply to: Heikki Linnakangas (#33)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

I was recently running some tests with huge page tables. I ran them on two
different architectures: x86 and PPC64.

I saw some discussion going on over here so thought of sharing.
I was using 3 Cores, 8GB RAM, 2 LUN for filesystem (1 for dbfiles and 1 for
logfiles) for these tests...

I had dedicated
(shared_buffers + 400bytes*max_connection + wal_buffers)/Pagesize [from
/proc/meminfo] for huge pages. I kept some overcommit_hugepages to be used
by work_mem (max_connection*work_mem)/Pagesize

x86_64 bit gave me a benefit of 2-5% for TPC-C workload( I scaled from 1 to
100 users). PPC64 which uses 16MB and 64MB did not give me any benefits in
fact the performance degraded as the concurrency of system increased.

my 2 cents, hope it helps.

#35Abhijit Menon-Sen
ams@2ndQuadrant.com
In reply to: Heikki Linnakangas (#33)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

At 2013-11-15 15:17:32 +0200, hlinnakangas@vmware.com wrote:

I spent some time whacking this around, new patch version attached.

Thanks.

But I'm not wedded to the idea if someone objects; a log message might
also be reasonable: "LOG: huge TLB pages are not supported on this
platform, but huge_tlb_pages was 'on'"

Put that way, I have to wonder if the right thing to do is just to have
a "try_huge_pages=on|off" setting, and log a warning if the attempt did
not succeed. It would be easier to document, and I don't think there's
much point in making it an error if the allocation fails.

-- Abhijit

P.S. I'd be happy to do the followup work for this patch (updating
documentation, etc.), but it'll have to wait until I recover from
this !#$&@! stomach bug.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#36Alvaro Herrera
alvherre@2ndquadrant.com
In reply to: Abhijit Menon-Sen (#35)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Abhijit Menon-Sen wrote:

At 2013-11-15 15:17:32 +0200, hlinnakangas@vmware.com wrote:

But I'm not wedded to the idea if someone objects; a log message might
also be reasonable: "LOG: huge TLB pages are not supported on this
platform, but huge_tlb_pages was 'on'"

Put that way, I have to wonder if the right thing to do is just to have
a "try_huge_pages=on|off" setting, and log a warning if the attempt did
not succeed. It would be easier to document, and I don't think there's
much point in making it an error if the allocation fails.

What about
huge_tlb_pages={off,try}

Or maybe
huge_tlb_pages={off,try,require}

--
Álvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#37Andres Freund
andres@2ndquadrant.com
In reply to: Alvaro Herrera (#36)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On 2013-11-21 18:09:38 -0300, Alvaro Herrera wrote:

Abhijit Menon-Sen wrote:

At 2013-11-15 15:17:32 +0200, hlinnakangas@vmware.com wrote:

But I'm not wedded to the idea if someone objects; a log message might
also be reasonable: "LOG: huge TLB pages are not supported on this
platform, but huge_tlb_pages was 'on'"

Put that way, I have to wonder if the right thing to do is just to have
a "try_huge_pages=on|off" setting, and log a warning if the attempt did
not succeed. It would be easier to document, and I don't think there's
much point in making it an error if the allocation fails.

What about
huge_tlb_pages={off,try}

Or maybe
huge_tlb_pages={off,try,require}

I'd certainly want a setting that errors out if it cannot get the memory
using hugetables. If you rely on the reduction in memory (which can be
significant on large s_b, large max_connections), it's rather annoying
not to know whether it succeeded using it.

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#38Robert Haas
robertmhaas@gmail.com
In reply to: Alvaro Herrera (#36)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On Thu, Nov 21, 2013 at 4:09 PM, Alvaro Herrera
<alvherre@2ndquadrant.com> wrote:

Abhijit Menon-Sen wrote:

At 2013-11-15 15:17:32 +0200, hlinnakangas@vmware.com wrote:

But I'm not wedded to the idea if someone objects; a log message might
also be reasonable: "LOG: huge TLB pages are not supported on this
platform, but huge_tlb_pages was 'on'"

Put that way, I have to wonder if the right thing to do is just to have
a "try_huge_pages=on|off" setting, and log a warning if the attempt did
not succeed. It would be easier to document, and I don't think there's
much point in making it an error if the allocation fails.

What about
huge_tlb_pages={off,try}

Or maybe
huge_tlb_pages={off,try,require}

I'd spell "require" as "on", or at least accept that as a synonym.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#39Andres Freund
andres@2ndquadrant.com
In reply to: Robert Haas (#38)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On 2013-11-21 16:24:56 -0500, Robert Haas wrote:

What about
huge_tlb_pages={off,try}

Or maybe
huge_tlb_pages={off,try,require}

I'd spell "require" as "on", or at least accept that as a synonym.

That's off,try, on is what the patch currently implements, Abhijit just
was arguing for dropping the error-out option.

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#40Abhijit Menon-Sen
ams@2ndQuadrant.com
In reply to: Andres Freund (#37)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

At 2013-11-21 22:14:35 +0100, andres@2ndquadrant.com wrote:

I'd certainly want a setting that errors out if it cannot get the
memory using hugetables.

OK, then the current try/on/off settings are fine.

I'm better today, so I'll read the patch Heikki posted and see what more
needs to be done there.

-- Abhijit

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#41Alvaro Herrera
alvherre@2ndquadrant.com
In reply to: Heikki Linnakangas (#33)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Heikki Linnakangas wrote:

I spent some time whacking this around, new patch version attached.
I moved the mmap() code into a new function, that leaves the
PGSharedMemoryCreate more readable.

Did this patch go anywhere?

Someone just pinged me about a kernel scalability problem in Linux with
huge pages; if someone did performance measurements with this patch,
perhaps it'd be good to measure again with the kernel patch in place.

https://lkml.org/lkml/2014/1/26/227

--
Álvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#42Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Alvaro Herrera (#41)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On 01/27/2014 09:20 PM, Alvaro Herrera wrote:

Heikki Linnakangas wrote:

I spent some time whacking this around, new patch version attached.
I moved the mmap() code into a new function, that leaves the
PGSharedMemoryCreate more readable.

Did this patch go anywhere?

Oh darn, I remembered we had already committed this, but clearly not.
I'd love to still get this into 9.4. The latest patch
(hugepages-v5.patch) was pretty much ready for commit, except for
documentation.

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#43Christian Kruse
christian@2ndQuadrant.com
In reply to: Heikki Linnakangas (#42)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Hi,

On 28/01/14 13:51, Heikki Linnakangas wrote:

Oh darn, I remembered we had already committed this, but clearly not. I'd
love to still get this into 9.4. The latest patch (hugepages-v5.patch) was
pretty much ready for commit, except for documentation.

I'm working on it. I ported it to HEAD and currently doing some
benchmarks. Next will be documentation.

Best regards,

--
Christian Kruse http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

#44Christian Kruse
christian@2ndQuadrant.com
In reply to: Heikki Linnakangas (#33)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Hi,

On 15/11/13 15:17, Heikki Linnakangas wrote:

I spent some time whacking this around, new patch version attached. I moved
the mmap() code into a new function, that leaves the PGSharedMemoryCreate
more readable.

I think there's a bug in this version of the patch. Have a look at
this:

+	if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY)
+	{
[…]
+		ptr = mmap(NULL, *size, PROT_READ | PROT_WRITE,
+				   PG_MMAP_FLAGS | MAP_HUGETLB, -1, 0);
[…]
+	}
+#endif
+
+	if (huge_tlb_pages == HUGE_TLB_OFF || huge_tlb_pages == HUGE_TLB_TRY)
+	{
+		allocsize = *size;
+		ptr = mmap(NULL, *size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS, -1, 0);
+	}

This will lead to a duplicate mmap() if hugepages work and
huge_tlb_pages == HUGE_TLB_TRY, or am I missing something?
I think it should be like this:

if (huge_tlb_pages == HUGE_TLB_OFF ||
(huge_tlb_pages == HUGE_TLB_TRY && ptr == MAP_FAILED))

Best regards,

--
Christian Kruse http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

#45Christian Kruse
christian@2ndQuadrant.com
In reply to: Christian Kruse (#44)
1 attachment(s)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Hi,

attached you will find a new version of the patch, ported to HEAD,
fixed the mentioned bug and - hopefully - dealing with the remaining
issues.

Best regards,

--
Christian Kruse http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

Attachments:

hugepages-v6.patchtext/x-diff; charset=us-asciiDownload
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 14ed6c7..e7c2559 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1107,6 +1107,43 @@ include 'filename'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-huge-tlb-pages" xreflabel="huge_tlb_pages">
+      <term><varname>huge_tlb_pages</varname> (<type>enum</type>)</term>
+      <indexterm>
+       <primary><varname>huge_tlb_pages</> configuration parameter</primary>
+      </indexterm>
+      <listitem>
+       <para>
+        Enables/disables the use of huge TLB pages. Valid values are
+        <literal>try</literal> (the default), <literal>on</literal>,
+        and <literal>off</literal>.
+       </para>
+
+       <para>
+        At present, this feature is supported only on Linux. The setting
+        is ignored on other systems.
+       </para>
+
+       <para>
+        The use of huge TLB pages results in smaller page tables and
+        less CPU time spent on memory management, increasing performance. For
+        more details, see
+        <ulink url="https://wiki.debian.org/Hugepages">the Debian wiki</ulink>.
+        Remember that you will need at least shared_buffers / huge page size +
+        1 huge TLB pages. So for example for a system with 6GB shared buffers
+        and a hugepage size of 2kb of you will need at least 3156 huge pages.
+       </para>
+
+       <para>
+        With <varname>huge_tlb_pages</varname> set to <literal>try</literal>,
+        the server will try to use huge pages, but fall back to using
+        normal allocation if that fails. With <literal>on</literal, failure
+        to use huge pages will prevent the server from starting up. With
+        <literal>off</literal>, huge pages will not be used.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-temp-buffers" xreflabel="temp_buffers">
       <term><varname>temp_buffers</varname> (<type>integer</type>)</term>
       <indexterm>
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 0d01617..b3b87d7 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -32,6 +32,7 @@
 #include "portability/mem.h"
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
+#include "utils/guc.h"
 
 
 typedef key_t IpcMemoryKey;		/* shared memory key passed to shmget(2) */
@@ -41,7 +42,7 @@ typedef int IpcMemoryId;		/* shared memory ID returned by shmget(2) */
 unsigned long UsedShmemSegID = 0;
 void	   *UsedShmemSegAddr = NULL;
 static Size AnonymousShmemSize;
-static void *AnonymousShmem;
+static void *AnonymousShmem = NULL;
 
 static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
 static void IpcMemoryDetach(int status, Datum shmaddr);
@@ -317,6 +318,80 @@ PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
 	return true;
 }
 
+/*
+ * Creates an anonymous mmap()ed shared memory segment.
+ *
+ * Pass the desired size in *size. This function will modify *size to the
+ * actual size of the allocation, if it ends up allocating a larger than
+ * desired segment.
+ */
+#ifndef EXEC_BACKEND
+static void *
+CreateAnonymousSegment(Size *size)
+{
+	Size		allocsize;
+	void	   *ptr = MAP_FAILED;
+
+#ifndef MAP_HUGETLB
+	if (huge_tlb_pages == HUGE_TLB_ON)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("huge TLB pages not supported on this platform")));
+#else
+	if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY)
+	{
+		/*
+		 * Round up the request size to a suitable large value.
+		 *
+		 * Some Linux kernel versions are known to have a bug, which causes
+		 * mmap() with MAP_HUGETLB to fail if the request size is not a
+		 * multiple of any supported huge page size. To work around that, we
+		 * round up the request size to nearest 2MB. 2MB is the most common
+		 * huge page page size on affected systems.
+		 *
+		 * Aside from that bug, even with a kernel that does the allocation
+		 * correctly, rounding it up ourselvees avoids wasting memory. Without
+		 * it, if we for example make an allocation of 2MB + 1 bytes, the
+		 * kernel might decide to use two 2MB huge pages for that, and waste 2
+		 * MB - 1 of memory. When we do the rounding ourselves, we can use
+		 * that space for allocations.
+		 */
+		int			hugepagesize = 2 * 1024 * 1024;
+
+		allocsize = *size;
+		if (allocsize % hugepagesize != 0)
+			allocsize += hugepagesize - (allocsize % hugepagesize);
+
+		ptr = mmap(NULL, *size, PROT_READ | PROT_WRITE,
+				   PG_MMAP_FLAGS | MAP_HUGETLB, -1, 0);
+		if (huge_tlb_pages == HUGE_TLB_TRY && ptr == MAP_FAILED)
+			elog(DEBUG1, "mmap with MAP_HUGETLB failed, huge pages disabled: %m");
+	}
+#endif
+
+	if (huge_tlb_pages == HUGE_TLB_OFF ||
+		(huge_tlb_pages == HUGE_TLB_TRY && ptr == MAP_FAILED))
+	{
+		allocsize = *size;
+		ptr = mmap(NULL, *size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS, -1, 0);
+	}
+
+	if (ptr == MAP_FAILED)
+		ereport(FATAL,
+				(errmsg("could not map anonymous shared memory: %m"),
+				 (errno == ENOMEM) ?
+				 errhint("This error usually means that PostgreSQL's request "
+					"for a shared memory segment exceeded available memory, "
+					  "swap space or huge pages. To reduce the request size "
+						 "(currently  %zu bytes), reduce PostgreSQL's shared "
+					   "memory usage, perhaps by reducing shared_buffers or "
+						 "max_connections.",
+						 *size) : 0));
+
+	*size = allocsize;
+	return ptr;
+}
+#endif
 
 /*
  * PGSharedMemoryCreate
@@ -344,7 +419,14 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 	PGShmemHeader *hdr;
 	IpcMemoryId shmid;
 	struct stat statbuf;
-	Size		sysvsize = size;
+	Size		sysvsize;
+
+#if defined(EXEC_BACKEND) || !defined(MAP_HUGETLB)
+	if (huge_tlb_pages == HUGE_TLB_ON)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("huge TLB pages not supported on this platform")));
+#endif
 
 	/* Room for a header? */
 	Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
@@ -359,6 +441,12 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 	 * to run many copies of PostgreSQL without needing to adjust system
 	 * settings.
 	 *
+	 * We assume that no one will attempt to run PostgreSQL 9.3 or later on
+	 * systems that are ancient enough that anonymous shared memory is not
+	 * supported, such as pre-2.4 versions of Linux.  If that turns out to be
+	 * false, we might need to add a run-time test here and do this only if
+	 * the running kernel supports it.
+	 *
 	 * However, we disable this logic in the EXEC_BACKEND case, and fall back
 	 * to the old method of allocating the entire segment using System V
 	 * shared memory, because there's no way to attach an mmap'd segment to a
@@ -366,44 +454,13 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 	 * developer use, this shouldn't be a big problem.
 	 */
 #ifndef EXEC_BACKEND
-	{
-		long		pagesize = sysconf(_SC_PAGE_SIZE);
-
-		/*
-		 * Ensure request size is a multiple of pagesize.
-		 *
-		 * pagesize will, for practical purposes, always be a power of two.
-		 * But just in case it isn't, we do it this way instead of using
-		 * TYPEALIGN().
-		 */
-		if (pagesize > 0 && size % pagesize != 0)
-			size += pagesize - (size % pagesize);
+	AnonymousShmem = CreateAnonymousSegment(&size);
+	AnonymousShmemSize = size;
 
-		/*
-		 * We assume that no one will attempt to run PostgreSQL 9.3 or later
-		 * on systems that are ancient enough that anonymous shared memory is
-		 * not supported, such as pre-2.4 versions of Linux.  If that turns
-		 * out to be false, we might need to add a run-time test here and do
-		 * this only if the running kernel supports it.
-		 */
-		AnonymousShmem = mmap(NULL, size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS,
-							  -1, 0);
-		if (AnonymousShmem == MAP_FAILED)
-			ereport(FATAL,
-					(errmsg("could not map anonymous shared memory: %m"),
-					 (errno == ENOMEM) ?
-				errhint("This error usually means that PostgreSQL's request "
-					 "for a shared memory segment exceeded available memory "
-					  "or swap space. To reduce the request size (currently "
-					  "%zu bytes), reduce PostgreSQL's shared memory usage, "
-						"perhaps by reducing shared_buffers or "
-						"max_connections.",
-						size) : 0));
-		AnonymousShmemSize = size;
-
-		/* Now we need only allocate a minimal-sized SysV shmem block. */
-		sysvsize = sizeof(PGShmemHeader);
-	}
+	/* Now we need only allocate a minimal-sized SysV shmem block. */
+	sysvsize = sizeof(PGShmemHeader);
+#else
+	sysvsize = size;
 #endif
 
 	/* Make sure PGSharedMemoryAttach doesn't fail without need */
diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c
index 80f1982..9b0cceb 100644
--- a/src/backend/port/win32_shmem.c
+++ b/src/backend/port/win32_shmem.c
@@ -128,6 +128,11 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 	DWORD		size_high;
 	DWORD		size_low;
 
+	if (huge_tlb_pages == HUGE_TLB_ON)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("huge TLB pages not supported on this platform")));
+
 	/* Room for a header? */
 	Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
 
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 2cc8f90..a9b9794 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -64,6 +64,7 @@
 #include "storage/dsm_impl.h"
 #include "storage/standby.h"
 #include "storage/fd.h"
+#include "storage/pg_shmem.h"
 #include "storage/proc.h"
 #include "storage/predicate.h"
 #include "tcop/tcopprot.h"
@@ -388,6 +389,23 @@ static const struct config_enum_entry synchronous_commit_options[] = {
 };
 
 /*
+ * Although only "on", "off", "try" are documented, we accept all the likely
+ * variants of "on" and "off".
+ */
+static const struct config_enum_entry huge_tlb_options[] = {
+	{"off", HUGE_TLB_OFF, false},
+	{"on", HUGE_TLB_ON, false},
+	{"try", HUGE_TLB_TRY, false},
+	{"true", HUGE_TLB_ON, true},
+	{"false", HUGE_TLB_OFF, true},
+	{"yes", HUGE_TLB_ON, true},
+	{"no", HUGE_TLB_OFF, true},
+	{"1", HUGE_TLB_ON, true},
+	{"0", HUGE_TLB_OFF, true},
+	{NULL, 0, false}
+};
+
+/*
  * Options for enum values stored in other modules
  */
 extern const struct config_enum_entry wal_level_options[];
@@ -448,6 +466,12 @@ int			tcp_keepalives_interval;
 int			tcp_keepalives_count;
 
 /*
+ * This really belongs in pg_shmem.c, but is defined here so that it doesn't
+ * need to be duplicated in all the different implementations of pg_shmem.c.
+ */
+int			huge_tlb_pages;
+
+/*
  * These variables are all dummies that don't do anything, except in some
  * cases provide the value for SHOW to display.  The real state is elsewhere
  * and is kept in sync by assign_hooks.
@@ -3430,6 +3454,15 @@ static struct config_enum ConfigureNamesEnum[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"huge_tlb_pages", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Use of huge TLB pages on Linux"),
+			NULL
+		},
+		&huge_tlb_pages,
+		HUGE_TLB_TRY, huge_tlb_options,
+		NULL, NULL, NULL
+	},
 
 	/* End-of-list marker */
 	{
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 7ad6b7c..c8673b3 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -115,6 +115,8 @@
 
 #shared_buffers = 32MB			# min 128kB
 					# (change requires restart)
+#huge_tlb_pages = try			# on, off, or try
+					# (change requires restart)
 #temp_buffers = 8MB			# min 800kB
 #max_prepared_transactions = 0		# zero disables the feature
 					# (change requires restart)
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 22ef901..df094e8 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -38,6 +38,16 @@ typedef struct PGShmemHeader	/* standard header for all Postgres shmem */
 #endif
 } PGShmemHeader;
 
+/* GUC variable */
+extern int huge_tlb_pages;
+
+/* Possible values for huge_tlb_pages */
+typedef enum
+{
+	HUGE_TLB_OFF,
+	HUGE_TLB_ON,
+	HUGE_TLB_TRY
+} HugeTlbType;
 
 #ifdef EXEC_BACKEND
 #ifndef WIN32
#46Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Christian Kruse (#45)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On 01/28/2014 06:11 PM, Christian Kruse wrote:

Hi,

attached you will find a new version of the patch, ported to HEAD,
fixed the mentioned bug and - hopefully - dealing with the remaining
issues.

Thanks, I have committed this now.

The documentation is still lacking. We should explain somewhere how to
set nr.hugepages, for example. The "Managing Kernel Resources" section
ought to mention setting. Could I ask you to work on that, please?

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#47Vik Fearing
vik.fearing@dalibo.com
In reply to: Heikki Linnakangas (#46)
1 attachment(s)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On 01/29/2014 01:12 PM, Heikki Linnakangas wrote:

On 01/28/2014 06:11 PM, Christian Kruse wrote:

Hi,

attached you will find a new version of the patch, ported to HEAD,
fixed the mentioned bug and - hopefully - dealing with the remaining
issues.

Thanks, I have committed this now.

The documentation is still lacking.

The documentation is indeed lacking since it breaks the build.

doc/src/sgml/config.sgml contains the line

normal allocation if that fails. With <literal>on</literal, failure

which doesn't correctly terminate the closing </literal> tag.

Trivial patch attached.

--
Vik

Attachments:

fix_tag.patchtext/x-diff; name=fix_tag.patchDownload
*** a/doc/src/sgml/config.sgml
--- b/doc/src/sgml/config.sgml
***************
*** 1137,1143 **** include 'filename'
         <para>
          With <varname>huge_tlb_pages</varname> set to <literal>try</literal>,
          the server will try to use huge pages, but fall back to using
!         normal allocation if that fails. With <literal>on</literal, failure
          to use huge pages will prevent the server from starting up. With
          <literal>off</literal>, huge pages will not be used.
         </para>
--- 1137,1143 ----
         <para>
          With <varname>huge_tlb_pages</varname> set to <literal>try</literal>,
          the server will try to use huge pages, but fall back to using
!         normal allocation if that fails. With <literal>on</literal>, failure
          to use huge pages will prevent the server from starting up. With
          <literal>off</literal>, huge pages will not be used.
         </para>
#48Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Vik Fearing (#47)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On 01/29/2014 04:01 PM, Vik Fearing wrote:

On 01/29/2014 01:12 PM, Heikki Linnakangas wrote:

The documentation is still lacking.

The documentation is indeed lacking since it breaks the build.

doc/src/sgml/config.sgml contains the line

normal allocation if that fails. With <literal>on</literal, failure

which doesn't correctly terminate the closing </literal> tag.

Trivial patch attached.

Thanks, applied!

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#49Merlin Moncure
mmoncure@gmail.com
In reply to: Christian Kruse (#43)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On Tue, Jan 28, 2014 at 5:58 AM, Christian Kruse
<christian@2ndquadrant.com> wrote:

Hi,

On 28/01/14 13:51, Heikki Linnakangas wrote:

Oh darn, I remembered we had already committed this, but clearly not. I'd
love to still get this into 9.4. The latest patch (hugepages-v5.patch) was
pretty much ready for commit, except for documentation.

I'm working on it. I ported it to HEAD and currently doing some
benchmarks. Next will be documentation.

you mentioned benchmarks -- do you happen to have the results handy? (curious)

merlin

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#50Jeff Janes
jeff.janes@gmail.com
In reply to: Heikki Linnakangas (#46)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On Wed, Jan 29, 2014 at 4:12 AM, Heikki Linnakangas <hlinnakangas@vmware.com> wrote:

On 01/28/2014 06:11 PM, Christian Kruse wrote:

Hi,

attached you will find a new version of the patch, ported to HEAD,
fixed the mentioned bug and - hopefully - dealing with the remaining
issues.

Thanks, I have committed this now.

I'm getting this warning now with gcc (GCC) 4.4.7:

pg_shmem.c: In function 'PGSharedMemoryCreate':
pg_shmem.c:332: warning: 'allocsize' may be used uninitialized in this
function
pg_shmem.c:332: note: 'allocsize' was declared here

Cheers,

Jeff

#51Christian Kruse
christian@2ndQuadrant.com
In reply to: Heikki Linnakangas (#46)
1 attachment(s)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Hi,

On 29/01/14 14:12, Heikki Linnakangas wrote:

The documentation is still lacking. We should explain somewhere how to set
nr.hugepages, for example. The "Managing Kernel Resources" section ought to
mention setting. Could I ask you to work on that, please?

Of course! Attached you will find a patch for better documentation.

Best regards,

--
Christian Kruse http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

Attachments:

huge_tlb_docs.patchtext/x-diff; charset=us-asciiDownload
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
old mode 100644
new mode 100755
index 1b5f831..68b38f7
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1128,10 +1128,7 @@ include 'filename'
         The use of huge TLB pages results in smaller page tables and
         less CPU time spent on memory management, increasing performance. For
         more details, see
-        <ulink url="https://wiki.debian.org/Hugepages">the Debian wiki</ulink>.
-        Remember that you will need at least shared_buffers / huge page size +
-        1 huge TLB pages. So for example for a system with 6GB shared buffers
-        and a hugepage size of 2kb of you will need at least 3156 huge pages.
+        <link linkend="linux-huge-tlb-pages">Linux huge TLB pages</link>.
        </para>
 
        <para>
diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml
old mode 100644
new mode 100755
index bbb808f..2288c7b
--- a/doc/src/sgml/runtime.sgml
+++ b/doc/src/sgml/runtime.sgml
@@ -1307,6 +1307,83 @@ echo -1000 > /proc/self/oom_score_adj
    </para>
    </note>
   </sect2>
+
+  <sect2 id="linux-huge-tlb-pages">
+   <title>Linux huge TLB pages</title>
+
+   <para>
+    Nowadays memory address spaces for processes are virtual. This means, when
+    a process reserves memory, it gets a virtual memory address which has to
+    be translated to a physical memory address by the OS or the CPU. This can
+    be done via calculations, but since memory is accessed very often there is
+    a cache for that, called Translation Lookaside Buffer,
+    short <emphasis>TLB</emphasis>.
+   </para>
+
+   <para>
+    When a process reserves memory, this is done in chunks (often
+    of <literal>4kb</literal>) named pages. This means if a process requires
+    1GB of RAM, it has <literal>262144</literal> (<literal>1GB</literal>
+    / <literal>4KB</literal>) pages and therefore <literal>262144</literal>
+    entries for the translation table. Since the TLB has a limited number of
+    entries it is obvious that they can't be they can't all be cached, which
+    will lead to loss of performance.
+   </para>
+
+   <para>
+    One way to tune this is to increase the page size. Most platforms allow
+    larger pages, e.g. x86 allows pages of <literal>2MB</literal>. This would
+    reduce the number of pages to <literal>512</literal>
+    (<literal>1GB</literal> / <literal>2MB</literal>). This reduces the number
+    of necessary lookups drastrically.
+   </para>
+
+   <para>
+    To enable this feature in <productname>PostgreSQL</productname> you need a
+    kernel with <varname>CONFIG_HUGETLBFS=y</varname> and
+    <varname>CONFIG_HUGETLB_PAGE=y</varname>. You also have to tune the system
+    setting <varname>vm.nr_hugepages</varname>. To calculate the number of
+    necessary huge pages start <productname>PostgreSQL</productname> without
+    huge pages enabled and check the <varname>VmPeak<varname> value from the
+    proc filesystem:
+
+<programlisting>
+$ <userinput>head -1 /path/to/data/directory/postmaster.pid</userinput>
+4170
+$ <userinput>grep ^VmPeak /proc/4170/status</userinput>
+VmPeak:	 6490428 kB
+</programlisting>
+     <literal>6490428</literal> / <literal>2048</literal>
+     (<varname>PAGE_SIZE</varname> <literal>2MB</literal>) are
+     roughly <literal>3169.154</literal> huge pages, so you will need at
+     least <literal>3170</literal> huge pages:
+<programlisting>
+sysctl -w vm.nr_hugepages=3170
+</programlisting>
+    Sometimes the kernel is not able to allocate the desired number of huge
+    pages, so it might be necessary to repeat that command or to reboot. Don't
+    forget to add an entry to <filename>/etc/sysctl.conf</filename> to persist
+    this setting through reboots.
+   </para>
+
+   <para>
+    The default behavior for huge pages
+    in <productname>PostgreSQL</productname> is to use them when possible and
+    to fallback to normal pages when failing. To enforce the use of huge
+    pages, you can
+    set <link linkend="guc-huge-tlb-pages"><varname>huge_tlb_pages</varname></link>
+    to <literal>on</literal>. Note that in this
+    case <productname>PostgreSQL</productname> will fail to start if not
+    enough huge pages are available.
+   </para>
+
+   <para>
+    For a detailed description of the <productname>Linux</productname> huge
+    pages feature have a look
+    at <ulink url="">https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt</ulink>.
+   </para>
+
+  </sect2>
  </sect1>
 
 
#52Christian Kruse
christian@2ndquadrant.com
In reply to: Jeff Janes (#50)
1 attachment(s)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Hi,

On 29/01/14 10:11, Jeff Janes wrote:

I'm getting this warning now with gcc (GCC) 4.4.7:

Interesting. I don't get that warning. But the compiler is (formally)
right.

pg_shmem.c: In function 'PGSharedMemoryCreate':
pg_shmem.c:332: warning: 'allocsize' may be used uninitialized in this
function
pg_shmem.c:332: note: 'allocsize' was declared here

Attached patch should fix that.

Best regards,

--
Christian Kruse http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

Attachments:

hugepages-v7.patchtext/x-diff; charset=us-asciiDownload
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index f7596bf..c39dfb6 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -329,7 +329,7 @@ PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
 static void *
 CreateAnonymousSegment(Size *size)
 {
-	Size		allocsize;
+	Size		allocsize = *size;
 	void	   *ptr = MAP_FAILED;
 
 #ifndef MAP_HUGETLB
@@ -358,7 +358,6 @@ CreateAnonymousSegment(Size *size)
 		 */
 		int			hugepagesize = 2 * 1024 * 1024;
 
-		allocsize = *size;
 		if (allocsize % hugepagesize != 0)
 			allocsize += hugepagesize - (allocsize % hugepagesize);
 
@@ -372,7 +371,6 @@ CreateAnonymousSegment(Size *size)
 	if (huge_tlb_pages == HUGE_TLB_OFF ||
 		(huge_tlb_pages == HUGE_TLB_TRY && ptr == MAP_FAILED))
 	{
-		allocsize = *size;
 		ptr = mmap(NULL, *size, PROT_READ | PROT_WRITE, PG_MMAP_FLAGS, -1, 0);
 	}
 
#53Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Christian Kruse (#52)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On 01/29/2014 09:18 PM, Christian Kruse wrote:

Hi,

On 29/01/14 10:11, Jeff Janes wrote:

I'm getting this warning now with gcc (GCC) 4.4.7:

Interesting. I don't get that warning. But the compiler is (formally)
right.

pg_shmem.c: In function 'PGSharedMemoryCreate':
pg_shmem.c:332: warning: 'allocsize' may be used uninitialized in this
function
pg_shmem.c:332: note: 'allocsize' was declared here

Hmm, I didn't get that warning either.

Attached patch should fix that.

That's not quite right. If the first mmap() fails, allocsize is set to
the rounded-up size, but the second mmap() uses the original size for
the allocation. So it returns a too high value to the caller.

Ugh, it's actually broken anyway :-(. The first allocation also passes
*size to mmap(), so the calculated rounded-up allocsize value is not
used for anything.

Fix pushed.

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#54Christian Kruse
christian@2ndquadrant.com
In reply to: Heikki Linnakangas (#53)
1 attachment(s)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Hi,

On 29/01/14 21:36, Heikki Linnakangas wrote:

[…]
Fix pushed.

You are right. Thanks. But there is another bug, see

<20140128154307.GC24091@defunct.ch>

ff. Attached you will find a patch fixing that.

Best regards,

--
Christian Kruse http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

Attachments:

hugepages-v8.patchtext/x-diff; charset=us-asciiDownload
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index ac3a9fe..cf590a0 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -380,9 +380,12 @@ CreateAnonymousSegment(Size *size)
 	}
 
 	if (ptr == MAP_FAILED)
+	{
+		int			saved_errno = errno;
+
 		ereport(FATAL,
 				(errmsg("could not map anonymous shared memory: %m"),
-				 (errno == ENOMEM) ?
+				 (saved_errno == ENOMEM) ?
 				 errhint("This error usually means that PostgreSQL's request "
 					"for a shared memory segment exceeded available memory, "
 					  "swap space or huge pages. To reduce the request size "
@@ -390,6 +393,7 @@ CreateAnonymousSegment(Size *size)
 					   "memory usage, perhaps by reducing shared_buffers or "
 						 "max_connections.",
 						 *size) : 0));
+	}
 
 	*size = allocsize;
 	return ptr;
#55Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Christian Kruse (#54)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On 01/29/2014 09:59 PM, Christian Kruse wrote:

Hi,

On 29/01/14 21:36, Heikki Linnakangas wrote:

[…]
Fix pushed.

You are right. Thanks. But there is another bug, see

<20140128154307.GC24091@defunct.ch>

ff. Attached you will find a patch fixing that.

Thanks. There are more cases of that in InternalIpcMemoryCreate, they
ought to be fixed as well. And should also grep the rest of the codebase
for more instances of that. And this needs to be back-patched.

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#56Christian Kruse
christian@2ndquadrant.com
In reply to: Heikki Linnakangas (#55)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Hi,

On 29/01/14 22:17, Heikki Linnakangas wrote:

Thanks. There are more cases of that in InternalIpcMemoryCreate, they ought
to be fixed as well. And should also grep the rest of the codebase for more
instances of that. And this needs to be back-patched.

I'm way ahead of you ;-) Working on it.

Best regards,

--
Christian Kruse http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

#57Christian Kruse
christian@2ndQuadrant.com
In reply to: Christian Kruse (#51)
1 attachment(s)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Hi,

after I finally got documentation compilation working I updated the
patch to be syntactically correct. You will find it attached.

Best regards,

--
Christian Kruse http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

Attachments:

huge_tlb_docs-v1.patchtext/x-diff; charset=us-asciiDownload
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 1b5f831..68b38f7 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1128,10 +1128,7 @@ include 'filename'
         The use of huge TLB pages results in smaller page tables and
         less CPU time spent on memory management, increasing performance. For
         more details, see
-        <ulink url="https://wiki.debian.org/Hugepages">the Debian wiki</ulink>.
-        Remember that you will need at least shared_buffers / huge page size +
-        1 huge TLB pages. So for example for a system with 6GB shared buffers
-        and a hugepage size of 2kb of you will need at least 3156 huge pages.
+        <link linkend="linux-huge-tlb-pages">Linux huge TLB pages</link>.
        </para>
 
        <para>
diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml
index bbb808f..0b98314 100644
--- a/doc/src/sgml/runtime.sgml
+++ b/doc/src/sgml/runtime.sgml
@@ -1307,6 +1307,82 @@ echo -1000 > /proc/self/oom_score_adj
    </para>
    </note>
   </sect2>
+
+  <sect2 id="linux-huge-tlb-pages">
+   <title>Linux huge TLB pages</title>
+
+   <para>
+    Nowadays memory address spaces for processes are virtual. This means, when
+    a process reserves memory, it gets a virtual memory address which has to
+    be translated to a physical memory address by the OS or the CPU. This can
+    be done via calculations, but since memory is accessed very often there is
+    a cache for that, called Translation Lookaside Buffer,
+    short <emphasis>TLB</emphasis>.
+   </para>
+
+   <para>
+    When a process reserves memory, this is done in chunks (often
+    of <literal>4KB</literal>) named pages. This means if a process requires
+    1GB of RAM, it has <literal>262144</literal> (<literal>1GB</literal>
+    / <literal>4KB</literal>) pages and therefore <literal>262144</literal>
+    entries for the translation table. Since the TLB has a limited number of
+    entries it is obvious that they can't all be cached, which
+    will lead to loss of performance.
+   </para>
+
+   <para>
+    One way to tune this is to increase the page size. Most platforms allow
+    larger pages, e.g. x86 allows pages of <literal>2MB</literal>. This would
+    reduce the number of pages to <literal>512</literal>
+    (<literal>1GB</literal> / <literal>2MB</literal>). This reduces the number
+    of necessary lookups drastically.
+   </para>
+
+   <para>
+    To enable this feature in <productname>PostgreSQL</productname> you need a
+    kernel with <varname>CONFIG_HUGETLBFS=y</varname> and
+    <varname>CONFIG_HUGETLB_PAGE=y</varname>. You also have to tune the system
+    setting <varname>vm.nr_hugepages</varname>. To calculate the number of
+    necessary huge pages start <productname>PostgreSQL</productname> without
+    huge pages enabled and check the <varname>VmPeak</varname> value from the
+    proc filesystem:
+<programlisting>
+$ <userinput>head -1 /path/to/data/directory/postmaster.pid</userinput>
+4170
+$ <userinput>grep ^VmPeak /proc/4170/status</userinput>
+VmPeak:  6490428 kB
+</programlisting>
+     <literal>6490428</literal> / <literal>2048</literal>
+     (<varname>PAGE_SIZE</varname> <literal>2MB</literal>) are
+     roughly <literal>3169.154</literal> huge pages, so you will need at
+     least <literal>3170</literal> huge pages:
+<programlisting>
+$ <userinput>sysctl -w vm.nr_hugepages=3170</userinput>
+</programlisting>
+    Sometimes the kernel is not able to allocate the desired number of huge
+    pages, so it might be necessary to repeat that command or to reboot. Don't
+    forget to add an entry to <filename>/etc/sysctl.conf</filename> to persist
+    this setting through reboots.
+   </para>
+
+   <para>
+    The default behavior for huge pages
+    in <productname>PostgreSQL</productname> is to use them when possible and
+    to fall back to normal pages when failing. To enforce the use of huge
+    pages, you can
+    set <link linkend="guc-huge-tlb-pages"><varname>huge_tlb_pages</varname></link>
+    to <literal>on</literal>. Note that in this
+    case <productname>PostgreSQL</productname> will fail to start if not
+    enough huge pages are available.
+   </para>
+
+   <para>
+    For a detailed description of the <productname>Linux</productname> huge
+    pages feature have a look
+    at <ulink url="https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt">https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt</ulink>.
+   </para>
+
+  </sect2>
  </sect1>
 
 
#58Peter Eisentraut
peter_e@gmx.net
In reply to: Christian Kruse (#57)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On 1/30/14, 2:28 AM, Christian Kruse wrote:

after I finally got documentation compilation working I updated the
patch to be syntactically correct. You will find it attached.

I don't think we should be explaining the basics of OS memory management
in our documentation. And if we did, we shouldn't copy it verbatim from
the Debian wiki without attribution.

I think this patch should be cut down to the paragraphs that cover the
actual configuration.

On a technical note, use <xref> instead of <link> for linking.
doc/src/sgml/README.links contains some information.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#59Robert Haas
robertmhaas@gmail.com
In reply to: Peter Eisentraut (#58)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On Tue, Feb 25, 2014 at 10:29 AM, Peter Eisentraut <peter_e@gmx.net> wrote:

And if we did, we shouldn't copy it verbatim from
the Debian wiki without attribution.

That is seriously not cool.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#60Andres Freund
andres@2ndquadrant.com
In reply to: Peter Eisentraut (#58)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On 2014-02-25 10:29:32 -0500, Peter Eisentraut wrote:

On 1/30/14, 2:28 AM, Christian Kruse wrote:

after I finally got documentation compilation working I updated the
patch to be syntactically correct. You will find it attached.

I don't think we should be explaining the basics of OS memory management
in our documentation.

Agreed.

And if we did, we shouldn't copy it verbatim from the Debian wiki
without attribution.

Is it actually? A quick comparison doesn't show that many similarities?
Christian?

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#61Christian Kruse
christian@2ndQuadrant.com
In reply to: Peter Eisentraut (#58)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Hi,

On 25/02/14 10:29, Peter Eisentraut wrote:

I don't think we should be explaining the basics of OS memory management
in our documentation.

Well, I'm confused. I thought that's exactly what has been asked.

And if we did, we shouldn't copy it verbatim from the Debian wiki
without attribution.

I didn't. This is a write-up of several articles, blog posts and
documentation I read about this topic.

However, if you think the texts are too similar, then we should add a
note, yes. Didn't mean to copy w/o referring to a source.

I think this patch should be cut down to the paragraphs that cover the
actual configuration.

I tried to cover the issues Heikki brought up in
<52861EEC.2090702@vmware.com>.

On a technical note, use <xref> instead of <link> for linking.
doc/src/sgml/README.links contains some information.

OK, I will post an updated patch later this evening.

Best regards,

--
Christian Kruse http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

#62Christian Kruse
christian@2ndQuadrant.com
In reply to: Andres Freund (#60)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Hi,

On 25/02/14 17:01, Andres Freund wrote:

And if we did, we shouldn't copy it verbatim from the Debian wiki
without attribution.

Is it actually? A quick comparison doesn't show that many similarities?
Christian?

Not as far as I know. But of course, as I wrote the text I _also_
(that's not my only source) read the Debian article and I was
influenced by it. It may be that the texts are more similar than I
thought, although I still don't see it.

Best regards,

--
Christian Kruse http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

#63Peter Eisentraut
peter_e@gmx.net
In reply to: Christian Kruse (#62)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On 2/25/14, 11:08 AM, Christian Kruse wrote:

Hi,

On 25/02/14 17:01, Andres Freund wrote:

And if we did, we shouldn't copy it verbatim from the Debian wiki
without attribution.

Is it actually? A quick comparison doesn't show that many similarities?
Christian?

Not as far as I know. But of course, as I wrote the text I _also_
(that's not my only source) read the Debian article and I was
influenced by it. It may be that the texts are more similar than I
thought, although I still don't see it.

I suspect that it was done subconsciously. But I did notice it right
away, so there is something to it.

As I mentioned, I would just cut those introductory parts out.

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#64Bruce Momjian
bruce@momjian.us
In reply to: Peter Eisentraut (#63)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On Tue, Feb 25, 2014 at 12:18:02PM -0500, Peter Eisentraut wrote:

On 2/25/14, 11:08 AM, Christian Kruse wrote:

Hi,

On 25/02/14 17:01, Andres Freund wrote:

And if we did, we shouldn't copy it verbatim from the Debian wiki
without attribution.

Is it actually? A quick comparison doesn't show that many similarities?
Christian?

Not as far as I know. But of course, as I wrote the text I _also_
(that's not my only source) read the Debian article and I was
influenced by it. It may be that the texts are more similar than I
thought, although I still don't see it.

I suspect that it was done subconsciously. But I did notice it right
away, so there is something to it.

As I mentioned, I would just cut those introductory parts out.

Should we link to the Debian wiki content?

--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com

+ Everyone has their own god. +

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#65Tom Lane
tgl@sss.pgh.pa.us
In reply to: Bruce Momjian (#64)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Bruce Momjian <bruce@momjian.us> writes:

On Tue, Feb 25, 2014 at 12:18:02PM -0500, Peter Eisentraut wrote:

As I mentioned, I would just cut those introductory parts out.

Should we link to the Debian wiki content?

-1. We generally don't link to our *own* wiki in our SGML docs, let alone
things that aren't even under our project's control. Moreover, Debian
is not going to be explaining these things in a way that accounts for
non-Linux operating systems.

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#66Andres Freund
andres@2ndQuadrant.com
In reply to: Tom Lane (#65)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On 2014-02-25 13:21:46 -0500, Tom Lane wrote:

Bruce Momjian <bruce@momjian.us> writes:

On Tue, Feb 25, 2014 at 12:18:02PM -0500, Peter Eisentraut wrote:

As I mentioned, I would just cut those introductory parts out.

Should we link to the Debian wiki content?

-1. We generally don't link to our *own* wiki in our SGML docs, let alone
things that aren't even under our project's control.

Agreed. Especially as the interesting bit is the postgres specific
logic, not the rest.

I think all that's needed is to cut the first paragraphs that generally
explain what huge pages are in some detail from the text and make sure
the later paragraphs don't refer to the earlier ones.

Moreover, Debian
is not going to be explaining these things in a way that accounts for
non-Linux operating systems.

It's a linux only feature so far, so that alone wouldn't be a problem.

Greetings,

Andres Freund

--
Andres Freund http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#67Christian Kruse
christian@2ndQuadrant.com
In reply to: Andres Freund (#66)
1 attachment(s)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Hi,

On 25/02/14 19:28, Andres Freund wrote:

I think all that's needed is to cut the first paragraphs that generally
explain what huge pages are in some detail from the text and make sure
the later paragraphs don't refer to the earlier ones.

Attached you will find a new version of the patch.

Best regards,

--
Christian Kruse http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

Attachments:

huge_tlb_docs-v2.patchtext/x-diff; charset=us-asciiDownload
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 4dc1277..0006090 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1128,10 +1128,7 @@ include 'filename'
         The use of huge TLB pages results in smaller page tables and
         less CPU time spent on memory management, increasing performance. For
         more details, see
-        <ulink url="https://wiki.debian.org/Hugepages">the Debian wiki</ulink>.
-        Remember that you will need at least shared_buffers / huge page size +
-        1 huge TLB pages. So for example for a system with 6GB shared buffers
-        and a hugepage size of 2kb of you will need at least 3156 huge pages.
+        <xref linkend="linux-huge-tlb-pages">.
        </para>
 
        <para>
diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml
index bbb808f..f172526 100644
--- a/doc/src/sgml/runtime.sgml
+++ b/doc/src/sgml/runtime.sgml
@@ -1307,6 +1307,55 @@ echo -1000 > /proc/self/oom_score_adj
    </para>
    </note>
   </sect2>
+
+  <sect2 id="linux-huge-tlb-pages">
+   <title>Linux huge TLB pages</title>
+
+   <para>
+    To enable this feature in <productname>PostgreSQL</productname> you need a
+    kernel with <varname>CONFIG_HUGETLBFS=y</varname> and
+    <varname>CONFIG_HUGETLB_PAGE=y</varname>. You also have to tune the system
+    setting <varname>vm.nr_hugepages</varname>. To calculate the number of
+    necessary huge pages start <productname>PostgreSQL</productname> without
+    huge pages enabled and check the <varname>VmPeak</varname> value from the
+    proc filesystem:
+<programlisting>
+$ <userinput>head -1 /path/to/data/directory/postmaster.pid</userinput>
+4170
+$ <userinput>grep ^VmPeak /proc/4170/status</userinput>
+VmPeak:  6490428 kB
+</programlisting>
+     <literal>6490428</literal> / <literal>2048</literal>
+     (<varname>PAGE_SIZE</varname> is <literal>2MB</literal> in this case) are
+     roughly <literal>3169.154</literal> huge pages, so you will need at
+     least <literal>3170</literal> huge pages:
+<programlisting>
+$ <userinput>sysctl -w vm.nr_hugepages=3170</userinput>
+</programlisting>
+    Sometimes the kernel is not able to allocate the desired number of huge
+    pages, so it might be necessary to repeat that command or to reboot. Don't
+    forget to add an entry to <filename>/etc/sysctl.conf</filename> to persist
+    this setting through reboots.
+   </para>
+
+   <para>
+    The default behavior for huge pages in
+    <productname>PostgreSQL</productname> is to use them when possible and
+    to fall back to normal pages when failing. To enforce the use of huge
+    pages, you can set
+    <link linkend="guc-huge-tlb-pages"><varname>huge_tlb_pages</varname></link>
+    to <literal>on</literal>. Note that in this case
+    <productname>PostgreSQL</productname> will fail to start if not enough huge
+    pages are available.
+   </para>
+
+   <para>
+    For a detailed description of the <productname>Linux</productname> huge
+    pages feature have a look
+    at <ulink url="https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt">https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt</ulink>.
+   </para>
+
+  </sect2>
  </sect1>
 
 
#68Christian Kruse
christian@2ndQuadrant.com
In reply to: Peter Eisentraut (#63)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Hi Peter,

after a night of sleep I'm still not able to swallow the pill. To be
honest I'm a little bit angry about this accusation.

I didn't mean to copy from the Debian wiki and after re-checking the
text again I'm still convinced that I didn't.

Of course the text SAYS something similar, but this is in the nature
of things. Structure, diction and focus are different. Also the
information transferred is different and gathered from various
articles, including the Debian wiki, the huge page docs of the kernel,
the Wikipedia and some old IBM and Oracle docs.

Best regards,

--
Christian Kruse http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

#69Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Christian Kruse (#67)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On 02/26/2014 10:35 AM, Christian Kruse wrote:

On 25/02/14 19:28, Andres Freund wrote:

I think all that's needed is to cut the first paragraphs that generally
explain what huge pages are in some detail from the text and make sure
the later paragraphs don't refer to the earlier ones.

Attached you will find a new version of the patch.

Thanks!

huge_tlb_pages (enum)

Enables/disables the use of huge TLB pages. Valid values are try (the default), on, and off.

At present, this feature is supported only on Linux. The setting is ignored on other systems.

The use of huge TLB pages results in smaller page tables and less CPU time spent on memory management, increasing performance. For more details, see Section 17.4.4.

With huge_tlb_pages set to try, the server will try to use huge pages, but fall back to using normal allocation if that fails. With on, failure to use huge pages will prevent the server from starting up. With off, huge pages will not be used.

That still says "The setting is ignored on other systems". That's not
quite true: as explained later in the section, if you set
huge_tlb_pages=on and the platform doesn't support it, the server will
refuse to start.

17.4.4. Linux huge TLB pages

This section looks good to me. I'm OK with the level of detail, although
maybe just a sentence or two about what huge TLB pages are and what
benefits they have would still be in order. How about adding something
like this as the first sentence:

"Using huge TLB pages reduces overhead when using large contiguous
chunks of memory, like PostgreSQL does."

To enable this feature in PostgreSQL you need a kernel with CONFIG_HUGETLBFS=y and CONFIG_HUGETLB_PAGE=y. You also have to tune the system setting vm.nr_hugepages. To calculate the number of necessary huge pages start PostgreSQL without huge pages enabled and check the VmPeak value from the proc filesystem:

$ head -1 /path/to/data/directory/postmaster.pid
4170
$ grep ^VmPeak /proc/4170/status
VmPeak: 6490428 kB

6490428 / 2048 (PAGE_SIZE is 2MB in this case) are roughly 3169.154 huge pages, so you will need at least 3170 huge pages:

$ sysctl -w vm.nr_hugepages=3170

That's good advice, but perhaps s/calculate/estimate/. It's just an
approximation, after all.

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#70Christian Kruse
christian@2ndQuadrant.com
In reply to: Heikki Linnakangas (#69)
1 attachment(s)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Hi,

On 26/02/14 14:34, Heikki Linnakangas wrote:

That still says "The setting is ignored on other systems". That's not quite
true: as explained later in the section, if you set huge_tlb_pages=on and
the platform doesn't support it, the server will refuse to start.

I added a sentence about it.

"Using huge TLB pages reduces overhead when using large contiguous chunks of
memory, like PostgreSQL does."

Sentence added.

That's good advice, but perhaps s/calculate/estimate/. It's just an
approximation, after all.

Fixed.

New patch version is attached.

Best regards,

--
Christian Kruse http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

Attachments:

huge_tlb_docs-v3.patchtext/x-diff; charset=us-asciiDownload
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 4dc1277..c5c2d8b 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1120,18 +1120,17 @@ include 'filename'
        </para>
 
        <para>
-        At present, this feature is supported only on Linux. The setting
-        is ignored on other systems.
+        At present, this feature is supported only on Linux. The setting is
+        ignored on other systems when set to <literal>try</literal>.
+        <productname>PostgreSQL</productname> will
+        refuse to start when set to <literal>on</literal>.
        </para>
 
        <para>
         The use of huge TLB pages results in smaller page tables and
         less CPU time spent on memory management, increasing performance. For
         more details, see
-        <ulink url="https://wiki.debian.org/Hugepages">the Debian wiki</ulink>.
-        Remember that you will need at least shared_buffers / huge page size +
-        1 huge TLB pages. So for example for a system with 6GB shared buffers
-        and a hugepage size of 2kb of you will need at least 3156 huge pages.
+        <xref linkend="linux-huge-tlb-pages">.
        </para>
 
        <para>
diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml
index bbb808f..5f9fa61 100644
--- a/doc/src/sgml/runtime.sgml
+++ b/doc/src/sgml/runtime.sgml
@@ -1307,6 +1307,57 @@ echo -1000 > /proc/self/oom_score_adj
    </para>
    </note>
   </sect2>
+
+  <sect2 id="linux-huge-tlb-pages">
+   <title>Linux huge TLB pages</title>
+
+   <para>
+    Using huge TLB pages reduces overhead when using large contiguous chunks
+    of memory, like PostgreSQL does. To enable this feature
+    in <productname>PostgreSQL</productname> you need a kernel
+    with <varname>CONFIG_HUGETLBFS=y</varname> and
+    <varname>CONFIG_HUGETLB_PAGE=y</varname>. You also have to tune the system
+    setting <varname>vm.nr_hugepages</varname>. To estimate the number of
+    necessary huge pages start <productname>PostgreSQL</productname> without
+    huge pages enabled and check the <varname>VmPeak</varname> value from the
+    proc filesystem:
+<programlisting>
+$ <userinput>head -1 /path/to/data/directory/postmaster.pid</userinput>
+4170
+$ <userinput>grep ^VmPeak /proc/4170/status</userinput>
+VmPeak:  6490428 kB
+</programlisting>
+     <literal>6490428</literal> / <literal>2048</literal>
+     (<varname>PAGE_SIZE</varname> is <literal>2MB</literal> in this case) are
+     roughly <literal>3169.154</literal> huge pages, so you will need at
+     least <literal>3170</literal> huge pages:
+<programlisting>
+$ <userinput>sysctl -w vm.nr_hugepages=3170</userinput>
+</programlisting>
+    Sometimes the kernel is not able to allocate the desired number of huge
+    pages, so it might be necessary to repeat that command or to reboot. Don't
+    forget to add an entry to <filename>/etc/sysctl.conf</filename> to persist
+    this setting through reboots.
+   </para>
+
+   <para>
+    The default behavior for huge pages in
+    <productname>PostgreSQL</productname> is to use them when possible and
+    to fall back to normal pages when failing. To enforce the use of huge
+    pages, you can set
+    <link linkend="guc-huge-tlb-pages"><varname>huge_tlb_pages</varname></link>
+    to <literal>on</literal>. Note that in this case
+    <productname>PostgreSQL</productname> will fail to start if not enough huge
+    pages are available.
+   </para>
+
+   <para>
+    For a detailed description of the <productname>Linux</productname> huge
+    pages feature have a look
+    at <ulink url="https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt">https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt</ulink>.
+   </para>
+
+  </sect2>
  </sect1>
 
 
#71Alvaro Herrera
alvherre@2ndquadrant.com
In reply to: Heikki Linnakangas (#69)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

There's one thing that rubs me the wrong way about all this
functionality, which is that we've named it "huge TLB pages". That is
wrong -- the TLB pages are not huge. In fact, as far as I understand,
the TLB doesn't have pages at all. It's the pages that are huge, but
those pages are not TLB pages, they are just memory pages.

I think we have named it this way only because Linux for some reason
named the mmap() flag MAP_HUGETLB. The TLB is not huge
either (in fact you can't alter the size of the TLB at all; it's a
hardware thing.) I think this flag means "use the TLB entries reserved
for huge pages for the memory I'm requesting".

Since we haven't released any of this, should we discuss renaming it to
just "huge pages"?

--
Álvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#72Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Alvaro Herrera (#71)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On 02/26/2014 06:13 PM, Alvaro Herrera wrote:

There's one thing that rubs me the wrong way about all this
functionality, which is that we've named it "huge TLB pages". That is
wrong -- the TLB pages are not huge. In fact, as far as I understand,
the TLB doesn't have pages at all. It's the pages that are huge, but
those pages are not TLB pages, they are just memory pages.

I think we have named it this way only because Linux for some reason
named the mmap() flag MAP_HUGETLB. The TLB is not huge
either (in fact you can't alter the size of the TLB at all; it's a
hardware thing.) I think this flag means "use the TLB entries reserved
for huge pages for the memory I'm requesting".

Since we haven't released any of this, should we discuss renaming it to
just "huge pages"?

Linux calls it "huge tlb pages" in many places, not just MAP_HUGETLB.
Like in CONFIG_HUGETLB_PAGE and hugetlbfs. I agree it's a bit weird.
Linux also calls it just "huge pages" in many other places, like in
/proc/meminfo output.

FreeBSD calls them superpages and Windows calls them "large pages".
Yeah, it would seem better to call them just "huge pages", so that it's
more reminiscent of those names, if we ever implement support for
super/huge/large pages on other platforms.

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#73Stephen Frost
sfrost@snowman.net
In reply to: Christian Kruse (#68)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Christian,

Thanks for working on all of this and dealing with the requests for
updates and changes, as well as for dealing very professionally with an
inappropriate and incorrect remark. Unfortunately, mailing lists can
make communication difficult and someone's knee-jerk reaction (not
referring to your reaction here) can end up causing much frustration.

Remind me when we're at a conference somewhere and I'll gladly buy you a
beer (or whatever your choice is). Seriously, thanks for working on the
'huge pages' changes and documentation- it's often a thankless job and
clearly one which can be extremely frustrating.

Thanks again,

Stephen

#74Christian Kruse
christian@2ndQuadrant.com
In reply to: Alvaro Herrera (#71)
1 attachment(s)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Hi,

On 26/02/14 13:13, Alvaro Herrera wrote:

There's one thing that rubs me the wrong way about all this
functionality, which is that we've named it "huge TLB pages". That is
wrong -- the TLB pages are not huge. In fact, as far as I understand,
the TLB doesn't have pages at all. It's the pages that are huge, but
those pages are not TLB pages, they are just memory pages.

I didn't think about this, yet, but you are totally right.

Since we haven't released any of this, should we discuss renaming it to
just "huge pages"?

Attached is a patch with the updated documentation (now uses
consistently huge pages) as well as a renamed GUC, consistent wording
(always use huge pages) as well as renamed variables.

Should I create a new commit fest entry for this and delete the old
one? Or should this be done in two patches? Locally in my repo this is
done with two commits, so it would be easy to split that.

Best regards,

--
Christian Kruse http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

Attachments:

huge_tlb_docs_with_renamed_guc_and_variables.patchtext/x-diff; charset=us-asciiDownload
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index cf11306..77c778f 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1166,35 +1166,33 @@ include 'filename'
       </listitem>
      </varlistentry>
 
-     <varlistentry id="guc-huge-tlb-pages" xreflabel="huge_tlb_pages">
-      <term><varname>huge_tlb_pages</varname> (<type>enum</type>)</term>
+     <varlistentry id="guc-huge-pages" xreflabel="huge_pages">
+      <term><varname>huge_pages</varname> (<type>enum</type>)</term>
       <indexterm>
-       <primary><varname>huge_tlb_pages</> configuration parameter</primary>
+       <primary><varname>huge_pages</> configuration parameter</primary>
       </indexterm>
       <listitem>
        <para>
-        Enables/disables the use of huge TLB pages. Valid values are
+        Enables/disables the use of huge pages. Valid values are
         <literal>try</literal> (the default), <literal>on</literal>,
         and <literal>off</literal>.
        </para>
 
        <para>
-        At present, this feature is supported only on Linux. The setting
-        is ignored on other systems.
+        At present, this feature is supported only on Linux. The setting is
+        ignored on other systems when set to <literal>try</literal>.
+        <productname>PostgreSQL</productname> will
+        refuse to start when set to <literal>on</literal>.
        </para>
 
        <para>
-        The use of huge TLB pages results in smaller page tables and
-        less CPU time spent on memory management, increasing performance. For
-        more details, see
-        <ulink url="https://wiki.debian.org/Hugepages">the Debian wiki</ulink>.
-        Remember that you will need at least shared_buffers / huge page size +
-        1 huge TLB pages. So for example for a system with 6GB shared buffers
-        and a hugepage size of 2kb of you will need at least 3156 huge pages.
+        The use of huge pages results in smaller page tables and less CPU time
+        spent on memory management, increasing performance. For more details,
+        see <xref linkend="linux-huge-pages">.
        </para>
 
        <para>
-        With <varname>huge_tlb_pages</varname> set to <literal>try</literal>,
+        With <varname>huge_pages</varname> set to <literal>try</literal>,
         the server will try to use huge pages, but fall back to using
         normal allocation if that fails. With <literal>on</literal>, failure
         to use huge pages will prevent the server from starting up. With
diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml
index bbb808f..7f4a235 100644
--- a/doc/src/sgml/runtime.sgml
+++ b/doc/src/sgml/runtime.sgml
@@ -1307,6 +1307,57 @@ echo -1000 > /proc/self/oom_score_adj
    </para>
    </note>
   </sect2>
+
+  <sect2 id="linux-huge-pages">
+   <title>Linux huge pages</title>
+
+   <para>
+    Using huge pages reduces overhead when using large contiguous chunks of
+    memory, like <productname>PostgreSQL</productname> does. To enable this
+    feature in <productname>PostgreSQL</productname> you need a kernel
+    with <varname>CONFIG_HUGETLBFS=y</varname> and
+    <varname>CONFIG_HUGETLB_PAGE=y</varname>. You also have to tune the system
+    setting <varname>vm.nr_hugepages</varname>. To estimate the number of
+    necessary huge pages start <productname>PostgreSQL</productname> without
+    huge pages enabled and check the <varname>VmPeak</varname> value from the
+    proc filesystem:
+<programlisting>
+$ <userinput>head -1 /path/to/data/directory/postmaster.pid</userinput>
+4170
+$ <userinput>grep ^VmPeak /proc/4170/status</userinput>
+VmPeak:  6490428 kB
+</programlisting>
+     <literal>6490428</literal> / <literal>2048</literal>
+     (<varname>PAGE_SIZE</varname> is <literal>2MB</literal> in this case) are
+     roughly <literal>3169.154</literal> huge pages, so you will need at
+     least <literal>3170</literal> huge pages:
+<programlisting>
+$ <userinput>sysctl -w vm.nr_hugepages=3170</userinput>
+</programlisting>
+    Sometimes the kernel is not able to allocate the desired number of huge
+    pages, so it might be necessary to repeat that command or to reboot. Don't
+    forget to add an entry to <filename>/etc/sysctl.conf</filename> to persist
+    this setting through reboots.
+   </para>
+
+   <para>
+    The default behavior for huge pages in
+    <productname>PostgreSQL</productname> is to use them when possible and
+    to fallback to normal pages when failing. To enforce the use of huge
+    pages, you can set
+    <link linkend="guc-huge-pages"><varname>huge_pages</varname></link>
+    to <literal>on</literal>. Note that in this case
+    <productname>PostgreSQL</productname> will fail to start if not enough huge
+    pages are available.
+   </para>
+
+   <para>
+    For a detailed description of the <productname>Linux</productname> huge
+    pages feature have a look
+    at <ulink url="https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt">https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt</ulink>.
+   </para>
+
+  </sect2>
  </sect1>
 
 
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 65ad595..51c1a2b 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -333,12 +333,12 @@ CreateAnonymousSegment(Size *size)
 	int			mmap_errno = 0;
 
 #ifndef MAP_HUGETLB
-	if (huge_tlb_pages == HUGE_TLB_ON)
+	if (huge_pages == HUGE_PAGES_ON)
 		ereport(ERROR,
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 				 errmsg("huge TLB pages not supported on this platform")));
 #else
-	if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY)
+	if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
 	{
 		/*
 		 * Round up the request size to a suitable large value.
@@ -364,13 +364,13 @@ CreateAnonymousSegment(Size *size)
 		ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
 				   PG_MMAP_FLAGS | MAP_HUGETLB, -1, 0);
 		mmap_errno = errno;
-		if (huge_tlb_pages == HUGE_TLB_TRY && ptr == MAP_FAILED)
+		if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
 			elog(DEBUG1, "mmap with MAP_HUGETLB failed, huge pages disabled: %m");
 	}
 #endif
 
-	if (huge_tlb_pages == HUGE_TLB_OFF ||
-		(huge_tlb_pages == HUGE_TLB_TRY && ptr == MAP_FAILED))
+	if (huge_pages == HUGE_PAGES_OFF ||
+		(huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED))
 	{
 		/*
 		 * use the original size, not the rounded up value, when falling
@@ -431,10 +431,10 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 	Size		sysvsize;
 
 #if defined(EXEC_BACKEND) || !defined(MAP_HUGETLB)
-	if (huge_tlb_pages == HUGE_TLB_ON)
+	if (huge_pages == HUGE_PAGES_ON)
 		ereport(ERROR,
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-				 errmsg("huge TLB pages not supported on this platform")));
+				 errmsg("huge pages not supported on this platform")));
 #endif
 
 	/* Room for a header? */
diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c
index 9b0cceb..dca371c 100644
--- a/src/backend/port/win32_shmem.c
+++ b/src/backend/port/win32_shmem.c
@@ -128,10 +128,10 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 	DWORD		size_high;
 	DWORD		size_low;
 
-	if (huge_tlb_pages == HUGE_TLB_ON)
+	if (huge_pages == HUGE_PAGES_ON)
 		ereport(ERROR,
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-				 errmsg("huge TLB pages not supported on this platform")));
+				 errmsg("huge pages not supported on this platform")));
 
 	/* Room for a header? */
 	Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index b27cb89..c76edb4 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -393,16 +393,16 @@ static const struct config_enum_entry synchronous_commit_options[] = {
  * Although only "on", "off", "try" are documented, we accept all the likely
  * variants of "on" and "off".
  */
-static const struct config_enum_entry huge_tlb_options[] = {
-	{"off", HUGE_TLB_OFF, false},
-	{"on", HUGE_TLB_ON, false},
-	{"try", HUGE_TLB_TRY, false},
-	{"true", HUGE_TLB_ON, true},
-	{"false", HUGE_TLB_OFF, true},
-	{"yes", HUGE_TLB_ON, true},
-	{"no", HUGE_TLB_OFF, true},
-	{"1", HUGE_TLB_ON, true},
-	{"0", HUGE_TLB_OFF, true},
+static const struct config_enum_entry huge_pages_options[] = {
+	{"off", HUGE_PAGES_OFF, false},
+	{"on", HUGE_PAGES_ON, false},
+	{"try", HUGE_PAGES_TRY, false},
+	{"true", HUGE_PAGES_ON, true},
+	{"false", HUGE_PAGES_OFF, true},
+	{"yes", HUGE_PAGES_ON, true},
+	{"no", HUGE_PAGES_OFF, true},
+	{"1", HUGE_PAGES_ON, true},
+	{"0", HUGE_PAGES_OFF, true},
 	{NULL, 0, false}
 };
 
@@ -470,7 +470,7 @@ int			tcp_keepalives_count;
  * This really belongs in pg_shmem.c, but is defined here so that it doesn't
  * need to be duplicated in all the different implementations of pg_shmem.c.
  */
-int			huge_tlb_pages;
+int			huge_pages;
 
 /*
  * These variables are all dummies that don't do anything, except in some
@@ -3497,12 +3497,12 @@ static struct config_enum ConfigureNamesEnum[] =
 	},
 
 	{
-		{"huge_tlb_pages", PGC_POSTMASTER, RESOURCES_MEM,
-			gettext_noop("Use of huge TLB pages on Linux"),
+		{"huge_pages", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Use of huge pages on Linux"),
 			NULL
 		},
-		&huge_tlb_pages,
-		HUGE_TLB_TRY, huge_tlb_options,
+		&huge_pages,
+		HUGE_PAGES_TRY, huge_pages_options,
 		NULL, NULL, NULL
 	},
 
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index ce56059..3629a52 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -115,7 +115,7 @@
 
 #shared_buffers = 32MB			# min 128kB
 					# (change requires restart)
-#huge_tlb_pages = try			# on, off, or try
+#huge_pages = try			# on, off, or try
 					# (change requires restart)
 #temp_buffers = 8MB			# min 800kB
 #max_prepared_transactions = 0		# zero disables the feature
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 0d60729..0dc960b 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -39,15 +39,15 @@ typedef struct PGShmemHeader	/* standard header for all Postgres shmem */
 } PGShmemHeader;
 
 /* GUC variable */
-extern int huge_tlb_pages;
+extern int huge_pages;
 
-/* Possible values for huge_tlb_pages */
+/* Possible values for huge_pages */
 typedef enum
 {
-	HUGE_TLB_OFF,
-	HUGE_TLB_ON,
-	HUGE_TLB_TRY
-} HugeTlbType;
+	HUGE_PAGES_OFF,
+	HUGE_PAGES_ON,
+	HUGE_PAGES_TRY
+} HugePagesType;
 
 #ifndef WIN32
 extern unsigned long UsedShmemSegID;
#75Christian Kruse
christian@2ndQuadrant.com
In reply to: Stephen Frost (#73)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Hi Peter,

thank you for your nice words, much appreciated. I'm sorry that I was
so whiny about this in the last post.

Best regards,

--
Christian Kruse http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

#76Christian Kruse
christian@2ndQuadrant.com
In reply to: Christian Kruse (#75)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Hi,

On 27/02/14 08:35, Christian Kruse wrote:

Hi Peter,

Sorry, Stephen of course – it was definitely too early.

Best regards,

--
Christian Kruse http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

#77Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Christian Kruse (#74)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On 02/27/2014 09:34 AM, Christian Kruse wrote:

Hi,

On 26/02/14 13:13, Alvaro Herrera wrote:

There's one thing that rubs me the wrong way about all this
functionality, which is that we've named it "huge TLB pages". That is
wrong -- the TLB pages are not huge. In fact, as far as I understand,
the TLB doesn't have pages at all. It's the pages that are huge, but
those pages are not TLB pages, they are just memory pages.

I didn't think about this, yet, but you are totally right.

Since we haven't released any of this, should we discuss renaming it to
just "huge pages"?

Attached is a patch with the updated documentation (now uses
consistently huge pages) as well as a renamed GUC, consistent wording
(always use huge pages) as well as renamed variables.

Hmm, I wonder if that could now be misunderstood to have something to do
with the PostgreSQL page size? Maybe add the word "memory" or "operating
system" in the first sentence in the docs, like this: "Enables/disables
the use of huge memory pages".

<para>
At present, this feature is supported only on Linux. The setting is
ignored on other systems when set to <literal>try</literal>.
<productname>PostgreSQL</productname> will
refuse to start when set to <literal>on</literal>.
</para>

Is it clear enough that PostgreSQL will only refuse to start up when
it's set to on, *if the feature's not supported on the platform*?
Perhaps just leave that last sentence out. It's mentioned later that "
With <literal>on</literal>, failure to use huge pages will prevent the
server from starting up.", that's probably enough.

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#78Peter Geoghegan
pg@heroku.com
In reply to: Heikki Linnakangas (#77)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On Fri, Feb 28, 2014 at 9:43 AM, Heikki Linnakangas
<hlinnakangas@vmware.com> wrote:

Hmm, I wonder if that could now be misunderstood to have something to do
with the PostgreSQL page size? Maybe add the word "memory" or "operating
system" in the first sentence in the docs, like this: "Enables/disables the
use of huge memory pages".

Whenever I wish to emphasize that distinction, I tend to use the term
"MMU pages".

--
Peter Geoghegan

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#79Christian Kruse
christian@2ndQuadrant.com
In reply to: Heikki Linnakangas (#77)
1 attachment(s)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Hi,

Attached is a patch with the updated documentation (now uses
consistently huge pages) as well as a renamed GUC, consistent wording
(always use huge pages) as well as renamed variables.

Hmm, I wonder if that could now be misunderstood to have something to do
with the PostgreSQL page size? Maybe add the word "memory" or "operating
system" in the first sentence in the docs, like this: "Enables/disables the
use of huge memory pages".

Accepted, see attached patch.

<para>
At present, this feature is supported only on Linux. The setting is
ignored on other systems when set to <literal>try</literal>.
<productname>PostgreSQL</productname> will
refuse to start when set to <literal>on</literal>.
</para>

Is it clear enough that PostgreSQL will only refuse to start up when it's
set to on, *if the feature's not supported on the platform*? Perhaps just
leave that last sentence out. It's mentioned later that " With
<literal>on</literal>, failure to use huge pages will prevent the server
from starting up.", that's probably enough.

Fixed.

Best regards,

--
Christian Kruse http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

Attachments:

hugepages-v9.patchtext/x-diff; charset=us-asciiDownload
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index fa9ee37..065c1db 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1166,14 +1166,14 @@ include 'filename'
       </listitem>
      </varlistentry>
 
-     <varlistentry id="guc-huge-tlb-pages" xreflabel="huge_tlb_pages">
-      <term><varname>huge_tlb_pages</varname> (<type>enum</type>)</term>
+     <varlistentry id="guc-huge-pages" xreflabel="huge_pages">
+      <term><varname>huge_pages</varname> (<type>enum</type>)</term>
       <indexterm>
-       <primary><varname>huge_tlb_pages</> configuration parameter</primary>
+       <primary><varname>huge_pages</> configuration parameter</primary>
       </indexterm>
       <listitem>
        <para>
-        Enables/disables the use of huge TLB pages. Valid values are
+        Enables/disables the use of huge memory pages. Valid values are
         <literal>try</literal> (the default), <literal>on</literal>,
         and <literal>off</literal>.
        </para>
@@ -1181,19 +1181,16 @@ include 'filename'
        <para>
         At present, this feature is supported only on Linux. The setting is
         ignored on other systems when set to <literal>try</literal>.
-        <productname>PostgreSQL</productname> will
-        refuse to start when set to <literal>on</literal>.
        </para>
 
        <para>
-        The use of huge TLB pages results in smaller page tables and
-        less CPU time spent on memory management, increasing performance. For
-        more details, see
-        <xref linkend="linux-huge-tlb-pages">.
+        The use of huge pages results in smaller page tables and less CPU time
+        spent on memory management, increasing performance. For more details,
+        see <xref linkend="linux-huge-pages">.
        </para>
 
        <para>
-        With <varname>huge_tlb_pages</varname> set to <literal>try</literal>,
+        With <varname>huge_pages</varname> set to <literal>try</literal>,
         the server will try to use huge pages, but fall back to using
         normal allocation if that fails. With <literal>on</literal>, failure
         to use huge pages will prevent the server from starting up. With
diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml
index 5f9fa61..7f4a235 100644
--- a/doc/src/sgml/runtime.sgml
+++ b/doc/src/sgml/runtime.sgml
@@ -1308,13 +1308,13 @@ echo -1000 > /proc/self/oom_score_adj
    </note>
   </sect2>
 
-  <sect2 id="linux-huge-tlb-pages">
-   <title>Linux huge TLB pages</title>
+  <sect2 id="linux-huge-pages">
+   <title>Linux huge pages</title>
 
    <para>
-    Using huge TLB pages reduces overhead when using large contiguous chunks
-    of memory, like PostgreSQL does. To enable this feature
-    in <productname>PostgreSQL</productname> you need a kernel
+    Using huge pages reduces overhead when using large contiguous chunks of
+    memory, like <productname>PostgreSQL</productname> does. To enable this
+    feature in <productname>PostgreSQL</productname> you need a kernel
     with <varname>CONFIG_HUGETLBFS=y</varname> and
     <varname>CONFIG_HUGETLB_PAGE=y</varname>. You also have to tune the system
     setting <varname>vm.nr_hugepages</varname>. To estimate the number of
@@ -1345,7 +1345,7 @@ $ <userinput>sysctl -w vm.nr_hugepages=3170</userinput>
     <productname>PostgreSQL</productname> is to use them when possible and
     to fallback to normal pages when failing. To enforce the use of huge
     pages, you can set
-    <link linkend="guc-huge-tlb-pages"><varname>huge_tlb_pages</varname></link>
+    <link linkend="guc-huge-pages"><varname>huge_pages</varname></link>
     to <literal>on</literal>. Note that in this case
     <productname>PostgreSQL</productname> will fail to start if not enough huge
     pages are available.
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 65ad595..51c1a2b 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -333,12 +333,12 @@ CreateAnonymousSegment(Size *size)
 	int			mmap_errno = 0;
 
 #ifndef MAP_HUGETLB
-	if (huge_tlb_pages == HUGE_TLB_ON)
+	if (huge_pages == HUGE_PAGES_ON)
 		ereport(ERROR,
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 				 errmsg("huge TLB pages not supported on this platform")));
 #else
-	if (huge_tlb_pages == HUGE_TLB_ON || huge_tlb_pages == HUGE_TLB_TRY)
+	if (huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY)
 	{
 		/*
 		 * Round up the request size to a suitable large value.
@@ -364,13 +364,13 @@ CreateAnonymousSegment(Size *size)
 		ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
 				   PG_MMAP_FLAGS | MAP_HUGETLB, -1, 0);
 		mmap_errno = errno;
-		if (huge_tlb_pages == HUGE_TLB_TRY && ptr == MAP_FAILED)
+		if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
 			elog(DEBUG1, "mmap with MAP_HUGETLB failed, huge pages disabled: %m");
 	}
 #endif
 
-	if (huge_tlb_pages == HUGE_TLB_OFF ||
-		(huge_tlb_pages == HUGE_TLB_TRY && ptr == MAP_FAILED))
+	if (huge_pages == HUGE_PAGES_OFF ||
+		(huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED))
 	{
 		/*
 		 * use the original size, not the rounded up value, when falling
@@ -431,10 +431,10 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 	Size		sysvsize;
 
 #if defined(EXEC_BACKEND) || !defined(MAP_HUGETLB)
-	if (huge_tlb_pages == HUGE_TLB_ON)
+	if (huge_pages == HUGE_PAGES_ON)
 		ereport(ERROR,
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-				 errmsg("huge TLB pages not supported on this platform")));
+				 errmsg("huge pages not supported on this platform")));
 #endif
 
 	/* Room for a header? */
diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c
index 9b0cceb..dca371c 100644
--- a/src/backend/port/win32_shmem.c
+++ b/src/backend/port/win32_shmem.c
@@ -128,10 +128,10 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port)
 	DWORD		size_high;
 	DWORD		size_low;
 
-	if (huge_tlb_pages == HUGE_TLB_ON)
+	if (huge_pages == HUGE_PAGES_ON)
 		ereport(ERROR,
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-				 errmsg("huge TLB pages not supported on this platform")));
+				 errmsg("huge pages not supported on this platform")));
 
 	/* Room for a header? */
 	Assert(size > MAXALIGN(sizeof(PGShmemHeader)));
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index b27cb89..c76edb4 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -393,16 +393,16 @@ static const struct config_enum_entry synchronous_commit_options[] = {
  * Although only "on", "off", "try" are documented, we accept all the likely
  * variants of "on" and "off".
  */
-static const struct config_enum_entry huge_tlb_options[] = {
-	{"off", HUGE_TLB_OFF, false},
-	{"on", HUGE_TLB_ON, false},
-	{"try", HUGE_TLB_TRY, false},
-	{"true", HUGE_TLB_ON, true},
-	{"false", HUGE_TLB_OFF, true},
-	{"yes", HUGE_TLB_ON, true},
-	{"no", HUGE_TLB_OFF, true},
-	{"1", HUGE_TLB_ON, true},
-	{"0", HUGE_TLB_OFF, true},
+static const struct config_enum_entry huge_pages_options[] = {
+	{"off", HUGE_PAGES_OFF, false},
+	{"on", HUGE_PAGES_ON, false},
+	{"try", HUGE_PAGES_TRY, false},
+	{"true", HUGE_PAGES_ON, true},
+	{"false", HUGE_PAGES_OFF, true},
+	{"yes", HUGE_PAGES_ON, true},
+	{"no", HUGE_PAGES_OFF, true},
+	{"1", HUGE_PAGES_ON, true},
+	{"0", HUGE_PAGES_OFF, true},
 	{NULL, 0, false}
 };
 
@@ -470,7 +470,7 @@ int			tcp_keepalives_count;
  * This really belongs in pg_shmem.c, but is defined here so that it doesn't
  * need to be duplicated in all the different implementations of pg_shmem.c.
  */
-int			huge_tlb_pages;
+int			huge_pages;
 
 /*
  * These variables are all dummies that don't do anything, except in some
@@ -3497,12 +3497,12 @@ static struct config_enum ConfigureNamesEnum[] =
 	},
 
 	{
-		{"huge_tlb_pages", PGC_POSTMASTER, RESOURCES_MEM,
-			gettext_noop("Use of huge TLB pages on Linux"),
+		{"huge_pages", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Use of huge pages on Linux"),
 			NULL
 		},
-		&huge_tlb_pages,
-		HUGE_TLB_TRY, huge_tlb_options,
+		&huge_pages,
+		HUGE_PAGES_TRY, huge_pages_options,
 		NULL, NULL, NULL
 	},
 
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index ce56059..3629a52 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -115,7 +115,7 @@
 
 #shared_buffers = 32MB			# min 128kB
 					# (change requires restart)
-#huge_tlb_pages = try			# on, off, or try
+#huge_pages = try			# on, off, or try
 					# (change requires restart)
 #temp_buffers = 8MB			# min 800kB
 #max_prepared_transactions = 0		# zero disables the feature
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 0d60729..0dc960b 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -39,15 +39,15 @@ typedef struct PGShmemHeader	/* standard header for all Postgres shmem */
 } PGShmemHeader;
 
 /* GUC variable */
-extern int huge_tlb_pages;
+extern int huge_pages;
 
-/* Possible values for huge_tlb_pages */
+/* Possible values for huge_pages */
 typedef enum
 {
-	HUGE_TLB_OFF,
-	HUGE_TLB_ON,
-	HUGE_TLB_TRY
-} HugeTlbType;
+	HUGE_PAGES_OFF,
+	HUGE_PAGES_ON,
+	HUGE_PAGES_TRY
+} HugePagesType;
 
 #ifndef WIN32
 extern unsigned long UsedShmemSegID;
#80Christian Kruse
christian@2ndquadrant.com
In reply to: Peter Geoghegan (#78)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Hi,

On 28/02/14 17:58, Peter Geoghegan wrote:

On Fri, Feb 28, 2014 at 9:43 AM, Heikki Linnakangas
<hlinnakangas@vmware.com> wrote:

Hmm, I wonder if that could now be misunderstood to have something to do
with the PostgreSQL page size? Maybe add the word "memory" or "operating
system" in the first sentence in the docs, like this: "Enables/disables the
use of huge memory pages".

Whenever I wish to emphasize that distinction, I tend to use the term
"MMU pages".

I don't like to deviate that much from Linux terminology, as this may
lead to confusion. And using this term in only one place doesn't seem
to make sense either – the naming would then be inconsistent and thus
lead to confusion, too. Do you agree?

Best regards,

--
Christian Kruse http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

#81Heikki Linnakangas
hlinnakangas@vmware.com
In reply to: Christian Kruse (#79)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

On 03/03/2014 11:34 AM, Christian Kruse wrote:

Hi,

Attached is a patch with the updated documentation (now uses
consistently huge pages) as well as a renamed GUC, consistent wording
(always use huge pages) as well as renamed variables.

Hmm, I wonder if that could now be misunderstood to have something to do
with the PostgreSQL page size? Maybe add the word "memory" or "operating
system" in the first sentence in the docs, like this: "Enables/disables the
use of huge memory pages".

Accepted, see attached patch.

Thanks, committed!

I spotted this in section "17.4.1 Shared Memory and Semaphores":

Linux

The default maximum segment size is 32 MB, and the default maximum total size is 2097152 pages. A page is almost always 4096 bytes except in unusual kernel configurations with "huge pages" (use getconf PAGE_SIZE to verify).

It's not any more wrong now than it's always been, but I don't think
huge pages ever affect PAGE_SIZE... Could I cajole you into rephrasing
that, too?

- Heikki

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#82Christian Kruse
christian@2ndQuadrant.com
In reply to: Heikki Linnakangas (#81)
1 attachment(s)
Re: [PATCH] Use MAP_HUGETLB where supported (v3)

Hi,

On 03/03/14 21:03, Heikki Linnakangas wrote:

I spotted this in section "17.4.1 Shared Memory and Semaphores":

Linux

The default maximum segment size is 32 MB, and the default maximum total size is 2097152 pages. A page is almost always 4096 bytes except in unusual kernel configurations with "huge pages" (use getconf PAGE_SIZE to verify).

It's not any more wrong now than it's always been, but I don't think huge
pages ever affect PAGE_SIZE... Could I cajole you into rephrasing that, too?

Hm… to be honest, I'm not sure how to change that. What about this?

The default maximum segment size is 32 MB, and the
default maximum total size is 2097152
pages. A page is almost always 4096 bytes except in
kernel configurations with <quote>huge pages</quote>
(use <literal>cat /proc/meminfo | grep Hugepagesize</literal>
to verify), but they have to be enabled explicitly via
<xref linkend="guc-huge-pages">. See
<xref linkend="linux-huge-pages"> for details.

I attached a patch doing this change.

Best regards,

--
Christian Kruse http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

Attachments:

shm_docs-v1.patchtext/x-diff; charset=us-asciiDownload
diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml
index 7f4a235..8811097 100644
--- a/doc/src/sgml/runtime.sgml
+++ b/doc/src/sgml/runtime.sgml
@@ -881,9 +881,12 @@ option        SEMMAP=256
        <para>
         The default maximum segment size is 32 MB, and the
         default maximum total size is 2097152
-        pages.  A page is almost always 4096 bytes except in unusual
+        pages.  A page is almost always 4096 bytes except in
         kernel configurations with <quote>huge pages</quote>
-        (use <literal>getconf PAGE_SIZE</literal> to verify).
+        (use <literal>cat /proc/meminfo | grep Hugepagesize</literal>
+        to verify), but they have to be enabled explicitly via
+        <xref linkend="guc-huge-pages">. See
+        <xref linkend="linux-huge-pages"> for details.
        </para>
 
        <para>