From 5cf1af94337523c2dcd6427d70ca5c589942a64c Mon Sep 17 00:00:00 2001
From: Odin Ugedal <odin@ugedal.com>
Date: Sun, 7 Jun 2020 21:04:57 +0200
Subject: [PATCH v2] Add support for choosing huge page size

This adds support for using non-default huge page sizes for shared
memory. This is achieved via the new "huge_page_size" config entry.
The config value defaults to 0, meaning it will use the system default.
---
 doc/src/sgml/config.sgml                      | 27 ++++++++
 doc/src/sgml/runtime.sgml                     | 41 +++++++-----
 src/backend/port/sysv_shmem.c                 | 67 ++++++++++++++-----
 src/backend/utils/misc/guc.c                  | 11 +++
 src/backend/utils/misc/postgresql.conf.sample |  2 +
 src/include/storage/pg_shmem.h                |  1 +
 6 files changed, 118 insertions(+), 31 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index aca8f73a50..42f06a41cb 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1582,6 +1582,33 @@ include_dir 'conf.d'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-huge-page-size" xreflabel="huge_page_size">
+      <term><varname>huge_page_size</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>huge_page_size</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Controls what size of huge pages is used in conjunction with
+        <xref linkend="guc-huge-pages"/>.
+        The default is zero (<literal>0</literal>).
+        When set to <literal>0</literal>, the default huge page size on the system will
+        be used.
+       </para>
+       <para>
+        Some commonly available page sizes on modern 64-bit server architectures include:
+        <literal>2MB</literal> and <literal>1GB</literal> (Intel and AMD), <literal>16MB</literal> and
+        <literal>16GB</literal> (IBM POWER), and <literal>64kB</literal>, <literal>2MB</literal>,
+        <literal>32MB</literal> and <literal>1GB</literal> (ARM). For more information
+        about usage and support, see <xref linkend="linux-huge-pages"/>.
+       </para>
+       <para>
+        Controlling huge page size is currently not supported on Windows.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-temp-buffers" xreflabel="temp_buffers">
       <term><varname>temp_buffers</varname> (<type>integer</type>)
       <indexterm>
diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml
index 88210c4a5d..cbdbcb4fdf 100644
--- a/doc/src/sgml/runtime.sgml
+++ b/doc/src/sgml/runtime.sgml
@@ -1391,41 +1391,50 @@ export PG_OOM_ADJUST_VALUE=0
     using large values of <xref linkend="guc-shared-buffers"/>.  To use this
     feature in <productname>PostgreSQL</productname> you need a kernel
     with <varname>CONFIG_HUGETLBFS=y</varname> and
-    <varname>CONFIG_HUGETLB_PAGE=y</varname>. You will also have to adjust
-    the kernel setting <varname>vm.nr_hugepages</varname>. To estimate the
-    number of huge pages needed, start <productname>PostgreSQL</productname>
-    without huge pages enabled and check the
-    postmaster's anonymous shared memory segment size, as well as the system's
-    huge page size, using the <filename>/proc</filename> file system.  This might
-    look like:
+    <varname>CONFIG_HUGETLB_PAGE=y</varname>. You will also have to pre-allocate
+    huge pages with the desired huge page size. To estimate the number of
+    huge pages needed, start <productname>PostgreSQL</productname> without huge
+    pages enabled and check the postmaster's anonymous shared memory segment size,
+    as well as the system's supported huge page sizes, using the
+    <filename>/sys</filename> file system.  This might look like:
 <programlisting>
 $ <userinput>head -1 $PGDATA/postmaster.pid</userinput>
 4170
 $ <userinput>pmap 4170 | awk '/rw-s/ &amp;&amp; /zero/ {print $2}'</userinput>
 6490428K
+$ <userinput>ls /sys/kernel/mm/hugepages</userinput>
+hugepages-1048576kB  hugepages-2048kB
+</programlisting>
+
+     You can now choose between the supported sizes, 2MiB and 1GiB in this case.
+     By default <productname>PostgreSQL</productname> will use the default huge
+     page size on the system, but that can be configured via
+     <xref linkend="guc-huge-page-size"/>.
+     The default huge page size can be found with:
+<programlisting>
 $ <userinput>grep ^Hugepagesize /proc/meminfo</userinput>
 Hugepagesize:       2048 kB
 </programlisting>
+
+     For <literal>2MiB</literal>,
      <literal>6490428</literal> / <literal>2048</literal> gives approximately
      <literal>3169.154</literal>, so in this example we need at
      least <literal>3170</literal> huge pages, which we can set with:
 <programlisting>
-$ <userinput>sysctl -w vm.nr_hugepages=3170</userinput>
+$ <userinput>echo 3170 | tee /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages</userinput>
 </programlisting>
     A larger setting would be appropriate if other programs on the machine
-    also need huge pages.  Don't forget to add this setting
-    to <filename>/etc/sysctl.conf</filename> so that it will be reapplied
-    after reboots.
+    also need huge pages. It is also possible to pre-allocate huge pages on boot
+    by adding the kernel parameters <literal>hugepagesz=2M hugepages=3170</literal>.
    </para>
 
    <para>
     Sometimes the kernel is not able to allocate the desired number of huge
-    pages immediately, so it might be necessary to repeat the command or to
-    reboot.  (Immediately after a reboot, most of the machine's memory
-    should be available to convert into huge pages.)  To verify the huge
-    page allocation situation, use:
+    pages immediately due to external fragmentation, so it might be necessary to
+    repeat the command or to reboot. To verify the huge page allocation situation
+    for a given size, use:
 <programlisting>
-$ <userinput>grep Huge /proc/meminfo</userinput>
+$ <userinput>cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages</userinput>
 </programlisting>
    </para>
 
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 198a6985bf..21baffd4f4 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -32,6 +32,7 @@
 #endif
 
 #include "miscadmin.h"
+#include "port/pg_bitutils.h"
 #include "portability/mem.h"
 #include "storage/dsm.h"
 #include "storage/fd.h"
@@ -464,25 +465,15 @@ PGSharedMemoryAttach(IpcMemoryId shmId,
  * hugepage sizes, we might want to think about more invasive strategies,
  * such as increasing shared_buffers to absorb the extra space.
  *
- * Returns the (real or assumed) page size into *hugepagesize,
+ * Returns the (real, assumed or config provided) page size into *hugepagesize,
  * and the hugepage-related mmap flags to use into *mmap_flags.
- *
- * Currently *mmap_flags is always just MAP_HUGETLB.  Someday, on systems
- * that support it, we might OR in additional bits to specify a particular
- * non-default huge page size.
  */
+
+
 static void
 GetHugePageSize(Size *hugepagesize, int *mmap_flags)
 {
-	/*
-	 * If we fail to find out the system's default huge page size, assume it
-	 * is 2MB.  This will work fine when the actual size is less.  If it's
-	 * more, we might get mmap() or munmap() failures due to unaligned
-	 * requests; but at this writing, there are no reports of any non-Linux
-	 * systems being picky about that.
-	 */
-	*hugepagesize = 2 * 1024 * 1024;
-	*mmap_flags = MAP_HUGETLB;
+	Size		default_hugepagesize = 0;
 
 	/*
 	 * System-dependent code to find out the default huge page size.
@@ -491,6 +482,7 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags)
 	 * nnnn kB".  Ignore any failures, falling back to the preset default.
 	 */
 #ifdef __linux__
+
 	{
 		FILE	   *fp = AllocateFile("/proc/meminfo", "r");
 		char		buf[128];
@@ -505,7 +497,7 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags)
 				{
 					if (ch == 'k')
 					{
-						*hugepagesize = sz * (Size) 1024;
+						default_hugepagesize = sz * (Size) 1024;
 						break;
 					}
 					/* We could accept other units besides kB, if needed */
@@ -515,6 +507,51 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags)
 		}
 	}
 #endif							/* __linux__ */
+
+	if (huge_page_size != 0)
+	{
+		/* If huge page size is provided in config we use that size */
+		*hugepagesize = (Size) huge_page_size * 1024;
+	}
+	else if (default_hugepagesize != 0)
+	{
+		*hugepagesize = default_hugepagesize;
+	}
+	else
+	{
+		/*
+		 * If no huge page size is provided in config and we fail to find out
+		 * the system's default huge page size, assume it is 2MB. This will
+		 * work fine when the actual size is less.  If it's more, we might get
+		 * mmap() or munmap() failures due to unaligned requests; but at this
+		 * writing, there are no reports of any non-Linux systems being picky
+		 * about that.
+		 */
+		*hugepagesize = 2 * 1024 * 1024;
+	}
+
+
+	*mmap_flags = MAP_HUGETLB;
+
+	/*
+	 * System-dependent code to configure mmap_flags.
+	 *
+	 * On Linux, configure flags to include page size, since default huge page
+	 * size will be used in case no size is provided.
+	 */
+#ifdef __linux__
+
+	/*
+	 * If the selected huge page size is not the default, add flag to mmap to
+	 * specify it
+	 */
+	if (*hugepagesize != default_hugepagesize)
+	{
+		int			shift = pg_ceil_log2_64(*hugepagesize);
+
+		*mmap_flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
+	}
+#endif							/* __linux__ */
 }
 
 #endif							/* MAP_HUGETLB */
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 2f3e0a70e0..b482c660cf 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -585,6 +585,7 @@ int			ssl_renegotiation_limit;
  * need to be duplicated in all the different implementations of pg_shmem.c.
  */
 int			huge_pages;
+int			huge_page_size;
 
 /*
  * These variables are all dummies that don't do anything, except in some
@@ -2269,6 +2270,16 @@ static struct config_int ConfigureNamesInt[] =
 		1024, 16, INT_MAX / 2,
 		NULL, NULL, NULL
 	},
+	{
+		{"huge_page_size", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("The size of huge page that should be used."),
+			NULL,
+			GUC_UNIT_KB
+		},
+		&huge_page_size,
+		0, 0, INT_MAX,
+		NULL, NULL, NULL
+	},
 
 	{
 		{"temp_buffers", PGC_USERSET, RESOURCES_MEM,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index ac02bd0c00..750d3f6245 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -122,6 +122,8 @@
 					# (change requires restart)
 #huge_pages = try			# on, off, or try
 					# (change requires restart)
+#huge_page_size = 0			# use default huge page size when set to zero
+					# (change requires restart)
 #temp_buffers = 8MB			# min 800kB
 #max_prepared_transactions = 0		# zero disables the feature
 					# (change requires restart)
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 0de26b3427..9992932a00 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -44,6 +44,7 @@ typedef struct PGShmemHeader	/* standard header for all Postgres shmem */
 /* GUC variables */
 extern int	shared_memory_type;
 extern int	huge_pages;
+extern int	huge_page_size;
 
 /* Possible values for huge_pages */
 typedef enum
-- 
2.27.0

