From 9712e50d6d15c18ea2c5fcf457972486b0d4ef53 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Tue, 6 May 2025 21:12:21 +0200
Subject: [PATCH v1 1/6] NUMA: interleaving buffers

Ensure shared buffers are allocated from all NUMA nodes, in a balanced
way, instead of just using the node where Postgres initially starts, or
where the kernel decides to migrate the page, etc. With pre-warming
performed by a single backend, this can easily result in severely
unbalanced memory distribution (with most from a single NUMA node).

The kernel would eventually move some of the memory to other nodes
(thanks to zone_reclaim), but that tends to take a long time. So this
patch improves predictability, reduces the time needed for warmup
during benchmarking, etc.  It's less dependent on what the CPU
scheduler does, etc.

Furthermore, the buffers are mapped to NUMA nodes in a deterministic
way, so this also allows further improvements like backends using
buffers from the same NUMA node.

The effect is similar to

     numactl --interleave=all

but there's a number of important differences.

Firstly, it's applied only to shared buffers (and also to descriptors),
not to the whole shared memory segment. It's not clear we'd want to use
interleaving for all parts, storing entries with different sizes and
life cycles (e.g. ProcArray may need different approach).

Secondly, it considers the page and block size, and makes sure not to
split a buffer on different NUMA nodes (which with the regular
interleaving is guaranteed to happen, unless using huge pages). The
patch performs "explicit" interleaving, so that buffers are not split
like this.

The patch maps both buffers and buffer descriptors, so that the buffer
and its buffer descriptor end up on the same NUMA node.

The mapping happens in larger chunks (see choose_chunk_buffers). This is
required to handle buffer descriptors (which are smaller than buffers),
and it should also help to reduce the number of mappings. Most NUMA
systems will use 1GB chunks, unless using very small shared buffers.

Notes:

* The feature is enabled by numa_buffers_interleave GUC (false by default)

* It's not clear we want to enable interleaving for all shared memory.
  We probably want that for shared buffers, but maybe not for ProcArray
  or freelists.

* Similar questions are about huge pages - in general it's a good idea,
  but maybe it's not quite good for ProcArray. It's somewhat separate
  from NUMA, but not entirely because NUMA works on page granularity.
  PGPROC entries are ~8KB, so too large for interleaving with 4K pages,
  as we don't want to split the entry to multiple nodes. But could be
  done explicitly, by specifying which node to use for the pages.

* We could partition ProcArray, with one partition per NUMA node, and
  then at connection time pick a partition from the same node. The process
  could migrate to some other node later, especially for long-lived
  connections, but there's no perfect solution. Maybe we could set
  affinity to cores from the same node, or something like that?
---
 src/backend/storage/buffer/buf_init.c | 384 +++++++++++++++++++++++++-
 src/backend/storage/buffer/bufmgr.c   |   1 +
 src/backend/utils/init/globals.c      |   3 +
 src/backend/utils/misc/guc_tables.c   |  10 +
 src/bin/pgbench/pgbench.c             |  67 ++---
 src/include/miscadmin.h               |   2 +
 src/include/storage/bufmgr.h          |   1 +
 7 files changed, 427 insertions(+), 41 deletions(-)

diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index ed1dc488a42..2ad34624c49 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -14,9 +14,17 @@
  */
 #include "postgres.h"
 
+#ifdef USE_LIBNUMA
+#include <numa.h>
+#include <numaif.h>
+#endif
+
+#include "port/pg_numa.h"
 #include "storage/aio.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
+#include "storage/pg_shmem.h"
+#include "storage/proc.h"
 
 BufferDescPadded *BufferDescriptors;
 char	   *BufferBlocks;
@@ -25,6 +33,19 @@ WritebackContext BackendWritebackContext;
 CkptSortItem *CkptBufferIds;
 
 
+static Size get_memory_page_size(void);
+static int64 choose_chunk_buffers(int NBuffers, Size mem_page_size, int num_nodes);
+static void pg_numa_interleave_memory(char *startptr, char *endptr,
+									  Size mem_page_size, Size chunk_size,
+									  int num_nodes);
+
+/* number of buffers allocated on the same NUMA node */
+static int64 numa_chunk_buffers = -1;
+
+/* number of NUMA nodes (as returned by numa_num_configured_nodes) */
+static int	numa_nodes = -1;
+
+
 /*
  * Data Structures:
  *		buffers live in a freelist and a lookup data structure.
@@ -71,18 +92,80 @@ BufferManagerShmemInit(void)
 				foundDescs,
 				foundIOCV,
 				foundBufCkpt;
+	Size		mem_page_size;
+	Size		buffer_align;
+
+	/*
+	 * XXX A bit weird. Do we need to worry about postmaster? Could this even
+	 * run outside postmaster? I don't think so.
+	 *
+	 * XXX Another issue is we may get different values than when sizing the
+	 * memory, because at that point we didn't know if we get huge pages,
+	 * so we assumed we will. Shouldn't cause crashes, but we might allocate
+	 * shared memory and then not use some of it (because of the alignment
+	 * that we don't actually need). Not sure about better way, good for now.
+	 */
+	if (IsUnderPostmaster)
+		mem_page_size = pg_get_shmem_pagesize();
+	else
+		mem_page_size = get_memory_page_size();
+
+	/*
+	 * With NUMA we need to ensure the buffers are properly aligned not just
+	 * to PG_IO_ALIGN_SIZE, but also to memory page size, because NUMA works
+	 * on page granularity, and we don't want a buffer to get split to
+	 * multiple nodes (when using multiple memory pages).
+	 *
+	 * We also don't want to interfere with other parts of shared memory,
+	 * which could easily happen with huge pages (e.g. with data stored before
+	 * buffers).
+	 *
+	 * We do this by aligning to the larger of the two values (we know both
+	 * are power-of-two values, so the larger value is automatically a
+	 * multiple of the lesser one).
+	 *
+	 * XXX Maybe there's a way to use less alignment?
+	 *
+	 * XXX Maybe with (mem_page_size > PG_IO_ALIGN_SIZE), we don't need to
+	 * align to mem_page_size? Especially for very large huge pages (e.g. 1GB)
+	 * that doesn't seem quite worth it. Maybe we should simply align to
+	 * BLCKSZ, so that buffers don't get split? Still, we might interfere with
+	 * other stuff stored in shared memory that we want to allocate on a
+	 * particular NUMA node (e.g. ProcArray).
+	 *
+	 * XXX Maybe with "too large" huge pages we should just not do this, or
+	 * maybe do this only for sufficiently large areas (e.g. shared buffers,
+	 * but not ProcArray).
+	 */
+	buffer_align = Max(mem_page_size, PG_IO_ALIGN_SIZE);
+
+	/* one page is a multiple of the other */
+	Assert(((mem_page_size % PG_IO_ALIGN_SIZE) == 0) ||
+		   ((PG_IO_ALIGN_SIZE % mem_page_size) == 0));
 
-	/* Align descriptors to a cacheline boundary. */
+	/*
+	 * Align descriptors to a cacheline boundary, and memory page.
+	 *
+	 * We want to distribute both to NUMA nodes, so that each buffer and its
+	 * descriptor are on the same NUMA node. So we align both the same way.
+	 *
+	 * XXX The memory page is always larger than cacheline, so the cacheline
+	 * reference is a bit unnecessary.
+	 *
+	 * XXX In principle we only need to do this with NUMA, otherwise we could
+	 * still align just to cacheline, as before.
+	 */
 	BufferDescriptors = (BufferDescPadded *)
-		ShmemInitStruct("Buffer Descriptors",
-						NBuffers * sizeof(BufferDescPadded),
-						&foundDescs);
+		TYPEALIGN(buffer_align,
+				  ShmemInitStruct("Buffer Descriptors",
+								  NBuffers * sizeof(BufferDescPadded) + buffer_align,
+								  &foundDescs));
 
 	/* Align buffer pool on IO page size boundary. */
 	BufferBlocks = (char *)
-		TYPEALIGN(PG_IO_ALIGN_SIZE,
+		TYPEALIGN(buffer_align,
 				  ShmemInitStruct("Buffer Blocks",
-								  NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE,
+								  NBuffers * (Size) BLCKSZ + buffer_align,
 								  &foundBufs));
 
 	/* Align condition variables to cacheline boundary. */
@@ -112,6 +195,63 @@ BufferManagerShmemInit(void)
 	{
 		int			i;
 
+		/*
+		 * Assign chunks of buffers and buffer descriptors to the available
+		 * NUMA nodes. We can't use the regular interleaving, because with
+		 * regular memory pages (smaller than BLCKSZ) we'd split all buffers
+		 * to multiple NUMA nodes. And we don't want that.
+		 *
+		 * But even with huge pages it seems like a good idea to not have
+		 * mapping for each page.
+		 *
+		 * So we always assign a larger contiguous chunk of buffers to the
+		 * same NUMA node, as calculated by choose_chunk_buffers(). We try to
+		 * keep the chunks large enough to work both for buffers and buffer
+		 * descriptors, but not too large. See the comments at
+		 * choose_chunk_buffers() for details.
+		 *
+		 * Thanks to the earlier alignment (to memory page etc.), we know the
+		 * buffers won't get split, etc.
+		 *
+		 * This also makes it easier / straightforward to calculate which NUMA
+		 * node a buffer belongs to (it's a matter of divide + mod). See
+		 * BufferGetNode().
+		 */
+		if (numa_buffers_interleave)
+		{
+			char	   *startptr,
+					   *endptr;
+			Size		chunk_size;
+
+			numa_nodes = numa_num_configured_nodes();
+
+			numa_chunk_buffers
+				= choose_chunk_buffers(NBuffers, mem_page_size, numa_nodes);
+
+			elog(LOG, "BufferManagerShmemInit num_nodes %d chunk_buffers %ld",
+				 numa_nodes, numa_chunk_buffers);
+
+			/* first map buffers */
+			startptr = BufferBlocks;
+			endptr = startptr + ((Size) NBuffers) * BLCKSZ;
+			chunk_size = (numa_chunk_buffers * BLCKSZ);
+
+			pg_numa_interleave_memory(startptr, endptr,
+									  mem_page_size,
+									  chunk_size,
+									  numa_nodes);
+
+			/* now do the same for buffer descriptors */
+			startptr = (char *) BufferDescriptors;
+			endptr = startptr + ((Size) NBuffers) * sizeof(BufferDescPadded);
+			chunk_size = (numa_chunk_buffers * sizeof(BufferDescPadded));
+
+			pg_numa_interleave_memory(startptr, endptr,
+									  mem_page_size,
+									  chunk_size,
+									  numa_nodes);
+		}
+
 		/*
 		 * Initialize all the buffer headers.
 		 */
@@ -144,6 +284,11 @@ BufferManagerShmemInit(void)
 		GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST;
 	}
 
+	/*
+	 * As this point we have all the buffers in a single long freelist. With
+	 * freelist partitioning we rebuild them in StrategyInitialize.
+	 */
+
 	/* Init other shared buffer-management stuff */
 	StrategyInitialize(!foundDescs);
 
@@ -152,24 +297,72 @@ BufferManagerShmemInit(void)
 						 &backend_flush_after);
 }
 
+/*
+ * Determine the size of memory page.
+ *
+ * XXX This is a bit tricky, because the result depends at which point we call
+ * this. Before the allocation we don't know if we succeed in allocating huge
+ * pages - but we have to size everything for the chance that we will. And then
+ * if the huge pages fail (with 'huge_pages=try'), we'll use the regular memory
+ * pages. But at that point we can't adjust the sizing.
+ *
+ * XXX Maybe with huge_pages=try we should do the sizing twice - first with
+ * huge pages, and if that fails, then without them. But not for this patch.
+ * Up to this point there was no such dependency on huge pages.
+ */
+static Size
+get_memory_page_size(void)
+{
+	Size		os_page_size;
+	Size		huge_page_size;
+
+#ifdef WIN32
+	SYSTEM_INFO sysinfo;
+
+	GetSystemInfo(&sysinfo);
+	os_page_size = sysinfo.dwPageSize;
+#else
+	os_page_size = sysconf(_SC_PAGESIZE);
+#endif
+
+	/* assume huge pages get used, unless HUGE_PAGES_OFF */
+	if (huge_pages_status != HUGE_PAGES_OFF)
+		GetHugePageSize(&huge_page_size, NULL);
+	else
+		huge_page_size = 0;
+
+	return Max(os_page_size, huge_page_size);
+}
+
 /*
  * BufferManagerShmemSize
  *
  * compute the size of shared memory for the buffer pool including
  * data pages, buffer descriptors, hash tables, etc.
+ *
+ * XXX Called before allocation, so we don't know if huge pages get used yet.
+ * So we need to assume huge pages get used, and use get_memory_page_size()
+ * to calculate the largest possible memory page.
  */
 Size
 BufferManagerShmemSize(void)
 {
 	Size		size = 0;
+	Size		mem_page_size;
+
+	/* XXX why does IsUnderPostmaster matter? */
+	if (IsUnderPostmaster)
+		mem_page_size = pg_get_shmem_pagesize();
+	else
+		mem_page_size = get_memory_page_size();
 
 	/* size of buffer descriptors */
 	size = add_size(size, mul_size(NBuffers, sizeof(BufferDescPadded)));
 	/* to allow aligning buffer descriptors */
-	size = add_size(size, PG_CACHE_LINE_SIZE);
+	size = add_size(size, Max(mem_page_size, PG_IO_ALIGN_SIZE));
 
 	/* size of data pages, plus alignment padding */
-	size = add_size(size, PG_IO_ALIGN_SIZE);
+	size = add_size(size, Max(mem_page_size, PG_IO_ALIGN_SIZE));
 	size = add_size(size, mul_size(NBuffers, BLCKSZ));
 
 	/* size of stuff controlled by freelist.c */
@@ -186,3 +379,178 @@ BufferManagerShmemSize(void)
 
 	return size;
 }
+
+/*
+ * choose_chunk_buffers
+ *		choose the number of buffers allocated to a NUMA node at once
+ *
+ * We don't map shared buffers to NUMA nodes one by one, but in larger chunks.
+ * This is both for efficiency reasons (fewer mappings), and also because we
+ * want to map buffer descriptors too - and descriptors are much smaller. So
+ * we pick a number that's high enough for descriptors to use whole pages.
+ *
+ * We also want to keep buffers somehow evenly distributed on nodes, with
+ * about NBuffers/nodes per node. So we don't use chunks larger than this,
+ * to keep it as fair as possible (the chunk size is a possible difference
+ * between memory allocated to different NUMA nodes).
+ *
+ * It's possible shared buffers are so small this is not possible (i.e.
+ * it's less than chunk_size). But sensible NUMA systems will use a lot
+ * of memory, so this is unlikely.
+ *
+ * We simply print a warning about the misbalance, and that's it.
+ *
+ * XXX It'd be good to ensure the chunk size is a power-of-2, because then
+ * we could calculate the NUMA node simply by shift/modulo, while now we
+ * have to do a division. But we don't know how many buffers and buffer
+ * descriptors fits into a memory page. It may not be a power-of-2.
+ */
+static int64
+choose_chunk_buffers(int NBuffers, Size mem_page_size, int num_nodes)
+{
+	int64		num_items;
+	int64		max_items;
+
+	/* make sure the chunks will align nicely */
+	Assert(BLCKSZ % sizeof(BufferDescPadded) == 0);
+	Assert(mem_page_size % sizeof(BufferDescPadded) == 0);
+	Assert(((BLCKSZ % mem_page_size) == 0) || ((mem_page_size % BLCKSZ) == 0));
+
+	/*
+	 * The minimum number of items to fill a memory page with descriptors and
+	 * blocks. The NUMA allocates memory in pages, and we need to do that for
+	 * both buffers and descriptors.
+	 *
+	 * In practice the BLCKSZ doesn't really matter, because it's much larger
+ * than BufferDescPadded, so the result is determined by buffer descriptors.
+	 * But it's clearer this way.
+	 */
+	num_items = Max(mem_page_size / sizeof(BufferDescPadded),
+					mem_page_size / BLCKSZ);
+
+	/*
+	 * We shouldn't use chunks larger than NBuffers/num_nodes, because with
+	 * larger chunks the last NUMA node would end up with much less memory (or
+	 * no memory at all).
+	 */
+	max_items = (NBuffers / num_nodes);
+
+	/*
+	 * Did we already exceed the maximum desirable chunk size? That is, will
+	 * the last node get less than one whole chunk (or no memory at all)?
+	 */
+	if (num_items > max_items)
+		elog(WARNING, "choose_chunk_buffers: chunk items exceeds max (%ld > %ld)",
+			 num_items, max_items);
+
+	/* grow the chunk size until we hit the max limit. */
+	while (2 * num_items <= max_items)
+		num_items *= 2;
+
+	/*
+	 * XXX It's not difficult to construct cases where we end up with not
+	 * quite balanced distribution. For example, with shared_buffers=10GB and
+	 * 4 NUMA nodes, we end up with 2GB chunks, which means the first node
+	 * gets 4GB, and the three other nodes get 2GB each.
+	 *
+	 * We could be smarter, and try to get more balanced distribution. We
+	 * could simply reduce max_items e.g. to
+	 *
+	 * max_items = (NBuffers / num_nodes) / 4;
+	 *
+	 * in which cases we'd end up with 512MB chunks, and each nodes would get
+	 * the same 2.5GB chunk. It may not always work out this nicely, but it's
+	 * better than with (NBuffers / num_nodes).
+	 *
+	 * Alternatively, we could "backtrack" - try with the large max_items,
+	 * check how balanced it is, and if it's too imbalanced, try with a
+	 * smaller one.
+	 *
+	 * We however want a simple scheme.
+	 */
+
+	return num_items;
+}
+
+/*
+ * Calculate the NUMA node for a given buffer.
+ */
+int
+BufferGetNode(Buffer buffer)
+{
+	/* not NUMA interleaving */
+	if (numa_chunk_buffers == -1)
+		return -1;
+
+	return (buffer / numa_chunk_buffers) % numa_nodes;
+}
+
+/*
+ * pg_numa_interleave_memory
+ *		move memory to different NUMA nodes in larger chunks
+ *
+ * startptr - start of the region (should be aligned to page size)
+ * endptr - end of the region (doesn't need to be aligned)
+ * mem_page_size - size of the memory page size
+ * chunk_size - size of the chunk to move to a single node (should be multiple
+ *              of page size)
+ * num_nodes - number of nodes to allocate memory to
+ *
+ * XXX Maybe this should use numa_tonode_memory and numa_police_memory instead?
+ * That might be more efficient than numa_move_pages, as it works on larger
+ * chunks of memory, not individual system pages, I think.
+ *
+ * XXX The "interleave" name is not quite accurate, I guess.
+ */
+static void
+pg_numa_interleave_memory(char *startptr, char *endptr,
+						  Size mem_page_size, Size chunk_size,
+						  int num_nodes)
+{
+	volatile uint64 touch pg_attribute_unused();
+	char	   *ptr = startptr;
+
+	/* chunk size has to be a multiple of memory page */
+	Assert((chunk_size % mem_page_size) == 0);
+
+	/*
+	 * Walk the memory pages in the range, and determine the node for each
+	 * one. We use numa_tonode_memory(), because then we can move a whole
+	 * memory range to the node, we don't need to worry about individual pages
+	 * like with numa_move_pages().
+	 */
+	while (ptr < endptr)
+	{
+		/* We may have an incomplete chunk at the end. */
+		Size		sz = Min(chunk_size, (endptr - ptr));
+
+		/*
+		 * What NUMA node does this range belong to? Each chunk should go to
+		 * the same NUMA node, in a round-robin manner.
+		 */
+		int			node = ((ptr - startptr) / chunk_size) % num_nodes;
+
+		/*
+		 * Both the chunk start and its size are aligned to the memory page
+		 * (thanks to the buffer_align logic earlier), so no buffer or
+		 * descriptor gets split across NUMA nodes, and we always hand
+		 * whole memory pages to numa_tonode_memory().
+		 */
+		Assert((int64) ptr % mem_page_size == 0);
+		Assert((sz % mem_page_size) == 0);
+
+		/*
+		 * XXX no return value, to make this fail on error, has to use
+		 * numa_set_strict
+		 *
+		 * XXX Should we still touch the memory first, like with numa_move_pages,
+		 * or is that not necessary?
+		 */
+		numa_tonode_memory(ptr, sz, node);
+
+		ptr += sz;
+	}
+
+	/* should have processed all chunks */
+	Assert(ptr == endptr);
+}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 406ce77693c..e1e1cfd379d 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -685,6 +685,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN
 	BufferDesc *bufHdr;
 	BufferTag	tag;
 	uint32		buf_state;
+
 	Assert(BufferIsValid(recent_buffer));
 
 	ResourceOwnerEnlarge(CurrentResourceOwner);
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index d31cb45a058..876cb64cf66 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -145,6 +145,9 @@ int			max_worker_processes = 8;
 int			max_parallel_workers = 8;
 int			MaxBackends = 0;
 
+/* NUMA stuff */
+bool		numa_buffers_interleave = false;
+
 /* GUC parameters for vacuum */
 int			VacuumBufferUsageLimit = 2048;
 
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 511dc32d519..198a57e70a5 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -2116,6 +2116,16 @@ struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"numa_buffers_interleave", PGC_POSTMASTER, DEVELOPER_OPTIONS,
+			gettext_noop("Enables NUMA interleaving of shared buffers."),
+			gettext_noop("When enabled, the buffers in shared memory are interleaved to all NUMA nodes."),
+		},
+		&numa_buffers_interleave,
+		false,
+		NULL, NULL, NULL
+	},
+
 	{
 		{"sync_replication_slots", PGC_SIGHUP, REPLICATION_STANDBY,
 			gettext_noop("Enables a physical standby to synchronize logical failover replication slots from the primary server."),
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index 69b6a877dc9..c07de903f76 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -305,7 +305,7 @@ static const char *progname;
 #define	CPU_PINNING_RANDOM		1
 #define	CPU_PINNING_COLOCATED	2
 
-static int pinning_mode = CPU_PINNING_NONE;
+static int	pinning_mode = CPU_PINNING_NONE;
 
 #define WSEP '@'				/* weight separator */
 
@@ -874,20 +874,20 @@ static bool socket_has_input(socket_set *sa, int fd, int idx);
  */
 typedef struct cpu_generator_state
 {
-	int		ncpus;		/* number of CPUs available */
-	int		nitems;		/* number of items in the queue */
-	int	   *nthreads;	/* number of threads for each CPU */
-	int	   *nclients;	/* number of processes for each CPU */
-	int	   *items;		/* queue of CPUs to pick from */
-} cpu_generator_state;
+	int			ncpus;			/* number of CPUs available */
+	int			nitems;			/* number of items in the queue */
+	int		   *nthreads;		/* number of threads for each CPU */
+	int		   *nclients;		/* number of processes for each CPU */
+	int		   *items;			/* queue of CPUs to pick from */
+}			cpu_generator_state;
 
 static cpu_generator_state cpu_generator_init(int ncpus);
-static void cpu_generator_refill(cpu_generator_state *state);
-static void cpu_generator_reset(cpu_generator_state *state);
-static int cpu_generator_thread(cpu_generator_state *state);
-static int cpu_generator_client(cpu_generator_state *state, int thread_cpu);
-static void cpu_generator_print(cpu_generator_state *state);
-static bool cpu_generator_check(cpu_generator_state *state);
+static void cpu_generator_refill(cpu_generator_state * state);
+static void cpu_generator_reset(cpu_generator_state * state);
+static int	cpu_generator_thread(cpu_generator_state * state);
+static int	cpu_generator_client(cpu_generator_state * state, int thread_cpu);
+static void cpu_generator_print(cpu_generator_state * state);
+static bool cpu_generator_check(cpu_generator_state * state);
 
 static void reset_pinning(TState *threads, int nthreads);
 
@@ -7422,7 +7422,7 @@ main(int argc, char **argv)
 	/* try to assign threads/clients to CPUs */
 	if (pinning_mode != CPU_PINNING_NONE)
 	{
-		int nprocs = get_nprocs();
+		int			nprocs = get_nprocs();
 		cpu_generator_state state = cpu_generator_init(nprocs);
 
 retry:
@@ -7433,6 +7433,7 @@ retry:
 		for (i = 0; i < nthreads; i++)
 		{
 			TState	   *thread = &threads[i];
+
 			thread->cpu = cpu_generator_thread(&state);
 		}
 
@@ -7444,7 +7445,7 @@ retry:
 		while (true)
 		{
 			/* did we find any unassigned backend? */
-			bool found = false;
+			bool		found = false;
 
 			for (i = 0; i < nthreads; i++)
 			{
@@ -7678,10 +7679,10 @@ threadRun(void *arg)
 		/* determine PID of the backend, pin it to the same CPU */
 		for (int i = 0; i < nstate; i++)
 		{
-			char   *pid_str;
-			pid_t	pid;
+			char	   *pid_str;
+			pid_t		pid;
 
-			PGresult *res = PQexec(state[i].con, "select pg_backend_pid()");
+			PGresult   *res = PQexec(state[i].con, "select pg_backend_pid()");
 
 			if (PQresultStatus(res) != PGRES_TUPLES_OK)
 				pg_fatal("could not determine PID of the backend for client %d",
@@ -8184,7 +8185,7 @@ cpu_generator_init(int ncpus)
 {
 	struct timeval tv;
 
-	cpu_generator_state	state;
+	cpu_generator_state state;
 
 	state.ncpus = ncpus;
 
@@ -8207,7 +8208,7 @@ cpu_generator_init(int ncpus)
 }
 
 static void
-cpu_generator_refill(cpu_generator_state *state)
+cpu_generator_refill(cpu_generator_state * state)
 {
 	struct timeval tv;
 
@@ -8223,7 +8224,7 @@ cpu_generator_refill(cpu_generator_state *state)
 }
 
 static void
-cpu_generator_reset(cpu_generator_state *state)
+cpu_generator_reset(cpu_generator_state * state)
 {
 	state->nitems = 0;
 	cpu_generator_refill(state);
@@ -8236,15 +8237,15 @@ cpu_generator_reset(cpu_generator_state *state)
 }
 
 static int
-cpu_generator_thread(cpu_generator_state *state)
+cpu_generator_thread(cpu_generator_state * state)
 {
 	if (state->nitems == 0)
 		cpu_generator_refill(state);
 
 	while (true)
 	{
-		int idx = lrand48() % state->nitems;
-		int cpu = state->items[idx];
+		int			idx = lrand48() % state->nitems;
+		int			cpu = state->items[idx];
 
 		state->items[idx] = state->items[state->nitems - 1];
 		state->nitems--;
@@ -8256,10 +8257,10 @@ cpu_generator_thread(cpu_generator_state *state)
 }
 
 static int
-cpu_generator_client(cpu_generator_state *state, int thread_cpu)
+cpu_generator_client(cpu_generator_state * state, int thread_cpu)
 {
-	int		min_clients;
-	bool	has_valid_cpus = false;
+	int			min_clients;
+	bool		has_valid_cpus = false;
 
 	for (int i = 0; i < state->nitems; i++)
 	{
@@ -8284,8 +8285,8 @@ cpu_generator_client(cpu_generator_state *state, int thread_cpu)
 
 	while (true)
 	{
-		int idx = lrand48() % state->nitems;
-		int cpu = state->items[idx];
+		int			idx = lrand48() % state->nitems;
+		int			cpu = state->items[idx];
 
 		if (cpu == thread_cpu)
 			continue;
@@ -8303,7 +8304,7 @@ cpu_generator_client(cpu_generator_state *state, int thread_cpu)
 }
 
 static void
-cpu_generator_print(cpu_generator_state *state)
+cpu_generator_print(cpu_generator_state * state)
 {
 	for (int i = 0; i < state->ncpus; i++)
 	{
@@ -8312,10 +8313,10 @@ cpu_generator_print(cpu_generator_state *state)
 }
 
 static bool
-cpu_generator_check(cpu_generator_state *state)
+cpu_generator_check(cpu_generator_state * state)
 {
-	int	min_count = INT_MAX,
-		max_count = 0;
+	int			min_count = INT_MAX,
+				max_count = 0;
 
 	for (int i = 0; i < state->ncpus; i++)
 	{
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 1bef98471c3..014a6079af2 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -178,6 +178,8 @@ extern PGDLLIMPORT int MaxConnections;
 extern PGDLLIMPORT int max_worker_processes;
 extern PGDLLIMPORT int max_parallel_workers;
 
+extern PGDLLIMPORT bool numa_buffers_interleave;
+
 extern PGDLLIMPORT int commit_timestamp_buffers;
 extern PGDLLIMPORT int multixact_member_buffers;
 extern PGDLLIMPORT int multixact_offset_buffers;
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 41fdc1e7693..c257c8a1c20 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -319,6 +319,7 @@ extern void EvictRelUnpinnedBuffers(Relation rel,
 /* in buf_init.c */
 extern void BufferManagerShmemInit(void);
 extern Size BufferManagerShmemSize(void);
+extern int	BufferGetNode(Buffer buffer);
 
 /* in localbuf.c */
 extern void AtProcExit_LocalBuffers(void);
-- 
2.49.0

