From 6505848ac8359c8c76dfbffc7150b6601ab07601 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Thu, 22 May 2025 18:38:41 +0200
Subject: [PATCH v1 4/6] NUMA: partition buffer freelist

Instead of a single buffer freelist, partition it into multiple smaller
lists, to reduce lock contention, and to spread the buffers over all
NUMA nodes more evenly.

There are four strategies, specified by GUC numa_partition_freelist

* none - single long freelist, should work just like now

* node - one freelist per NUMA node, with only buffers from that node

* cpu - one freelist per CPU

* pid - freelist determined by PID (same number of freelists as 'cpu')

When allocating a buffer, it's taken from the correct freelist (e.g.
same NUMA node).

Note: This is (probably) more important than partitioning ProcArray.
---
 src/backend/storage/buffer/buf_init.c |   4 +-
 src/backend/storage/buffer/freelist.c | 324 +++++++++++++++++++++++---
 src/backend/utils/init/globals.c      |   1 +
 src/backend/utils/misc/guc_tables.c   |  18 ++
 src/include/miscadmin.h               |   1 +
 src/include/storage/bufmgr.h          |   8 +
 6 files changed, 327 insertions(+), 29 deletions(-)

diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 2ad34624c49..920f1a32a8f 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -543,8 +543,8 @@ pg_numa_interleave_memory(char *startptr, char *endptr,
 		 * XXX no return value, to make this fail on error, has to use
 		 * numa_set_strict
 		 *
-		 * XXX Should we still touch the memory first, like with numa_move_pages,
-		 * or is that not necessary?
+		 * XXX Should we still touch the memory first, like with
+		 * numa_move_pages, or is that not necessary?
 		 */
 		numa_tonode_memory(ptr, sz, node);
 
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index e046526c149..c93ec2841c5 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -15,14 +15,41 @@
  */
 #include "postgres.h"
 
+#include <sched.h>
+#include <sys/sysinfo.h>
+
+#ifdef USE_LIBNUMA
+#include <numa.h>
+#include <numaif.h>
+#endif
+
 #include "pgstat.h"
 #include "port/atomics.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
+#include "storage/ipc.h"
 #include "storage/proc.h"
 
 #define INT_ACCESS_ONCE(var)	((int)(*((volatile int *)&(var))))
 
+/*
+ * Represents one freelist partition.
+ */
+typedef struct BufferStrategyFreelist
+{
+	/* Spinlock: protects the values below */
+	slock_t		freelist_lock;
+
+	/*
+	 * XXX Not sure why this needs to be aligned like this. Need to ask
+	 * Andres.
+	 */
+	int			firstFreeBuffer __attribute__((aligned(64)));	/* Head of list of
+																 * unused buffers */
+
+	/* Number of buffers consumed from this list. */
+	uint64		consumed;
+}			BufferStrategyFreelist;
 
 /*
  * The shared freelist control information.
@@ -39,8 +66,6 @@ typedef struct
 	 */
 	pg_atomic_uint32 nextVictimBuffer;
 
-	int			firstFreeBuffer;	/* Head of list of unused buffers */
-
 	/*
 	 * Statistics.  These counters should be wide enough that they can't
 	 * overflow during a single bgwriter cycle.
@@ -51,13 +76,27 @@ typedef struct
 	/*
 	 * Bgworker process to be notified upon activity or -1 if none. See
 	 * StrategyNotifyBgWriter.
+	 *
+	 * XXX Not sure why this needs to be aligned like this. Need to ask
+	 * Andres. Also, shouldn't the alignment be specified after, like for
+	 * "consumed"?
 	 */
-	int			bgwprocno;
+	int			__attribute__((aligned(64))) bgwprocno;
+
+	BufferStrategyFreelist freelists[FLEXIBLE_ARRAY_MEMBER];
 } BufferStrategyControl;
 
 /* Pointers to shared state */
 static BufferStrategyControl *StrategyControl = NULL;
 
+/*
+ * XXX shouldn't this be in BufferStrategyControl? Probably not, we need to
+ * calculate it during sizing, and perhaps it could change before the memory
+ * gets allocated (so we need to remember the values).
+ */
+static int	strategy_nnodes;
+static int	strategy_ncpus;
+
 /*
  * Private (non-shared) state for managing a ring of shared buffers to re-use.
  * This is currently the only kind of BufferAccessStrategy object, but someday
@@ -157,6 +196,90 @@ ClockSweepTick(void)
 	return victim;
 }
 
+/*
+ * ChooseFreeList
+ *		Pick the buffer freelist to use, depending on the CPU and NUMA node.
+ *
+ * Without partitioned freelists (numa_partition_freelist=false), there's only
+ * a single freelist, so use that.
+ *
+ * With partitioned freelists, we have multiple ways to pick the freelist
+ * for the backend:
+ *
+ * - one freelist per CPU, use the freelist for CPU the task executes on
+ *
+ * - one freelist per NUMA node, use the freelist for node task executes on
+ *
+ * - use fixed number of freelists, map processes to lists based on PID
+ *
+ * There may be some other strategies, not sure. The important thing is this
+ * needs to be reflected during initialization, i.e. we need to create the
+ * right number of lists.
+ */
+static BufferStrategyFreelist *
+ChooseFreeList(void)
+{
+	unsigned	cpu;
+	unsigned	node;
+	int			rc;
+
+	int			freelist_idx;
+
+	/* freelist not partitioned, return the first (and only) freelist */
+	if (numa_partition_freelist == FREELIST_PARTITION_NONE)
+		return &StrategyControl->freelists[0];
+
+	/*
+	 * freelist is partitioned, so determine the CPU/NUMA node, and pick a
+	 * list based on that.
+	 */
+	rc = getcpu(&cpu, &node);
+	if (rc != 0)
+		elog(ERROR, "getcpu failed: %m");
+
+	/*
+	 * FIXME This doesn't work well if CPUs are excluded from being run or
+	 * offline. In that case we end up not using some freelists at all, but
+	 * not sure if we need to worry about that. Probably not for now. But
+	 * could that change while the system is running?
+	 *
+	 * XXX Maybe we should somehow detect changes to the list of CPUs, and
+	 * rebuild the lists if that changes? But that seems expensive.
+	 */
+	if (cpu >= (unsigned) strategy_ncpus)
+		elog(ERROR, "cpu out of range: %u >= %d", cpu, strategy_ncpus);
+	else if (node >= (unsigned) strategy_nnodes)
+		elog(ERROR, "node out of range: %u >= %d", node, strategy_nnodes);
+
+	/*
+	 * Pick the freelist, based on CPU, NUMA node or process PID. This matches
+	 * how we built the freelists above.
+	 *
+	 * XXX Can we rely on some of the values (especially strategy_nnodes) to
+	 * be a power-of-2? Then we could replace the modulo with a mask, which is
+	 * likely more efficient.
+	 */
+	switch (numa_partition_freelist)
+	{
+		case FREELIST_PARTITION_CPU:
+			freelist_idx = cpu % strategy_ncpus;
+			break;
+
+		case FREELIST_PARTITION_NODE:
+			freelist_idx = node % strategy_nnodes;
+			break;
+
+		case FREELIST_PARTITION_PID:
+			freelist_idx = MyProcPid % strategy_ncpus;
+			break;
+
+		default:
+			elog(ERROR, "unknown freelist partitioning value");
+	}
+
+	return &StrategyControl->freelists[freelist_idx];
+}
+
 /*
  * have_free_buffer -- a lockless check to see if there is a free buffer in
  *					   buffer pool.
@@ -168,10 +291,13 @@ ClockSweepTick(void)
 bool
 have_free_buffer(void)
 {
-	if (StrategyControl->firstFreeBuffer >= 0)
-		return true;
-	else
-		return false;
+	for (int i = 0; i < strategy_ncpus; i++)
+	{
+		if (StrategyControl->freelists[i].firstFreeBuffer >= 0)
+			return true;
+	}
+
+	return false;
 }
 
 /*
@@ -193,6 +319,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 	int			bgwprocno;
 	int			trycounter;
 	uint32		local_buf_state;	/* to avoid repeated (de-)referencing */
+	BufferStrategyFreelist *freelist;
 
 	*from_ring = false;
 
@@ -259,31 +386,35 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 	 * buffer_strategy_lock not the individual buffer spinlocks, so it's OK to
 	 * manipulate them without holding the spinlock.
 	 */
-	if (StrategyControl->firstFreeBuffer >= 0)
+	freelist = ChooseFreeList();
+	if (freelist->firstFreeBuffer >= 0)
 	{
 		while (true)
 		{
 			/* Acquire the spinlock to remove element from the freelist */
-			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+			SpinLockAcquire(&freelist->freelist_lock);
 
-			if (StrategyControl->firstFreeBuffer < 0)
+			if (freelist->firstFreeBuffer < 0)
 			{
-				SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+				SpinLockRelease(&freelist->freelist_lock);
 				break;
 			}
 
-			buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer);
+			buf = GetBufferDescriptor(freelist->firstFreeBuffer);
 			Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
 
 			/* Unconditionally remove buffer from freelist */
-			StrategyControl->firstFreeBuffer = buf->freeNext;
+			freelist->firstFreeBuffer = buf->freeNext;
 			buf->freeNext = FREENEXT_NOT_IN_LIST;
 
+			/* increment number of buffers we consumed from this list */
+			freelist->consumed++;
+
 			/*
 			 * Release the lock so someone else can access the freelist while
 			 * we check out this buffer.
 			 */
-			SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+			SpinLockRelease(&freelist->freelist_lock);
 
 			/*
 			 * If the buffer is pinned or has a nonzero usage_count, we cannot
@@ -305,7 +436,17 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 		}
 	}
 
-	/* Nothing on the freelist, so run the "clock sweep" algorithm */
+	/*
+	 * Nothing on the freelist, so run the "clock sweep" algorithm
+	 *
+	 * XXX Should we also make this NUMA-aware, to only access buffers from
+	 * the same NUMA node? That'd probably mean we need to make the clock
+	 * sweep NUMA-aware, perhaps by having multiple clock sweeps, each for a
+	 * subset of buffers. But that also means each process could "sweep" only
+	 * a fraction of buffers, even if the other buffers are better candidates
+	 * for eviction. Would that also mean we'd have multiple bgwriters, one
+	 * for each node, or would one bgwriter handle all of that?
+	 */
 	trycounter = NBuffers;
 	for (;;)
 	{
@@ -352,11 +493,22 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 
 /*
  * StrategyFreeBuffer: put a buffer on the freelist
+ *
+ * XXX This calls ChooseFreeList() again, and it might return the freelist to
+ * a different freelist than it was taken from (either by a different backend,
+ * or perhaps even the same backend running on a different CPU). Is that good?
+ * Maybe we should try to balance this somehow, e.g. by choosing a random list,
+ * the shortest one, or something like that? But that breaks the whole idea of
+ * having freelists with buffers from a particular NUMA node.
  */
 void
 StrategyFreeBuffer(BufferDesc *buf)
 {
-	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+	BufferStrategyFreelist *freelist;
+
+	freelist = ChooseFreeList();
+
+	SpinLockAcquire(&freelist->freelist_lock);
 
 	/*
 	 * It is possible that we are told to put something in the freelist that
@@ -364,11 +516,11 @@ StrategyFreeBuffer(BufferDesc *buf)
 	 */
 	if (buf->freeNext == FREENEXT_NOT_IN_LIST)
 	{
-		buf->freeNext = StrategyControl->firstFreeBuffer;
-		StrategyControl->firstFreeBuffer = buf->buf_id;
+		buf->freeNext = freelist->firstFreeBuffer;
+		freelist->firstFreeBuffer = buf->buf_id;
 	}
 
-	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+	SpinLockRelease(&freelist->freelist_lock);
 }
 
 /*
@@ -432,6 +584,42 @@ StrategyNotifyBgWriter(int bgwprocno)
 	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
 }
 
+/* prints some debug info / stats about freelists at shutdown */
+static void
+freelist_before_shmem_exit(int code, Datum arg)
+{
+	/* one freelist per CPU was allocated, regardless of partitioning mode */
+	for (int i = 0; i < strategy_ncpus; i++)
+	{
+		BufferStrategyFreelist *freelist = &StrategyControl->freelists[i];
+		uint64		remain = 0;
+		uint64		actually_free = 0;
+		int			cur = freelist->firstFreeBuffer;
+
+		while (cur >= 0)
+		{
+			uint32		local_buf_state;
+			BufferDesc *buf;
+
+			buf = GetBufferDescriptor(cur);
+
+			remain++;
+
+			local_buf_state = LockBufHdr(buf);
+
+			if (!(local_buf_state & BM_TAG_VALID))
+				actually_free++;
+
+			UnlockBufHdr(buf, local_buf_state);
+
+			cur = buf->freeNext;
+		}
+		elog(LOG, "freelist %d, firstF: %d: consumed: " UINT64_FORMAT ", remain: " UINT64_FORMAT ", actually free: " UINT64_FORMAT,
+			 i,
+			 freelist->firstFreeBuffer,
+			 freelist->consumed,
+			 remain, actually_free);
+	}
+}
 
 /*
  * StrategyShmemSize
@@ -446,11 +634,33 @@ StrategyShmemSize(void)
 {
 	Size		size = 0;
 
+	/* FIXME */
+#ifdef USE_LIBNUMA
+	strategy_ncpus = numa_num_task_cpus();
+	strategy_nnodes = numa_num_task_nodes();
+#else
+	strategy_ncpus = 1;
+	strategy_nnodes = 1;
+#endif
+
+	Assert(strategy_nnodes <= strategy_ncpus);
+
 	/* size of lookup hash table ... see comment in StrategyInitialize */
 	size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));
 
 	/* size of the shared replacement strategy control block */
-	size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));
+	size = add_size(size, MAXALIGN(offsetof(BufferStrategyControl, freelists)));
+
+	/*
+	 * Allocate one freelist per CPU. We might use per-node freelists, but the
+	 * assumption is the number of NUMA nodes is less than the number of CPUs.
+	 *
+	 * FIXME This assumes that we have more CPUs than NUMA nodes, which seems
+	 * like a safe assumption. But maybe we should calculate how many elements
+	 * we actually need, depending on the GUC? Not a huge amount of memory.
+	 */
+	size = add_size(size, MAXALIGN(mul_size(sizeof(BufferStrategyFreelist),
+											strategy_ncpus)));
 
 	return size;
 }
@@ -466,6 +676,7 @@ void
 StrategyInitialize(bool init)
 {
 	bool		found;
+	int			buffers_per_cpu;
 
 	/*
 	 * Initialize the shared buffer lookup hashtable.
@@ -484,23 +695,27 @@ StrategyInitialize(bool init)
 	 */
 	StrategyControl = (BufferStrategyControl *)
 		ShmemInitStruct("Buffer Strategy Status",
-						sizeof(BufferStrategyControl),
+						offsetof(BufferStrategyControl, freelists) +
+						(sizeof(BufferStrategyFreelist) * strategy_ncpus),
 						&found);
 
 	if (!found)
 	{
+		/*
+		 * XXX Calling get_nprocs() may not be quite correct, because some of
+		 * the processors may get disabled, etc.
+		 */
+		int			num_cpus = get_nprocs();
+
 		/*
 		 * Only done once, usually in postmaster
 		 */
 		Assert(init);
 
-		SpinLockInit(&StrategyControl->buffer_strategy_lock);
+		/* register callback to dump some stats on exit */
+		before_shmem_exit(freelist_before_shmem_exit, 0);
 
-		/*
-		 * Grab the whole linked list of free buffers for our strategy. We
-		 * assume it was previously set up by BufferManagerShmemInit().
-		 */
-		StrategyControl->firstFreeBuffer = 0;
+		SpinLockInit(&StrategyControl->buffer_strategy_lock);
 
 		/* Initialize the clock sweep pointer */
 		pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);
@@ -511,6 +726,61 @@ StrategyInitialize(bool init)
 
 		/* No pending notification */
 		StrategyControl->bgwprocno = -1;
+
+		/*
+		 * Rebuild the freelist - right now all buffers are in one huge list,
+		 * we want to rework that into multiple lists. Start by initializing
+		 * the strategy to have empty lists.
+		 */
+		for (int nfreelist = 0; nfreelist < strategy_ncpus; nfreelist++)
+		{
+			BufferStrategyFreelist *freelist;
+
+			freelist = &StrategyControl->freelists[nfreelist];
+
+			freelist->firstFreeBuffer = FREENEXT_END_OF_LIST;
+
+			SpinLockInit(&freelist->freelist_lock);
+		}
+
+		/* buffers per CPU (also used for PID partitioning) */
+		buffers_per_cpu = (NBuffers / strategy_ncpus);
+
+		elog(LOG, "NBuffers: %d, nodes %d, ncpus: %d, divide: %d, remain: %d",
+			 NBuffers, strategy_nnodes, strategy_ncpus,
+			 buffers_per_cpu, NBuffers - (strategy_ncpus * buffers_per_cpu));
+
+		/*
+		 * Walk through the buffers, add them to the correct list. Walk from
+		 * the end, because we're adding the buffers to the beginning.
+		 */
+		for (int i = NBuffers - 1; i >= 0; i--)
+		{
+			BufferDesc *buf = GetBufferDescriptor(i);
+			BufferStrategyFreelist *freelist;
+			int			belongs_to = 0; /* first freelist by default */
+
+			/*
+			 * Split the freelist into partitions, if needed (or just keep the
+			 * freelist we already built in BufferManagerShmemInit().
+			 */
+			if ((numa_partition_freelist == FREELIST_PARTITION_CPU) ||
+				(numa_partition_freelist == FREELIST_PARTITION_PID))
+			{
+				belongs_to = (i % num_cpus);
+			}
+			else if (numa_partition_freelist == FREELIST_PARTITION_NODE)
+			{
+				/* determine NUMA node for buffer */
+				belongs_to = BufferGetNode(i);
+			}
+
+			/* add to the right freelist */
+			freelist = &StrategyControl->freelists[belongs_to];
+
+			buf->freeNext = freelist->firstFreeBuffer;
+			freelist->firstFreeBuffer = i;
+		}
 	}
 	else
 		Assert(!init);
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index f5359db3656..7febf3001a3 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -148,6 +148,7 @@ int			MaxBackends = 0;
 /* NUMA stuff */
 bool		numa_buffers_interleave = false;
 bool		numa_localalloc = false;
+int			numa_partition_freelist = 0;
 
 /* GUC parameters for vacuum */
 int			VacuumBufferUsageLimit = 2048;
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 57f2df7ab74..e2361c161e6 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -491,6 +491,14 @@ static const struct config_enum_entry file_copy_method_options[] = {
 	{NULL, 0, false}
 };
 
+static const struct config_enum_entry freelist_partition_options[] = {
+	{"none", FREELIST_PARTITION_NONE, false},
+	{"node", FREELIST_PARTITION_NODE, false},
+	{"cpu", FREELIST_PARTITION_CPU, false},
+	{"pid", FREELIST_PARTITION_PID, false},
+	{NULL, 0, false}
+};
+
 /*
  * Options for enum values stored in other modules
  */
@@ -5284,6 +5292,16 @@ struct config_enum ConfigureNamesEnum[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"numa_partition_freelist", PGC_USERSET, DEVELOPER_OPTIONS,
+			gettext_noop("Enables buffer freelists to be partitioned per NUMA node."),
+			gettext_noop("When enabled, we create a separate freelist per NUMA node."),
+		},
+		&numa_partition_freelist,
+		FREELIST_PARTITION_NONE, freelist_partition_options,
+		NULL, NULL, NULL
+	},
+
 	{
 		{"wal_sync_method", PGC_SIGHUP, WAL_SETTINGS,
 			gettext_noop("Selects the method used for forcing WAL updates to disk."),
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 692871a401f..17528439f07 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -180,6 +180,7 @@ extern PGDLLIMPORT int max_parallel_workers;
 
 extern PGDLLIMPORT bool numa_buffers_interleave;
 extern PGDLLIMPORT bool numa_localalloc;
+extern PGDLLIMPORT int numa_partition_freelist;
 
 extern PGDLLIMPORT int commit_timestamp_buffers;
 extern PGDLLIMPORT int multixact_member_buffers;
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index c257c8a1c20..efb7e28c10f 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -93,6 +93,14 @@ typedef enum ExtendBufferedFlags
 	EB_LOCK_TARGET = (1 << 5),
 }			ExtendBufferedFlags;
 
+typedef enum FreelistPartitionMode
+{
+	FREELIST_PARTITION_NONE,
+	FREELIST_PARTITION_NODE,
+	FREELIST_PARTITION_CPU,
+	FREELIST_PARTITION_PID,
+}			FreelistPartitionMode;
+
 /*
  * Some functions identify relations either by relation or smgr +
  * relpersistence.  Used via the BMR_REL()/BMR_SMGR() macros below.  This
-- 
2.49.0

