From bdcf90fbd89a0aec397a3d57224ae732959733f9 Mon Sep 17 00:00:00 2001
From: Greg Burd <greg@burd.me>
Date: Sat, 25 Apr 2026 15:52:36 -0400
Subject: [PATCH v1] Reduce clock-sweep atomic contention by claiming buffers
 in batches

StrategyGetBuffer() advances nextVictimBuffer via
pg_atomic_fetch_add_u32(..., 1) on every tick.  On multi-socket
systems the cache line holding the counter has to travel over the
interconnect on each operation, pushing a sweep tick from ~20ns (the
same-socket case) into the ~100-200ns range.  With hundreds of
concurrent backends under eviction pressure, that one cache line
becomes the dominant cost in the sweep, visible as elevated
bus-cycles and cache-misses in perf profiles.

Each backend now claims a range of CLOCK_SWEEP_BATCH_SIZE (64)
consecutive buffer IDs with a single fetch-add and iterates through
them privately.  The sweep still advances through the pool in order,
each buffer is still visited exactly once per complete pass, and
usage_count is still decremented exactly once per buffer per pass;
the meaning of usage_count as "how many complete passes a buffer
survives without a re-pin" is preserved.  What changes is the
temporal ordering of decrements within a single pass, which the
algorithm does not depend on.

Wraparound handling is adjusted: with batching, multiple backends
can each see their fetch-add return a value past NBuffers within
the same pass.  Any such backend takes buffer_strategy_lock,
re-reads the counter, and if it is still out of range wraps it with
a single CAS and increments completePasses.  StrategySyncStart()
continues to see a consistent (nextVictimBuffer, completePasses)
pair.

Batching is only useful when the atomic is actually contended
across nodes, so it is applied only when libnuma reports more than
one node (pg_numa_get_max_node() >= 1); otherwise the batch size
stays at 1 and the code path matches master bit-for-bit.  The batch
is also capped at NBuffers so a claim cannot wrap the pool more
than once.

Co-Authored-by: Jim Mlodgenski <mlodj@amazon.com>
Co-Authored-by: Greg Burd <greg@burd.me>
---
 src/backend/storage/buffer/freelist.c | 136 ++++++++++++++++++--------
 1 file changed, 94 insertions(+), 42 deletions(-)

diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index fdb5bad7910..e86ed1f7da0 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -22,6 +22,7 @@
 #include "storage/proc.h"
 #include "storage/shmem.h"
 #include "storage/subsystems.h"
+#include "port/pg_numa.h"
 
 #define INT_ACCESS_ONCE(var)	((int)(*((volatile int *)&(var))))
 
@@ -100,68 +101,101 @@ static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
 static void AddBufferToRing(BufferAccessStrategy strategy,
 							BufferDesc *buf);
 
+/*
+ * Number of buffer IDs to claim from the shared clock hand at once.
+ * Larger values reduce contention on the shared atomic.  With a batch
+ * size of 64, concurrent backends sweep non-overlapping chunks of 64
+ * buffers rather than interleaving one buffer at a time.  The global
+ * sweep order is preserved — each buffer is still visited exactly once
+ * per complete pass.
+ */
+#define CLOCK_SWEEP_BATCH_SIZE 64
+
+/*
+ * Per-backend state for batched clock sweep.
+ */
+static uint32 MyBatchPos = 0;	/* next buffer within batch */
+static uint32 MyBatchEnd = 0;	/* one past last buffer in batch */
+
+/*
+ * Effective batch size for the clock sweep, computed once at startup.
+ * On non-NUMA systems (single socket, no libnuma, or containers blocking
+ * get_mempolicy), this is 1 -- the original one-at-a-time behavior.
+ * On multi-node NUMA systems, this is Min(CLOCK_SWEEP_BATCH_SIZE, NBuffers)
+ * to reduce cross-socket atomic contention on nextVictimBuffer.
+ */
+static uint32 ClockSweepBatchSize = 1;
+
+static inline uint32
+EffectiveBatchSize(void)
+{
+	return ClockSweepBatchSize;
+}
+
 /*
  * ClockSweepTick - Helper routine for StrategyGetBuffer()
  *
- * Move the clock hand one buffer ahead of its current position and return the
- * id of the buffer now under the hand.
+ * Return the next buffer to consider for eviction.  Backends claim batches
+ * of consecutive buffer IDs from the shared clock hand, then iterate through
+ * them locally without further atomic operations.  This preserves the global
+ * sweep order while reducing cross-socket contention on the shared counter.
  */
 static inline uint32
 ClockSweepTick(void)
 {
 	uint32		victim;
 
-	/*
-	 * Atomically move hand ahead one buffer - if there's several processes
-	 * doing this, this can lead to buffers being returned slightly out of
-	 * apparent order.
-	 */
-	victim =
-		pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);
-
-	if (victim >= NBuffers)
+	if (MyBatchPos >= MyBatchEnd)
 	{
-		uint32		originalVictim = victim;
-
-		/* always wrap what we look up in BufferDescriptors */
-		victim = victim % NBuffers;
-
 		/*
-		 * If we're the one that just caused a wraparound, force
-		 * completePasses to be incremented while holding the spinlock. We
-		 * need the spinlock so StrategySyncStart() can return a consistent
-		 * value consisting of nextVictimBuffer and completePasses.
+		 * Claim a new batch from the shared clock hand.  This is the only
+		 * atomic operation per batch, reducing contention by the batch size.
 		 */
-		if (victim == 0)
+		uint32		start;
+		uint32		batch_size = EffectiveBatchSize();
+
+		start = pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer,
+										batch_size);
+
+		if (start >= (uint32) NBuffers)
 		{
-			uint32		expected;
-			uint32		wrapped;
-			bool		success = false;
+			start = start % NBuffers;
 
-			expected = originalVictim + 1;
+			/*
+			 * If the counter has grown beyond NBuffers, try to wrap it back.
+			 * We must hold the spinlock so StrategySyncStart() can read
+			 * nextVictimBuffer and completePasses consistently.
+			 *
+			 * Multiple backends may enter this section concurrently. After
+			 * acquiring the spinlock, re-read the counter: if another backend
+			 * already wrapped it below NBuffers, we're done.
+			 */
+			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
 
-			while (!success)
 			{
-				/*
-				 * Acquire the spinlock while increasing completePasses. That
-				 * allows other readers to read nextVictimBuffer and
-				 * completePasses in a consistent manner which is required for
-				 * StrategySyncStart().  In theory delaying the increment
-				 * could lead to an overflow of nextVictimBuffers, but that's
-				 * highly unlikely and wouldn't be particularly harmful.
-				 */
-				SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
-
-				wrapped = expected % NBuffers;
+				uint32		current;
+				uint32		wrapped;
 
-				success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
-														 &expected, wrapped);
-				if (success)
-					StrategyControl->completePasses++;
-				SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+				current = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
+				if (current >= (uint32) NBuffers)
+				{
+					wrapped = current % NBuffers;
+					if (pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
+													   &current, wrapped))
+						StrategyControl->completePasses++;
+				}
 			}
+
+			SpinLockRelease(&StrategyControl->buffer_strategy_lock);
 		}
+
+		MyBatchPos = start;
+		MyBatchEnd = start + batch_size;
 	}
+
+	victim = MyBatchPos % NBuffers;
+	MyBatchPos++;
+
 	return victim;
 }
 
@@ -408,6 +442,24 @@ StrategyCtlShmemInit(void *arg)
 
 	/* No pending notification */
 	StrategyControl->bgwprocno = -1;
+
+	/*
+	 * Determine the effective clock-sweep batch size.
+	 *
+	 * On multi-node NUMA systems, claiming batches of buffers from the shared
+	 * clock hand reduces cross-socket contention on the atomic counter.  On
+	 * single-socket systems, batching provides no benefit (the atomic is
+	 * already socket-local) and just causes backends to skip buffers, so we
+	 * use batch size 1 for the original behavior.
+	 *
+	 * pg_numa_init() returns -1 when NUMA is unavailable.
+	 * pg_numa_get_max_node() returns 0 for a single NUMA node.
+	 */
+	if (pg_numa_init() != -1 && pg_numa_get_max_node() >= 1)
+		ClockSweepBatchSize = Min(CLOCK_SWEEP_BATCH_SIZE,
+								  (uint32) NBuffers);
+	else
+		ClockSweepBatchSize = 1;
 }
 
 
-- 
2.50.1 (Apple Git-155)