From be039fdf42ba3bf6637697f6fec38874f94d09b6 Mon Sep 17 00:00:00 2001 From: Greg Burd Date: Fri, 11 Jul 2025 09:05:45 -0400 Subject: [PATCH v2 2/2] Remove the buffer_strategy_lock. With the removal of the freelist the remaining items in the BufferStrategyControl structure no longer require strict coordination. Atomic operations will suffice. --- src/backend/storage/buffer/README | 39 +++++++++---------- src/backend/storage/buffer/bufmgr.c | 8 ++-- src/backend/storage/buffer/freelist.c | 56 ++++++--------------------- src/backend/storage/buffer/localbuf.c | 2 +- src/include/storage/buf_internals.h | 2 +- 5 files changed, 36 insertions(+), 71 deletions(-) diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README index cd52effd911..a60f77d7ee9 100644 --- a/src/backend/storage/buffer/README +++ b/src/backend/storage/buffer/README @@ -127,11 +127,10 @@ bits of the tag's hash value. The rules stated above apply to each partition independently. If it is necessary to lock more than one partition at a time, they must be locked in partition-number order to avoid risk of deadlock. -* A separate system-wide spinlock, buffer_strategy_lock, provides mutual -exclusion for operations that select buffers for replacement. A spinlock is -used here rather than a lightweight lock for efficiency; no other locks of any -sort should be acquired while buffer_strategy_lock is held. This is essential -to allow buffer replacement to happen in multiple backends with reasonable +* Operations that select buffers for replacement don't require a lock, but +rather use atomic operations to ensure coordination across backends when +accessing members of the BufferStrategyControl datastructure. This allows +buffer replacement to happen in multiple backends with reasonable concurrency. * Each buffer header contains a spinlock that must be taken when examining @@ -173,14 +172,12 @@ buffer_strategy_lock. The algorithm for a process that needs to obtain a victim buffer is: -1. Obtain buffer_strategy_lock. +1. Select the buffer pointed to by nextVictimBuffer, and circularly advance +nextVictimBuffer for next time. -2. Select the buffer pointed to by nextVictimBuffer, and circularly advance -nextVictimBuffer for next time. Release buffer_strategy_lock. - -3. If the selected buffer is pinned or has a nonzero usage count, it cannot -be used. Decrement its usage count (if nonzero), reacquire -buffer_strategy_lock, and return to step 3 to examine the next buffer. +2. If the selected buffer is pinned or has a nonzero usage count, it cannot be +used. Decrement its usage count (if nonzero), return to step 3 to examine the +next buffer. 4. Pin the selected buffer, and return. @@ -196,9 +193,9 @@ Buffer Ring Replacement Strategy When running a query that needs to access a large number of pages just once, such as VACUUM or a large sequential scan, a different strategy is used. A page that has been touched only by such a scan is unlikely to be needed -again soon, so instead of running the normal clock sweep algorithm and +again soon, so instead of running the normal clock-sweep algorithm and blowing out the entire buffer cache, a small ring of buffers is allocated -using the normal clock sweep algorithm and those buffers are reused for the +using the normal clock-sweep algorithm and those buffers are reused for the whole scan. This also implies that much of the write traffic caused by such a statement will be done by the backend itself and not pushed off onto other processes. @@ -244,13 +241,13 @@ nextVictimBuffer (which it does not change!), looking for buffers that are dirty and not pinned nor marked with a positive usage count. It pins, writes, and releases any such buffer. -If we can assume that reading nextVictimBuffer is an atomic action, then -the writer doesn't even need to take buffer_strategy_lock in order to look -for buffers to write; it needs only to spinlock each buffer header for long -enough to check the dirtybit. Even without that assumption, the writer -only needs to take the lock long enough to read the variable value, not -while scanning the buffers. (This is a very substantial improvement in -the contention cost of the writer compared to PG 8.0.) +We enforce reading nextVictimBuffer within an atomic action so that the writer +doesn't even need to take buffer_strategy_lock in order to look for buffers to +write; it needs only to spinlock each buffer header for long enough to check +the dirtybit. Even without that assumption, the writer only needs to take the +lock long enough to read the variable value, not while scanning the buffers. +(This is a very substantial improvement in the contention cost of the writer +compared to PG 8.0.) The background writer takes shared content lock on a buffer while writing it out (and anyone else who flushes buffer contents to disk must do so too). diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 9c059441a5c..d068f77362d 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3593,7 +3593,7 @@ BufferSync(int flags) * This is called periodically by the background writer process. * * Returns true if it's appropriate for the bgwriter process to go into - * low-power hibernation mode. (This happens if the strategy clock sweep + * low-power hibernation mode. (This happens if the strategy clock-sweep * has been "lapped" and no buffer allocations have occurred recently, * or if the bgwriter has been effectively disabled by setting * bgwriter_lru_maxpages to 0.) @@ -3643,7 +3643,7 @@ BgBufferSync(WritebackContext *wb_context) uint32 new_recent_alloc; /* - * Find out where the clock sweep currently is, and how many buffer + * Find out where the clock-sweep currently is, and how many buffer * allocations have happened since our last call. */ strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc); @@ -3664,8 +3664,8 @@ BgBufferSync(WritebackContext *wb_context) /* * Compute strategy_delta = how many buffers have been scanned by the - * clock sweep since last time. If first time through, assume none. Then - * see if we are still ahead of the clock sweep, and if so, how many + * clock-sweep since last time. If first time through, assume none. Then + * see if we are still ahead of the clock-sweep, and if so, how many * buffers we could scan before we'd catch up with it and "lap" it. Note: * weird-looking coding of xxx_passes comparisons are to avoid bogus * behavior when the passes counts wrap around. diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index a228ff27377..267a9d84df3 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -29,11 +29,8 @@ */ typedef struct { - /* Spinlock: protects the values below */ - slock_t buffer_strategy_lock; - /* - * Clock sweep hand: index of next buffer to consider grabbing. Note that + * Clock-sweep hand: index of next buffer to consider grabbing. Note that * this isn't a concrete buffer - we only ever increase the value. So, to * get an actual buffer, it needs to be used modulo NBuffers. */ @@ -43,7 +40,7 @@ typedef struct * Statistics. These counters should be wide enough that they can't * overflow during a single bgwriter cycle. */ - uint32 completePasses; /* Complete cycles of the clock sweep */ + pg_atomic_uint32 completePasses; /* Complete cycles of the clock-sweep */ pg_atomic_uint32 numBufferAllocs; /* Buffers allocated since last reset */ /* @@ -116,12 +113,7 @@ ClockSweepTick(void) /* always wrap what we look up in BufferDescriptors */ victim = victim % NBuffers; - /* - * If we're the one that just caused a wraparound, force - * completePasses to be incremented while holding the spinlock. We - * need the spinlock so StrategySyncStart() can return a consistent - * value consisting of nextVictimBuffer and completePasses. - */ + /* Increment completePasses if we just caused a wraparound */ if (victim == 0) { uint32 expected; @@ -132,23 +124,12 @@ ClockSweepTick(void) while (!success) { - /* - * Acquire the spinlock while increasing completePasses. That - * allows other readers to read nextVictimBuffer and - * completePasses in a consistent manner which is required for - * StrategySyncStart(). In theory delaying the increment - * could lead to an overflow of nextVictimBuffers, but that's - * highly unlikely and wouldn't be particularly harmful. - */ - SpinLockAcquire(&StrategyControl->buffer_strategy_lock); - wrapped = expected % NBuffers; success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer, &expected, wrapped); if (success) - StrategyControl->completePasses++; - SpinLockRelease(&StrategyControl->buffer_strategy_lock); + pg_atomic_fetch_add_u32(&StrategyControl->completePasses, 1); } } } @@ -177,10 +158,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r *from_ring = false; - /* - * If given a strategy object, see whether it can select a buffer. We - * assume strategy objects don't need buffer_strategy_lock. - */ + /* If given a strategy object, see whether it can select a buffer */ if (strategy != NULL) { buf = GetBufferFromRing(strategy, buf_state); @@ -225,7 +203,7 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r */ pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1); - /* Use the "clock sweep" algorithm to find a free buffer */ + /* Use the "clock-sweep" algorithm to find a free buffer */ trycounter = NBuffers; for (;;) { @@ -287,13 +265,12 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc) uint32 nextVictimBuffer; int result; - SpinLockAcquire(&StrategyControl->buffer_strategy_lock); nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer); result = nextVictimBuffer % NBuffers; if (complete_passes) { - *complete_passes = StrategyControl->completePasses; + *complete_passes = pg_atomic_read_u32(&StrategyControl->completePasses); /* * Additionally add the number of wraparounds that happened before @@ -306,7 +283,7 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc) { *num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0); } - SpinLockRelease(&StrategyControl->buffer_strategy_lock); + return result; } @@ -321,21 +298,14 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc) void StrategyNotifyBgWriter(int bgwprocno) { - /* - * We acquire buffer_strategy_lock just to ensure that the store appears - * atomic to StrategyGetBuffer. The bgwriter should call this rather - * infrequently, so there's no performance penalty from being safe. - */ - SpinLockAcquire(&StrategyControl->buffer_strategy_lock); StrategyControl->bgwprocno = bgwprocno; - SpinLockRelease(&StrategyControl->buffer_strategy_lock); } /* * StrategyShmemSize * - * estimate the size of shared memory used by the freelist-related structures. + * Estimate the size of shared memory used by the freelist-related structures. * * Note: for somewhat historical reasons, the buffer lookup hashtable size * is also determined here. @@ -393,13 +363,11 @@ StrategyInitialize(bool init) */ Assert(init); - SpinLockInit(&StrategyControl->buffer_strategy_lock); - - /* Initialize the clock sweep pointer */ + /* Initialize the clock-sweep pointer */ pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0); /* Clear statistics */ - StrategyControl->completePasses = 0; + pg_atomic_init_u32(&StrategyControl->completePasses, 0); pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0); /* No pending notification */ @@ -643,7 +611,7 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state) * * If usage_count is 0 or 1 then the buffer is fair game (we expect 1, * since our own previous usage of the ring element would have left it - * there, but it might've been decremented by clock sweep since then). A + * there, but it might've been decremented by clock-sweep since then). A * higher usage_count indicates someone else has touched the buffer, so we * shouldn't re-use it. */ diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 3da9c41ee1d..7a34f5e430a 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -229,7 +229,7 @@ GetLocalVictimBuffer(void) ResourceOwnerEnlarge(CurrentResourceOwner); /* - * Need to get a new buffer. We use a clock sweep algorithm (essentially + * Need to get a new buffer. We use a clock-sweep algorithm (essentially * the same as what freelist.c does now...) */ trycounter = NLocBuffer; diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 00eade63971..133a0dd7fd5 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -81,7 +81,7 @@ StaticAssertDecl(BUF_REFCOUNT_BITS + BUF_USAGECOUNT_BITS + BUF_FLAG_BITS == 32, * accuracy and speed of the clock-sweep buffer management algorithm. A * large value (comparable to NBuffers) would approximate LRU semantics. * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of - * clock sweeps to find a free buffer, so in practice we don't want the + * clock-sweeps to find a free buffer, so in practice we don't want the * value to be very large. */ #define BM_MAX_USAGE_COUNT 5 -- 2.49.0