From 01087e0724c4184f21ad3b369b73a761f0b561a0 Mon Sep 17 00:00:00 2001 From: Skakovsky Yury Date: Wed, 6 Feb 2019 12:26:03 +0300 Subject: [PATCH] Tweaked LRU for shared buffer management --- src/backend/storage/buffer/buf_init.c | 8 + src/backend/storage/buffer/bufmgr.c | 4 + src/backend/storage/buffer/freelist.c | 389 +++++++++++++++++++++----- src/include/storage/buf_internals.h | 13 + 4 files changed, 350 insertions(+), 64 deletions(-) diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index b0ee3a26d6..7deb6bce66 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -126,6 +126,14 @@ InitBufferPool(void) buf->buf_id = i; + buf->id_of_prev = i - 1; + buf->id_of_next = i + 1; + if (buf->id_of_next == NBuffers) + buf->id_of_next = NO_LOGICAL_NEIGHBOUR; + + buf->beforeMid = (buf->buf_id < NBuffers * 5 / 8); + buf->inLiveZone = (buf->buf_id < NBuffers * 7 / 8); + /* * Initially link all the buffers together as unused. Subsequent * management of this list is done by freelist.c. diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 273e2f385f..de1a9f3037 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1051,6 +1051,10 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, *foundPtr = false; } } + else + { + RemoveBufferOnStart(buf); + } return buf; } diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 03caceaf7b..189bd34321 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -46,6 +46,35 @@ typedef struct * when the list is empty) */ + /* + * Index of 5/8-position buffer + */ + int separatingBufferLogical; + /* + * Index of 7/8-position buffer + */ + int leaderOfDeathZoneBufferLogical; + /* + * Index of 0/8-position buffer + */ + int firstBufferLogical; + /* + * Index of last buffer + */ + int lastBufferLogical; + + /* + * NOTE: there is invariant that id_of_next of + * lastBufferLogical == -1 == NO_LOGICAL_NEIGHBOUT. + * The same sutiatin with id_of_prev of firstBufferLogical + */ + + /* + * Index of current buffer for "clock sweep" algorithm at the last 1/8. + * There is real index becasuse of in ClockSweepTickAtTheEnd() we check. + */ + pg_atomic_uint32 curVictim; + /* * Statistics. These counters should be wide enough that they can't * overflow during a single bgwriter cycle. @@ -103,69 +132,248 @@ static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy, static void AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf); -/* - * ClockSweepTick - Helper routine for StrategyGetBuffer() - * - * Move the clock hand one buffer ahead of its current position and return the - * id of the buffer now under the hand. - */ +/* Push curVictim and returns it */ static inline uint32 -ClockSweepTick(void) +ClockSweepTickAtTheEnd(void) { - uint32 victim; + uint32 oldVictim; + uint32 newVictim; + bool success = false; + + while (!success) { + oldVictim = pg_atomic_read_u32(&StrategyControl->curVictim); + newVictim = GetBufferDescriptor(oldVictim)->id_of_next; + if (newVictim == NO_LOGICAL_NEIGHBOUR) { + newVictim = StrategyControl->leaderOfDeathZoneBufferLogical; + } + success = pg_atomic_compare_exchange_u32(&StrategyControl->curVictim, &oldVictim, newVictim); + } + return newVictim; +} +/* + * Changes logical indeces. So buf->buf_id will equals + * StrategyControl->firstBufferLogical. But not for the second + * buffers (see below). + * + * During the work we hold StrategyControl->buffer_strategy_lock. + */ +void +RemoveBufferOnStart(BufferDesc* buf) { + BufferDesc* buf_next; + BufferDesc* buf_prev; + BufferDesc* currentMaster; + BufferDesc* currentLeaderOfDeathZone; + BufferDesc* currentSeparatingBuffer; + + SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + + /* + * buf->id_of_prev == NO_LOGICAL_NEIGHBOUR + * this is case when we have found the first buffer yet + * + * buf->id_of_prev == StrategyControl->firstBufferLogical + * helps us to avoid extra code for this case (the end does not justify the means) + */ + if ( + buf->id_of_prev == NO_LOGICAL_NEIGHBOUR + || buf->id_of_prev == StrategyControl->firstBufferLogical + ) + { + SpinLockRelease(&StrategyControl->buffer_strategy_lock); + return; + } + + /* + * Initialization of all the necessary variables + */ + currentLeaderOfDeathZone = GetBufferDescriptor(StrategyControl->leaderOfDeathZoneBufferLogical); + currentSeparatingBuffer = GetBufferDescriptor(StrategyControl->separatingBufferLogical); + currentMaster = GetBufferDescriptor(StrategyControl->firstBufferLogical); + /* - * Atomically move hand ahead one buffer - if there's several processes - * doing this, this can lead to buffers being returned slightly out of - * apparent order. + * We must consider this case to avoid segfault. */ - victim = - pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1); + if (buf->id_of_next >= 0) + buf_next = GetBufferDescriptor(buf->id_of_next); + else + buf_next = NULL; + buf_prev = GetBufferDescriptor(buf->id_of_prev); - if (victim >= NBuffers) + /* + * It seems that it is useless. But this commit is the most stable. + * Just in case we will not touch. + */ + if (currentMaster == buf || currentMaster == buf_prev) + { + SpinLockRelease(&StrategyControl->buffer_strategy_lock); + return; + } + + /* + * update links at neighbors + */ + if (buf_next != NULL) + { + buf_prev->id_of_next = buf->id_of_next; + buf_next->id_of_prev = buf->id_of_prev; + } + else { - uint32 originalVictim = victim; + buf_prev->id_of_next = NO_LOGICAL_NEIGHBOUR; + StrategyControl->lastBufferLogical = buf_prev->buf_id; + } + + /* + * update other links + */ + buf->id_of_prev = NO_LOGICAL_NEIGHBOUR; + buf->id_of_next = StrategyControl->firstBufferLogical; + currentMaster->id_of_prev = buf->buf_id; - /* always wrap what we look up in BufferDescriptors */ - victim = victim % NBuffers; + /* + * update firstBufferLogical + */ + StrategyControl->firstBufferLogical = buf->buf_id; - /* - * If we're the one that just caused a wraparound, force - * completePasses to be incremented while holding the spinlock. We - * need the spinlock so StrategySyncStart() can return a consistent - * value consisting of nextVictimBuffer and completePasses. - */ - if (victim == 0) + /* + * update boolean field 'inLiveZone' and leaderOfDeathZoneBufferLogical + * if it is necessary, that is buf was situated in the last 1/8 of + * logical buffer. + */ + if (!buf->inLiveZone) { + buf->inLiveZone = true; + buf->beforeMid = true; + + if (currentLeaderOfDeathZone == buf) { - uint32 expected; - uint32 wrapped; - bool success = false; - - expected = originalVictim + 1; + StrategyControl->leaderOfDeathZoneBufferLogical = buf_prev->buf_id; + buf_prev->inLiveZone = false; + } + else + { + StrategyControl->leaderOfDeathZoneBufferLogical = currentLeaderOfDeathZone->id_of_prev; + GetBufferDescriptor(StrategyControl->leaderOfDeathZoneBufferLogical)->inLiveZone = false; + } + } - while (!success) - { - /* - * Acquire the spinlock while increasing completePasses. That - * allows other readers to read nextVictimBuffer and - * completePasses in a consistent manner which is required for - * StrategySyncStart(). In theory delaying the increment - * could lead to an overflow of nextVictimBuffers, but that's - * highly unlikely and wouldn't be particularly harmful. - */ - SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + /* + * Similarly with position 5/8. + * Note that (!buf->inLiveZone) excepts (!buf->beforeMid) + */ + if (!buf->beforeMid) { + buf->beforeMid = true; + + if (currentSeparatingBuffer == buf) + { + StrategyControl->separatingBufferLogical = buf_prev->buf_id; + buf_prev->beforeMid = false; + } + else + { + StrategyControl->separatingBufferLogical = currentSeparatingBuffer->id_of_prev; + GetBufferDescriptor(StrategyControl->separatingBufferLogical)->beforeMid = false; + } + } + + SpinLockRelease(&StrategyControl->buffer_strategy_lock); +} - wrapped = expected % NBuffers; +/* + * All the code is similar to RemoveBufferOnStart. + * + * P.S. it is copypaste and should be merged to one function. + * P.P.S. we will lose some performance after the merger. + * + * TODO + */ +void +RemoveBufferOnSeparatingPosition(BufferDesc* buf) { + BufferDesc* buf_next; + BufferDesc* buf_prev; + BufferDesc* currentMaster; + BufferDesc* master_prev; + BufferDesc* currentLeaderOfDeathZone; + + SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + + if ( + buf->buf_id == StrategyControl->separatingBufferLogical + || buf->id_of_prev == StrategyControl->separatingBufferLogical + || buf->beforeMid + ) + { + SpinLockRelease(&StrategyControl->buffer_strategy_lock); + return; + } + + /* + * Init + */ + currentLeaderOfDeathZone = GetBufferDescriptor(StrategyControl->leaderOfDeathZoneBufferLogical); + currentMaster = GetBufferDescriptor(StrategyControl->separatingBufferLogical); + + if (buf->id_of_next >= 0) + buf_next = GetBufferDescriptor(buf->id_of_next); + else + buf_next = NULL; + + buf_prev = GetBufferDescriptor(buf->id_of_prev); + + master_prev = GetBufferDescriptor(currentMaster->id_of_prev); - success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer, - &expected, wrapped); - if (success) - StrategyControl->completePasses++; - SpinLockRelease(&StrategyControl->buffer_strategy_lock); - } - } + /* + * More efficiently to skip bump if we want to bump only + * on one position. Moreover, code is clearer in this way. + */ + if (currentMaster == buf_prev) + { + SpinLockRelease(&StrategyControl->buffer_strategy_lock); + return; + } + + /* + * We just do it here because of all the next operations do not + * touch currentLeaderOfDeathZone. + */ + if (currentLeaderOfDeathZone == buf) { + buf->inLiveZone = true; + buf_prev->inLiveZone = false; + + StrategyControl->leaderOfDeathZoneBufferLogical = currentLeaderOfDeathZone->id_of_prev; + } + + if (buf->id_of_next >= 0) + { + buf_next->id_of_prev = buf_prev->buf_id; + buf_prev->id_of_next = buf_next->buf_id; } - return victim; + else + { + buf_prev->id_of_next = NO_LOGICAL_NEIGHBOUR; + + StrategyControl->lastBufferLogical = buf_prev->buf_id; + } + + buf->id_of_next = StrategyControl->separatingBufferLogical; + buf->id_of_prev = master_prev->buf_id; + + currentMaster->id_of_prev = buf->buf_id; + master_prev->id_of_next = buf->buf_id; + + StrategyControl->separatingBufferLogical = buf->buf_id; + + /* + * Now we consider only case when buf != currentLeaderOfDeathZone + * (see that case (by the way excepting current case) earlier). + */ + if (!buf->inLiveZone) + { + GetBufferDescriptor(currentLeaderOfDeathZone->id_of_prev)->inLiveZone = false; + StrategyControl->leaderOfDeathZoneBufferLogical = currentLeaderOfDeathZone->id_of_prev; + } + + SpinLockRelease(&StrategyControl->buffer_strategy_lock); } /* @@ -204,7 +412,9 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state) int bgwprocno; int trycounter; uint32 local_buf_state; /* to avoid repeated (de-)referencing */ - + int victimCandidate; + BufferDesc *tempBuf; + /* * If given a strategy object, see whether it can select a buffer. We * assume strategy objects don't need buffer_strategy_lock. @@ -284,7 +494,9 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state) /* Unconditionally remove buffer from freelist */ StrategyControl->firstFreeBuffer = buf->freeNext; buf->freeNext = FREENEXT_NOT_IN_LIST; - + + StrategyControl->lastBufferLogical = buf->buf_id; + /* * Release the lock so someone else can access the freelist while * we check out this buffer. @@ -305,6 +517,13 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state) if (strategy != NULL) AddBufferToRing(strategy, buf); *buf_state = local_buf_state; + + /* + * There is a reason here too to make a bump buffer, + * but we did not have time to test it. + * TODO + */ + return buf; } UnlockBufHdr(buf, local_buf_state); @@ -312,11 +531,14 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state) } } - /* Nothing on the freelist, so run the "clock sweep" algorithm */ + /* Nothing on the freelist, so run the "LRU5/8" algorithm */ trycounter = NBuffers; + SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + victimCandidate = StrategyControl->lastBufferLogical; + SpinLockRelease(&StrategyControl->buffer_strategy_lock); for (;;) - { - buf = GetBufferDescriptor(ClockSweepTick()); + { + buf = GetBufferDescriptor(victimCandidate); /* * If the buffer is pinned or has a nonzero usage_count, we cannot use @@ -326,11 +548,19 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state) if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0) { + /* + * usage_count is useless after replacements "clock sweep", + * TODO + */ if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0) { local_buf_state -= BUF_USAGECOUNT_ONE; trycounter = NBuffers; + + SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + victimCandidate = StrategyControl->lastBufferLogical; + SpinLockRelease(&StrategyControl->buffer_strategy_lock); } else { @@ -338,20 +568,38 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state) if (strategy != NULL) AddBufferToRing(strategy, buf); *buf_state = local_buf_state; + + RemoveBufferOnSeparatingPosition(buf); + return buf; } } - else if (--trycounter == 0) + else { + + if (--trycounter == 0) + { + /* + * We've scanned all the buffers without making any state changes, + * so all the buffers are pinned (or were when we looked at them). + * We could hope that someone will free one eventually, but it's + * probably better to fail than to risk getting stuck in an + * infinite loop. + */ + UnlockBufHdr(buf, local_buf_state); + + elog(ERROR, "no unpinned buffers available"); + } /* - * We've scanned all the buffers without making any state changes, - * so all the buffers are pinned (or were when we looked at them). - * We could hope that someone will free one eventually, but it's - * probably better to fail than to risk getting stuck in an - * infinite loop. - */ - UnlockBufHdr(buf, local_buf_state); - elog(ERROR, "no unpinned buffers available"); + * In a good scenario, lock is not needed here: we use atomic variables + * TODO + * And "% NBuffers" is useless to: it seems that there is no bug in + * ClockSweepTickAtTheEnd(), it returns valid buf_id always + * TODO + */ + SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + victimCandidate = ClockSweepTickAtTheEnd() % NBuffers; + SpinLockRelease(&StrategyControl->buffer_strategy_lock); } UnlockBufHdr(buf, local_buf_state); } @@ -512,9 +760,22 @@ StrategyInitialize(bool init) StrategyControl->firstFreeBuffer = 0; StrategyControl->lastFreeBuffer = NBuffers - 1; + /* + * Initially first/lastBufferLogical coincide with the physical ones. + */ + StrategyControl->firstBufferLogical = 0; + StrategyControl->lastBufferLogical = NBuffers - 1; + + /* + * Initialization of separating buffers. + */ + StrategyControl->separatingBufferLogical = NBuffers * 5 / 8; + StrategyControl->leaderOfDeathZoneBufferLogical = NBuffers * 7 / 8; + /* Initialize the clock sweep pointer */ pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0); - + pg_atomic_init_u32(&StrategyControl->curVictim, StrategyControl->leaderOfDeathZoneBufferLogical); + /* Clear statistics */ StrategyControl->completePasses = 0; pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0); diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index ba1b5463fc..95b392c912 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -186,6 +186,14 @@ typedef struct BufferDesc int wait_backend_pid; /* backend PID of pin-count waiter */ int freeNext; /* link in freelist chain */ + /* Indeces of next and previous buffers in logical sequence */ + int id_of_next; + int id_of_prev; + + /* Not including separating buffers */ + bool beforeMid; + bool inLiveZone; + LWLock content_lock; /* to lock access to buffer contents */ } BufferDesc; @@ -236,6 +244,11 @@ extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray; #define FREENEXT_END_OF_LIST (-1) #define FREENEXT_NOT_IN_LIST (-2) +/* + * The id_of_next/id_of_prev is either the index of the next freelist entry or: + */ +#define NO_LOGICAL_NEIGHBOUR (-1) + /* * Functions for acquiring/releasing a shared buffer header's spinlock. Do * not apply these to local buffers! -- 2.20.1