From 48e2de1758b93dedc060171acf54d45cb64a06fb Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 4 Feb 2025 17:51:11 +0100 Subject: [PATCH v1 2/2] BufTable: Optimize allocation of BufTable entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the slots were assigned ±randomly based on freelist order, now we first try to use slot in the bucket before trying to acquire slots from the freelists. This reduces the number of pointers we need to chase for lookups. --- src/backend/storage/buffer/buf_table.c | 137 ++++++++++++++++++------- 1 file changed, 102 insertions(+), 35 deletions(-) diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c index b237822285a..b9c302186a7 100644 --- a/src/backend/storage/buffer/buf_table.c +++ b/src/backend/storage/buffer/buf_table.c @@ -25,6 +25,8 @@ #include "storage/buf_internals.h" #include "utils/dynahash.h" +#define NUM_FREELISTS NUM_BUFFER_PARTITIONS + /* * Hash element: Singly linked list with hash of value contained, and * the reference to the buffer header that contains the to-be-hashed value. @@ -69,9 +71,9 @@ typedef struct HashBucket typedef struct FreeList { slock_t mutex; - int32 firstFree; - int32 lastFree; - int32 myTag; + int32 prev; /* first free */ + int32 next; /* last free */ + int32 freeListTag; } FreeList; @@ -92,7 +94,7 @@ typedef struct BufTable { BufTableHeader hdr; char pad[CACHELINEALIGN(sizeof(BufTableHeader))]; }; - FreeListPadded freeList[NUM_BUFFER_PARTITIONS]; + FreeListPadded freeList[NUM_FREELISTS]; HashBucket buckets[FLEXIBLE_ARRAY_MEMBER]; } BufTable; @@ -100,12 +102,12 @@ static BufTable *bufLookupTable; #define NumBucketsFromSize(size) (1 << my_log2((size * 5) / 4)) -#define FreeListIdxFromHash(hash) (hash % NUM_BUFFER_PARTITIONS) +#define FreeListIdxFromHash(hash) (hash % NUM_FREELISTS) static inline FreeList * BufTableGetFreelist(BufTable *table, uint32 idx) { - Assert(idx < NUM_BUFFER_PARTITIONS); + Assert(idx < NUM_FREELISTS); return &table->freeList[idx].list; } @@ -151,7 +153,7 @@ BufTableShmemSize(int size) } -static inline void +static void add_to_freelist(BufTable *table, HashFreeListEntry *freeEntry, uint32 index) { uint32 freeListIdx; @@ -167,12 +169,14 @@ add_to_freelist(BufTable *table, HashFreeListEntry *freeEntry, uint32 index) freeEntry->prev = freeListTag; S_LOCK(&freeList->mutex); - next = freeList->firstFree; - if (next < 0) + next = freeList->prev; + if (next < 0) /* list is empty */ { pg_atomic_write_u32(&freeEntry->tag, (uint32) freeListTag); - freeList->firstFree = (int32) index; - freeList->lastFree = (int32) index; + freeList->prev = (int32) index; + freeList->next = (int32) index; + freeEntry->next = freeListTag; + freeEntry->prev = freeListTag; } else { @@ -183,8 +187,8 @@ add_to_freelist(BufTable *table, HashFreeListEntry *freeEntry, uint32 index) pg_atomic_write_u32(&freeEntry->tag, (uint32) freeListTag); - freeEntry->prev = freeList->firstFree; - freeList->firstFree = prevEntry->next = (int32) index; + freeEntry->prev = freeList->prev; + freeList->prev = prevEntry->next = (int32) index; } S_UNLOCK(&freeList->mutex); } @@ -220,15 +224,15 @@ InitBufTable(int size) bufLookupTable->hdr.numBuckets = nbuckets; bufLookupTable->hdr.mask = nbuckets - 1; - for (int i = 0; i < NUM_BUFFER_PARTITIONS; i++) + for (int i = 0; i < NUM_FREELISTS; i++) { FreeList *list = &bufLookupTable->freeList[i].list; S_INIT_LOCK(&list->mutex); - list->myTag = (-1 - i); + list->freeListTag = (-1 - i); - list->firstFree = list->myTag; - list->lastFree = list->myTag; + list->prev = list->freeListTag; + list->next = list->freeListTag; } /* build per-partition freelists */ @@ -281,39 +285,81 @@ initial_lookup(BufTable *table, uint32 hashvalue, HashBucket **hashBucket, int32 * does not hold this code will loop infinitely. */ static HashElement * -buftable_get_free_element(BufTable *table, uint32 hashValue, int32 *pInt) +buftable_get_free_element(BufTable *table, HashBucket *bucket, uint32 hashValue, + int32 *pInt) { FreeList *freeList; HashFreeListEntry *entry; + uint32 freeListIdx = FreeListIdxFromHash(hashValue); + freeList = BufTableGetFreelist(table, freeListIdx); - do { - for (;; hashValue++) + /* + * First, try to fit the newly added element into the bucket's + * element slot, if there's space. + */ + if (freeList->freeListTag == (int32) pg_atomic_read_u32(&bucket->freeEntry.tag)) + { + S_LOCK(&freeList->mutex); + if (likely(freeList->freeListTag == (int32) pg_atomic_read_u32(&bucket->freeEntry.tag))) { - freeList = BufTableGetFreelist(table, FreeListIdxFromHash(hashValue)); + *pInt = calc_bucket(table, hashValue); + entry = &bucket->freeEntry; - S_LOCK(&freeList->mutex); - if (freeList->firstFree != freeList->myTag) - break; - S_UNLOCK(&freeList->mutex); + goto freeElementFound; } + S_UNLOCK(&freeList->mutex); + } - entry = BufTableGetFreeEntry(table, freeList->firstFree); - *pInt = freeList->firstFree; - } while (false); + /* + * If all else failed, fall back to pulling the elements from the + * freelists directly. + */ + for (;;) + { + S_LOCK(&freeList->mutex); + if (freeList->prev != freeList->freeListTag) + break; + S_UNLOCK(&freeList->mutex); + hashValue++; + freeListIdx = FreeListIdxFromHash(hashValue); + freeList = BufTableGetFreelist(table, freeListIdx); + } - Assert(entry->next == freeList->myTag); + entry = BufTableGetFreeEntry(table, freeList->prev); + *pInt = freeList->prev; - /* unlink the entry from the linked list*/ - if (entry->prev != freeList->myTag) + /* + * preconditions: + * + * - entry is set + * - *pInt is set to the offset of the entry + * - freeList contains the freeList associated with entry + * - freeList->mutex is locked + */ +freeElementFound: + /* + * Unlink the entry from the (doubly!) linked list. + */ + if (entry->prev != freeList->freeListTag) { HashFreeListEntry *prevEntry = BufTableGetFreeEntry(table, entry->prev); + Assert(pg_atomic_read_u32(&prevEntry->tag) == freeList->freeListTag); prevEntry->next = entry->next; - freeList->firstFree = entry->prev; } else { - Assert(freeList->firstFree == freeList->lastFree); - Assert(entry->next == entry->prev); + freeList->next = entry->next; + } + + if (entry->next != freeList->freeListTag) + { + HashFreeListEntry *nextEntry = BufTableGetFreeEntry(table, entry->next); + Assert(pg_atomic_read_u32(&nextEntry->tag) == freeList->freeListTag); + nextEntry->prev = entry->prev; + } + else + { + freeList->prev = entry->prev; } S_UNLOCK(&freeList->mutex); @@ -443,7 +489,8 @@ BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id) return (int) pg_atomic_read_u32(&matchElement->buffer); /* else, create a new one */ - matchElement = buftable_get_free_element(table, hashcode, &matchElemIdx); + matchElement = buftable_get_free_element(table, currBucket, hashcode, + &matchElemIdx); /* link into hashbucket chain */ *prevNextPtr = matchElemIdx; @@ -505,6 +552,26 @@ BufTableDelete(BufferTag *tagPtr, uint32 hashcode, int buffer) */ Assert(0 <= (int32) pg_atomic_read_u32(&matchElement->buffer)); + /* + * If we're dropping the optimally located hash entry, move the + * next element's slot data, and drop that next element instead. + */ + if (&currBucket->element == matchElement && matchElement->next >= 0) + { + int32 nextIdx = matchElement->next; + HashElement *next = BufTableGetElement(table, nextIdx); + + /* copy payloads into the "old deleted" slot */ + pg_atomic_write_u32(&matchElement->buffer, + pg_atomic_read_u32(&next->buffer)); + matchElement->hashValue = next->hashValue; + + /* Mark the next element for deletion */ + prevNextPtr = &matchElement->next; + matchElement = next; + matchElemIdx = nextIdx; + } + /* unlink the element from the singly-linked list */ *prevNextPtr = matchElement->next; -- 2.46.0