From 0b7e0fd25b55c8d869afcbc7b86535f87b32442f Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 4 Feb 2025 17:03:19 +0100 Subject: [PATCH v1 1/2] BufTable: Use optimized hash table for buffer lookups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This uses up to 1/3rd the memory and 1 less redirections vs the current dynahash.c table. In the worst case for 64-bit systems, it'll only use 16% less memory, in the best case ±58%. Future commits will further optimize allocation of entries to reduce cache misses. --- src/backend/storage/buffer/buf_table.c | 459 ++++++++++++++++++++++--- src/backend/storage/buffer/bufmgr.c | 4 +- src/include/storage/buf_internals.h | 2 +- 3 files changed, 414 insertions(+), 51 deletions(-) diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c index a50955d5286..b237822285a 100644 --- a/src/backend/storage/buffer/buf_table.c +++ b/src/backend/storage/buffer/buf_table.c @@ -21,17 +21,117 @@ */ #include "postgres.h" +#include "common/hashfn.h" #include "storage/buf_internals.h" +#include "utils/dynahash.h" -/* entry for buffer lookup hashtable */ -typedef struct +/* + * Hash element: Singly linked list with hash of value contained, and + * the reference to the buffer header that contains the to-be-hashed value. + * + * Note: when negative, buffer is a sentinel value indicating it as a freelist + * entry for list at offset (-1 - offset). The user should lock that freelist + * to acquire that slot, but not before confirming that the value hasn't + * been updated since. + */ +typedef struct HashElement +{ + int32 next; + uint32 hashValue; + pg_atomic_uint32 buffer; +} HashElement; + +/* + * Free list entry for the hash. + * + * Note tha + */ +typedef struct HashFreeListEntry +{ + int32 next; + int32 prev; + pg_atomic_uint32 tag; +} HashFreeListEntry; + +StaticAssertDecl(offsetof(HashFreeListEntry, tag) == offsetof(HashElement, buffer), + "tag and buffer should be at the same offset into HashFreeListEntry and HashElement"); + +typedef struct HashBucket +{ + int32 bucketStartElement; + union { + HashElement element; + HashFreeListEntry freeEntry; + }; +} HashBucket; + + +typedef struct FreeList +{ + slock_t mutex; + int32 firstFree; + int32 lastFree; + int32 myTag; +} FreeList; + + +typedef union FreeListPadded +{ + FreeList list; + char padding[PG_CACHE_LINE_SIZE]; +} FreeListPadded; + +typedef struct BufTableHeader +{ + uint32 numBuckets; + uint32 mask; +} BufTableHeader; + +typedef struct BufTable { + union { + BufTableHeader hdr; + char pad[CACHELINEALIGN(sizeof(BufTableHeader))]; + }; + FreeListPadded freeList[NUM_BUFFER_PARTITIONS]; + HashBucket buckets[FLEXIBLE_ARRAY_MEMBER]; +} BufTable; + +static BufTable *bufLookupTable; + +#define NumBucketsFromSize(size) (1 << my_log2((size * 5) / 4)) + +#define FreeListIdxFromHash(hash) (hash % NUM_BUFFER_PARTITIONS) + +static inline FreeList * +BufTableGetFreelist(BufTable *table, uint32 idx) +{ + Assert(idx < NUM_BUFFER_PARTITIONS); + return &table->freeList[idx].list; +} + +static inline HashBucket * +BufTableGetBucket(BufTable *table, uint32 hash) { - BufferTag key; /* Tag of a disk page */ - int id; /* Associated buffer ID */ -} BufferLookupEnt; + int32 offset = (hash & table->hdr.mask); + + Assert(offset < table->hdr.numBuckets); + + return &table->buckets[offset]; +} -static HTAB *SharedBufHash; +static inline HashElement * +BufTableGetElement(BufTable *table, int32 offset) +{ + Assert(offset >= 0 && offset <= table->hdr.numBuckets); + return &table->buckets[offset].element; +} +static inline HashFreeListEntry * +BufTableGetFreeEntry(BufTable *table, int32 offset) +{ + Assert(offset >= 0 && offset <= table->hdr.numBuckets); + return &table->buckets[offset].freeEntry; +} /* * Estimate space needed for mapping hashtable @@ -40,7 +140,53 @@ static HTAB *SharedBufHash; Size BufTableShmemSize(int size) { - return hash_estimate_size(size, sizeof(BufferLookupEnt)); + uint64 nbuckets = NumBucketsFromSize(size); + Size alloc = 0; + + alloc = add_size(alloc, PG_CACHE_LINE_SIZE); + alloc = add_size(alloc, offsetof(BufTable, buckets)); + alloc = add_size(alloc, mul_size(nbuckets, sizeof(HashBucket))); + + return alloc; +} + + +static inline void +add_to_freelist(BufTable *table, HashFreeListEntry *freeEntry, uint32 index) +{ + uint32 freeListIdx; + FreeList *freeList; + int32 next; + int32 freeListTag; + Assert(freeEntry == BufTableGetFreeEntry(table, index)); + + freeListIdx = FreeListIdxFromHash(index); + freeList = BufTableGetFreelist(bufLookupTable, freeListIdx); + freeListTag = (-1 - (int32) freeListIdx); + freeEntry->next = freeListTag; + freeEntry->prev = freeListTag; + + S_LOCK(&freeList->mutex); + next = freeList->firstFree; + if (next < 0) + { + pg_atomic_write_u32(&freeEntry->tag, (uint32) freeListTag); + freeList->firstFree = (int32) index; + freeList->lastFree = (int32) index; + } + else + { + HashFreeListEntry *prevEntry; + + prevEntry = BufTableGetFreeEntry(table, next); + Assert(pg_atomic_read_u32(&prevEntry->tag) == (uint32) freeListTag); + + pg_atomic_write_u32(&freeEntry->tag, (uint32) freeListTag); + + freeEntry->prev = freeList->firstFree; + freeList->firstFree = prevEntry->next = (int32) index; + } + S_UNLOCK(&freeList->mutex); } /* @@ -50,21 +196,132 @@ BufTableShmemSize(int size) void InitBufTable(int size) { - HASHCTL info; + void *baseptr; + bool found = false; + uint32 nbuckets = NumBucketsFromSize(size); + + baseptr = ShmemInitStruct("Shared Buffer Lookup Table", + BufTableShmemSize(size), + &found); + + /* + * Runs only in postmaster during startup; there is no concurrent process + * that can or is allowed to initialize this. + */ + Assert(!found); + + if (found) + return; + + Assert(PointerIsAligned(baseptr, void *)); + bufLookupTable = (void *) CACHELINEALIGN((char *) baseptr); + + /* nbuckets is always a power of 2, so mask is always nbuckets-1 */ + bufLookupTable->hdr.numBuckets = nbuckets; + bufLookupTable->hdr.mask = nbuckets - 1; + + for (int i = 0; i < NUM_BUFFER_PARTITIONS; i++) + { + FreeList *list = &bufLookupTable->freeList[i].list; + + S_INIT_LOCK(&list->mutex); + list->myTag = (-1 - i); + + list->firstFree = list->myTag; + list->lastFree = list->myTag; + } + + /* build per-partition freelists */ + for (int i = 0; i < nbuckets; i++) + { + HashBucket *bucket = BufTableGetBucket(bufLookupTable, i); + bucket->bucketStartElement = -1; + add_to_freelist(bufLookupTable, &bucket->freeEntry, i); + } +} + +/* Convert a hash value to a bucket number */ +static inline int32 +calc_bucket(BufTable *table, uint32 hash_val) +{ + uint32 bucket; + + bucket = hash_val & table->hdr.mask; + + return (int32) bucket; +} + +static inline HashElement * +initial_lookup(BufTable *table, uint32 hashvalue, HashBucket **hashBucket, int32 *idx) +{ + HashBucket *bucket; + int32 bucketno; + int32 startElementOffset; + + bucketno = calc_bucket(table, hashvalue); + + bucket = BufTableGetBucket(bufLookupTable, bucketno); + startElementOffset = bucket->bucketStartElement; + *idx = startElementOffset; + *hashBucket = bucket; + + if (startElementOffset < 0) + return NULL; + + if (likely(startElementOffset == bucketno)) + return &bucket->element; + + return BufTableGetElement(table, startElementOffset); +} + +/* + * Get a new free element from a freelist. + * + * NOTE: This assumes there are free elements available. If that assumption + * does not hold this code will loop infinitely. + */ +static HashElement * +buftable_get_free_element(BufTable *table, uint32 hashValue, int32 *pInt) +{ + FreeList *freeList; + HashFreeListEntry *entry; + + do { + for (;; hashValue++) + { + freeList = BufTableGetFreelist(table, FreeListIdxFromHash(hashValue)); + + S_LOCK(&freeList->mutex); + if (freeList->firstFree != freeList->myTag) + break; + S_UNLOCK(&freeList->mutex); + } + + entry = BufTableGetFreeEntry(table, freeList->firstFree); + *pInt = freeList->firstFree; + } while (false); + + Assert(entry->next == freeList->myTag); - /* assume no locking is needed yet */ + /* unlink the entry from the linked list*/ + if (entry->prev != freeList->myTag) + { + HashFreeListEntry *prevEntry = BufTableGetFreeEntry(table, entry->prev); + prevEntry->next = entry->next; + freeList->firstFree = entry->prev; + } + else + { + Assert(freeList->firstFree == freeList->lastFree); + Assert(entry->next == entry->prev); + } - /* BufferTag maps to Buffer */ - info.keysize = sizeof(BufferTag); - info.entrysize = sizeof(BufferLookupEnt); - info.num_partitions = NUM_BUFFER_PARTITIONS; + S_UNLOCK(&freeList->mutex); - SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table", - size, size, - &info, - HASH_ELEM | HASH_BLOBS | HASH_PARTITION); + return (HashElement *) entry; } + /* * BufTableHashCode * Compute the hash code associated with a BufferTag @@ -77,7 +334,7 @@ InitBufTable(int size) uint32 BufTableHashCode(BufferTag *tagPtr) { - return get_hash_value(SharedBufHash, tagPtr); + return hash_bytes((void *) tagPtr, sizeof(BufferTag)); } /* @@ -89,19 +346,40 @@ BufTableHashCode(BufferTag *tagPtr) int BufTableLookup(BufferTag *tagPtr, uint32 hashcode) { - BufferLookupEnt *result; + BufTable *table = bufLookupTable; + HashBucket *currBucket; + HashElement *matchElement; + int32 curIdx; + + /* + * Do the initial lookup + */ + matchElement = initial_lookup(table, hashcode, &currBucket, + &curIdx); + + /* follow the bucket's chain */ + while (matchElement != NULL) + { + if (matchElement->hashValue == hashcode && BufferTagsEqual( + tagPtr, + &GetBufferDescriptor(pg_atomic_read_u32(&matchElement->buffer))->tag + )) + break; - result = (BufferLookupEnt *) - hash_search_with_hash_value(SharedBufHash, - tagPtr, - hashcode, - HASH_FIND, - NULL); + if (matchElement->next < 0) + { + matchElement = NULL; + break; + } - if (!result) + curIdx = matchElement->next; + matchElement = BufTableGetElement(table, curIdx); + } + + if (!matchElement) return -1; - return result->id; + return (int) pg_atomic_read_u32(&matchElement->buffer); } /* @@ -117,23 +395,64 @@ BufTableLookup(BufferTag *tagPtr, uint32 hashcode) int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id) { - BufferLookupEnt *result; - bool found; + BufTable *table = bufLookupTable; + HashBucket *currBucket; + HashElement *matchElement; + int32 *prevNextPtr; + int32 matchElemIdx; Assert(buf_id >= 0); /* -1 is reserved for not-in-table */ Assert(tagPtr->blockNum != P_NEW); /* invalid tag */ - result = (BufferLookupEnt *) - hash_search_with_hash_value(SharedBufHash, - tagPtr, - hashcode, - HASH_ENTER, - &found); + /* + * Do the initial lookup + */ + matchElement = initial_lookup(table, hashcode, &currBucket, + &matchElemIdx); + prevNextPtr = &currBucket->bucketStartElement; + + while (matchElement != NULL) + { + /* + * Make sure it's a valid element in the chain. We have a lock on + * that chain, so any breakage is a broken invariant. + */ + Assert(0 <= (int32) pg_atomic_read_u32(&matchElement->buffer)); + + if (matchElement->hashValue == hashcode && BufferTagsEqual( + tagPtr, + &GetBufferDescriptor(pg_atomic_read_u32(&matchElement->buffer))->tag + )) + break; - if (found) /* found something already in the table */ - return result->id; + prevNextPtr = &matchElement->next; - result->id = buf_id; + if (matchElement->next < 0) + { + Assert(matchElement->next == -1); + matchElement = NULL; + break; + } + + matchElemIdx = matchElement->next; + matchElement = BufTableGetElement(table, matchElemIdx); + } + + /* Return existing buffer ID if we found a pre-existing buffer */ + if (matchElement != NULL) + return (int) pg_atomic_read_u32(&matchElement->buffer); + + /* else, create a new one */ + matchElement = buftable_get_free_element(table, hashcode, &matchElemIdx); + + /* link into hashbucket chain */ + *prevNextPtr = matchElemIdx; + + /* copy data into the element */ + matchElement->next = -1; /* it's the tail of the bucket */ + matchElement->hashValue = hashcode; + + pg_atomic_write_u32(&matchElement->buffer, (uint32) ((int32) buf_id)); return -1; } @@ -145,17 +464,61 @@ BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id) * Caller must hold exclusive lock on BufMappingLock for tag's partition */ void -BufTableDelete(BufferTag *tagPtr, uint32 hashcode) +BufTableDelete(BufferTag *tagPtr, uint32 hashcode, int buffer) { - BufferLookupEnt *result; + BufTable *table = bufLookupTable; + HashBucket *currBucket; + HashElement *matchElement; + int32 *prevNextPtr; + int32 matchElemIdx; + + /* + * Do the initial lookup + */ + matchElement = initial_lookup(table, hashcode, &currBucket, + &matchElemIdx); + prevNextPtr = &currBucket->bucketStartElement; + + while (matchElement != NULL) + { + if (matchElement->hashValue == hashcode && + pg_atomic_read_u32(&matchElement->buffer) == buffer) + break; + + prevNextPtr = &matchElement->next; + + if (matchElement->next < 0) + { + matchElement = NULL; + break; + } + + matchElemIdx = matchElement->next; + matchElement = BufTableGetElement(table, matchElemIdx); + } + + if (matchElement != NULL) + { + /* + * We can't test the buffer tag, because that may have been cleared + * by the caller by now. + */ + Assert(0 <= (int32) pg_atomic_read_u32(&matchElement->buffer)); - result = (BufferLookupEnt *) - hash_search_with_hash_value(SharedBufHash, - tagPtr, - hashcode, - HASH_REMOVE, - NULL); + /* unlink the element from the singly-linked list */ + *prevNextPtr = matchElement->next; - if (!result) /* shouldn't happen */ - elog(ERROR, "shared buffer hash table corrupted"); + /* + * Remove record from hash bucket's chain. + * + * Note that bucket chains are singly linked lists, but freelists + * doubly linked lists. + */ + add_to_freelist(bufLookupTable, (HashFreeListEntry *) matchElement, + matchElemIdx); + } + else + { + elog(PANIC, "shared buffer hash table corrupted"); + } } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index ee83669992b..362e2d2f134 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1856,7 +1856,7 @@ retry: * Remove the buffer from the lookup hashtable, if it was in there. */ if (oldFlags & BM_TAG_VALID) - BufTableDelete(&oldTag, oldHash); + BufTableDelete(&oldTag, oldHash, buf->buf_id); /* * Done with mapping lock. @@ -1935,7 +1935,7 @@ InvalidateVictimBuffer(BufferDesc *buf_hdr) Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); /* finally delete buffer from the buffer mapping table */ - BufTableDelete(&tag, hash); + BufTableDelete(&tag, hash, buf_hdr->buf_id); LWLockRelease(partition_lock); diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 1a65342177d..5739bc44c00 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -439,7 +439,7 @@ extern void InitBufTable(int size); extern uint32 BufTableHashCode(BufferTag *tagPtr); extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode); extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id); -extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode); +extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode, int buffer); /* localbuf.c */ extern bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount); -- 2.46.0