From 86ca31af76acd71fce3fd36ddf25f63d6d699b77 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Sat, 4 Jan 2025 01:02:15 +0100 Subject: [PATCH v3 2/2] RFC: Extend buffer pin lifetime for GIST IOS This should fix issues with incorrect results when IOS encounters concurrent vacuum. --- src/include/access/gist_private.h | 16 ++++ src/backend/access/gist/gistget.c | 116 ++++++++++++++++++++++++++- src/backend/access/gist/gistscan.c | 60 ++++++++++++++ src/backend/access/gist/gistvacuum.c | 4 +- 4 files changed, 192 insertions(+), 4 deletions(-) diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 7b8749c8db0..cf5fd4336c7 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -17,6 +17,7 @@ #include "access/amapi.h" #include "access/gist.h" #include "access/itup.h" +#include "lib/ilist.h" #include "lib/pairingheap.h" #include "storage/bufmgr.h" #include "storage/buffile.h" @@ -124,6 +125,7 @@ typedef struct GISTSearchHeapItem * index-only scans */ OffsetNumber offnum; /* track offset in page to mark tuple as * LP_DEAD */ + uint32 pagepinOffset; /* Pinned page's offset into GISTScanOpaqueData.pinned */ } GISTSearchHeapItem; /* Unvisited item, either index page or heap tuple */ @@ -148,6 +150,13 @@ typedef struct GISTSearchItem (offsetof(GISTSearchItem, distances) + \ sizeof(IndexOrderByDistance) * (n_distances)) +typedef struct GISTPagePin +{ + slist_node nextFree; + Buffer buffer; /* pinned buffer */ + int count; /* number of results not yet returned */ +} GISTPagePin; + /* * GISTScanOpaqueData: private state for a scan of a GiST index */ @@ -176,6 +185,12 @@ typedef struct GISTScanOpaqueData OffsetNumber curPageData; /* next item to return */ MemoryContext pageDataCxt; /* context holding the fetched tuples, for * index-only scans */ + + GISTPagePin *pinned; /* page pins, used in index-only scans. + * otherwise NULL */ + slist_head nextFreePin; /* next free page pin, if available */ + BlockNumber pincapacity; /* current max pin count in pinned */ + uint32 releasePinOffset; /* reduce pin count on this buffer next */ } GISTScanOpaqueData; typedef GISTScanOpaqueData *GISTScanOpaque; @@ -463,6 +478,7 @@ extern XLogRecPtr gistXLogAssignLSN(void); extern bool gistgettuple(IndexScanDesc scan, ScanDirection dir); extern int64 gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); extern bool gistcanreturn(Relation index, int attno); +extern uint32 gistgetpin(GISTScanOpaque opaque); /* gistvalidate.c */ extern bool gistvalidate(Oid opclassoid); diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c index b35b8a97577..779478a44b2 100644 --- a/src/backend/access/gist/gistget.c +++ b/src/backend/access/gist/gistget.c @@ -337,6 +337,8 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, OffsetNumber maxoff; OffsetNumber i; MemoryContext oldcxt; + GISTPagePin *pin = NULL; + uint32 pagepinoff; Assert(!GISTSearchItemIsHeap(*pageItem)); @@ -471,6 +473,16 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, so->pageData[so->nPageData].recontup = gistFetchTuple(giststate, r, it); MemoryContextSwitchTo(oldcxt); + + if (pin == NULL) + { + pagepinoff = gistgetpin(so); + pin = &so->pinned[pagepinoff]; + pin->buffer = buffer; + pin->count = 0; + } + so->pageData[so->nPageData].pagepinOffset = pagepinoff; + pin->count++; } so->nPageData++; } @@ -501,7 +513,19 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, * In an index-only scan, also fetch the data from the tuple. */ if (scan->xs_want_itup) + { item->data.heap.recontup = gistFetchTuple(giststate, r, it); + + if (!PointerIsValid(pin)) + { + pagepinoff = gistgetpin(so); + pin = &so->pinned[pagepinoff]; + pin->buffer = buffer; + pin->count = 0; + } + pin->count++; + item->data.heap.pagepinOffset = pagepinoff; + } } else { @@ -526,7 +550,67 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, } } - UnlockReleaseBuffer(buffer); + if (scan->xs_want_itup && GistPageIsLeaf(page) && PointerIsValid(pin)) + { + Assert(pin->count > 0); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + /* Pin to be released when scan pin->count reaches 0 */ + } + else + UnlockReleaseBuffer(buffer); +} + +uint32 +gistgetpin(GISTScanOpaque opaque) +{ + slist_node *node; + GISTPagePin *nextFree; + + if (opaque->pincapacity == 0) + { + opaque->pincapacity = 1; + opaque->pinned = palloc0(sizeof(GISTPagePin)); + + return 0; + } + + if (slist_is_empty(&opaque->nextFreePin)) + { + BlockNumber firstEntry = opaque->pincapacity; + + /* + * We don't have any entries in the linked list, so repalloc is used + * safely here. (If there were entries left, they'd be stale + * references after this.) + * + * Note that before overflowing uint32 we'll always first fail the + * repalloc(), which has a limit of ~1GiB - much less than the 16GiB + * that we'd have to consume before we'd overflow pincapacity. + */ + opaque->pincapacity *= 2; + + opaque->pinned = repalloc(opaque->pinned, + mul_size(sizeof(GISTPagePin), + opaque->pincapacity)); + + /* + * Fill nextFreePin list back-to-front, to push higher indexes into + * the back of the slist. + */ + for (BlockNumber i = opaque->pincapacity - 1; i >= firstEntry; i--) + { + slist_push_head(&opaque->nextFreePin, + &opaque->pinned[i].nextFree); + opaque->pinned[i].buffer = InvalidBuffer; + opaque->pinned[i].count = 0; + } + } + + node = slist_pop_head_node(&opaque->nextFreePin); + nextFree = slist_container(GISTPagePin, nextFree, node); + nextFree->count = 0; + nextFree->buffer = 0; + return (uint32) (nextFree - opaque->pinned); } /* @@ -588,7 +672,10 @@ getNextNearest(IndexScanDesc scan) /* in an index-only scan, also return the reconstructed tuple. */ if (scan->xs_want_itup) + { scan->xs_hitup = item->data.heap.recontup; + so->releasePinOffset = item->data.heap.pagepinOffset; + } res = true; } else @@ -637,6 +724,28 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) gistScanPage(scan, &fakeItem, NULL, NULL, NULL); } + if (so->releasePinOffset != UINT32_MAX) + { + GISTPagePin *pin; + + Assert(so->pincapacity > so->releasePinOffset); + + pin = &so->pinned[so->releasePinOffset]; + + Assert(pin->count > 0); + Assert(BufferIsValid(pin->buffer)); + + pin->count--; + + if (pin->count == 0) + { + ReleaseBuffer(pin->buffer); + pin->buffer = InvalidBuffer; + pin->count = 0; + slist_push_head(&so->nextFreePin, &pin->nextFree); + } + } + if (scan->numberOfOrderBys > 0) { /* Must fetch tuples in strict distance order */ @@ -651,7 +760,6 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) { if (scan->kill_prior_tuple && so->curPageData > 0) { - if (so->killedItems == NULL) { MemoryContext oldCxt = @@ -673,7 +781,11 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) /* in an index-only scan, also return the reconstructed tuple */ if (scan->xs_want_itup) + { scan->xs_hitup = so->pageData[so->curPageData].recontup; + so->releasePinOffset = + so->pageData[so->curPageData].pagepinOffset; + } so->curPageData++; diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c index de472e16373..738fd0d4ace 100644 --- a/src/backend/access/gist/gistscan.c +++ b/src/backend/access/gist/gistscan.c @@ -111,6 +111,11 @@ gistbeginscan(Relation r, int nkeys, int norderbys) so->curBlkno = InvalidBlockNumber; so->curPageLSN = InvalidXLogRecPtr; + so->pincapacity = 0; + so->pinned = NULL; + slist_init(&so->nextFreePin); + so->releasePinOffset = UINT32_MAX; + scan->opaque = so; /* @@ -209,6 +214,34 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, ALLOCSET_DEFAULT_SIZES); } + if (PointerIsValid(so->pinned)) + { + Assert(scan->xs_want_itup); + + slist_init(&so->nextFreePin); + + /* + * Fill nextFreePin list back-to-front, to push higher indexes into + * the back of the slist. + */ + for (int p = so->pincapacity - 1; p >= 0; p--) + { + GISTPagePin *pin = &so->pinned[p]; + + if (BufferIsValid(pin->buffer)) + { + Assert(pin->count > 0); + ReleaseBuffer(pin->buffer); + pin->buffer = InvalidBuffer; + pin->count = 0; + } + + slist_push_head(&so->nextFreePin, &pin->nextFree); + } + + so->releasePinOffset = UINT32_MAX; + } + /* create new, empty pairing heap for search queue */ oldCxt = MemoryContextSwitchTo(so->queueCxt); so->queue = pairingheap_allocate(pairingheap_GISTSearchItem_cmp, scan); @@ -348,6 +381,33 @@ gistendscan(IndexScanDesc scan) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + if (PointerIsValid(so->pinned)) + { + Assert(scan->xs_want_itup); + slist_init(&so->nextFreePin); + + /* + * Fill nextFreePin list back-to-front, to push higher indexes into + * the back of the slist. + */ + for (int p = so->pincapacity - 1; p >= 0; p--) + { + GISTPagePin *pin = &so->pinned[p]; + + if (BufferIsValid(pin->buffer)) + { + Assert(pin->count > 0); + ReleaseBuffer(pin->buffer); + pin->buffer = InvalidBuffer; + pin->count = 0; + } + + slist_push_head(&so->nextFreePin, &pin->nextFree); + } + + so->releasePinOffset = UINT32_MAX; + } + /* * freeGISTstate is enough to clean up everything made by gistbeginscan, * as well as the queueCxt if there is a separate context for it. diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index 24fb94f473e..b0f0401a8ae 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -290,9 +290,9 @@ restart: /* * We are not going to stay here for a long time, aggressively grab an - * exclusive lock. + * exclusive lock for cleanup. */ - LockBuffer(buffer, GIST_EXCLUSIVE); + LockBufferForCleanup(buffer); page = (Page) BufferGetPage(buffer); if (gistPageRecyclable(page)) -- 2.45.2