From c4c72f2eac60ace32d0c6084625f1ac3cfd949b8 Mon Sep 17 00:00:00 2001 From: nkey Date: Fri, 7 Feb 2025 21:46:01 +0100 Subject: [PATCH v8 4/4] Fix index-only scan race condition in SP-GiST implementation Apply the buffer management improvements previously made to GiST index-only scans to SP-GiST as well. Author: Matthias van de Meent Michail Nikolaev Discussion: https://www.postgresql.org/message-id/flat/CANtu0oi0rkR%2BFsgyLXnGZ-uW2950-urApAWLhy-%2BV1WJD%3D_ZXA%40mail.gmail.com --- src/backend/access/spgist/spgscan.c | 108 ++++++++++++++++++++++---- src/backend/access/spgist/spgvacuum.c | 2 +- src/include/access/spgist_private.h | 5 ++ 3 files changed, 99 insertions(+), 16 deletions(-) diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c index 53f910e9d89..72ea14dfe2c 100644 --- a/src/backend/access/spgist/spgscan.c +++ b/src/backend/access/spgist/spgscan.c @@ -30,7 +30,8 @@ typedef void (*storeRes_func) (SpGistScanOpaque so, ItemPointer heapPtr, Datum leafValue, bool isNull, SpGistLeafTuple leafTuple, bool recheck, - bool recheckDistances, double *distances); + bool recheckDistances, double *distances, + Buffer pin); /* * Pairing heap comparison function for the SpGistSearchItem queue. @@ -95,6 +96,11 @@ spgFreeSearchItem(SpGistScanOpaque so, SpGistSearchItem *item) if (item->traversalValue) pfree(item->traversalValue); + if (so->want_itup && item->isLeaf) + { + Assert(BufferIsValid(item->buffer)); + ReleaseBuffer(item->buffer); + } pfree(item); } @@ -142,6 +148,7 @@ spgAddStartItem(SpGistScanOpaque so, bool isnull) startEntry->traversalValue = NULL; startEntry->recheck = false; startEntry->recheckDistances = false; + startEntry->buffer = InvalidBuffer; spgAddSearchItemToQueue(so, startEntry); } @@ -416,6 +423,9 @@ spgrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, /* preprocess scankeys, set up the representation in *so */ spgPrepareScanKeys(scan); + /* release any pinned buffers from earlier rescans */ + spgScanEndDropAllPagePins(scan, so); + /* set up starting queue entries */ resetSpGistScanOpaque(so); @@ -428,6 +438,12 @@ spgendscan(IndexScanDesc scan) { SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; + /* + * release any pinned buffers from earlier rescans, before we drop their + * data by dropping the memory contexts. + */ + spgScanEndDropAllPagePins(scan, so); + MemoryContextDelete(so->tempCxt); MemoryContextDelete(so->traversalCxt); @@ -460,7 +476,7 @@ spgendscan(IndexScanDesc scan) static SpGistSearchItem * spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple, Datum leafValue, bool recheck, bool recheckDistances, - bool isnull, double *distances) + bool isnull, double *distances, Buffer addPin) { SpGistSearchItem *item = spgAllocSearchItem(so, isnull, distances); @@ -479,6 +495,10 @@ spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple, datumCopy(leafValue, so->state.attType.attbyval, so->state.attType.attlen); + Assert(BufferIsValid(addPin)); + IncrBufferRefCount(addPin); + item->buffer = addPin; + /* * If we're going to need to reconstruct INCLUDE attributes, store the * whole leaf tuple so we can get the INCLUDE attributes out of it. @@ -495,6 +515,7 @@ spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple, { item->value = (Datum) 0; item->leafTuple = NULL; + item->buffer = InvalidBuffer; } item->traversalValue = NULL; item->isLeaf = true; @@ -513,7 +534,7 @@ spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple, static bool spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item, SpGistLeafTuple leafTuple, bool isnull, - bool *reportedSome, storeRes_func storeRes) + bool *reportedSome, storeRes_func storeRes, Buffer buffer) { Datum leafValue; double *distances; @@ -580,7 +601,8 @@ spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item, recheck, recheckDistances, isnull, - distances); + distances, + buffer); spgAddSearchItemToQueue(so, heapItem); @@ -591,7 +613,7 @@ spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item, /* non-ordered scan, so report the item right away */ Assert(!recheckDistances); storeRes(so, &leafTuple->heapPtr, leafValue, isnull, - leafTuple, recheck, false, NULL); + leafTuple, recheck, false, NULL, buffer); *reportedSome = true; } } @@ -750,6 +772,35 @@ spgGetNextQueueItem(SpGistScanOpaque so) return (SpGistSearchItem *) pairingheap_remove_first(so->scanQueue); } +/* + * Note: This removes all items from the pairingheap. + */ +void +spgScanEndDropAllPagePins(IndexScanDesc scan, SpGistScanOpaque so) +{ + /* Guaranteed no pinned pages */ + if (so->scanQueue == NULL || !scan->xs_want_itup) + return; + + if (BufferIsValid(so->pagePin)) + { + ReleaseBuffer(so->pagePin); + so->pagePin = InvalidBuffer; + } + + while (!pairingheap_is_empty(so->scanQueue)) + { + SpGistSearchItem *item; + + item = spgGetNextQueueItem(so); + if (!item->isLeaf) + continue; + + Assert(BufferIsValid(item->buffer)); + spgFreeSearchItem(so, item); + } +} + enum SpGistSpecialOffsetNumbers { SpGistBreakOffsetNumber = InvalidOffsetNumber, @@ -761,6 +812,7 @@ static OffsetNumber spgTestLeafTuple(SpGistScanOpaque so, SpGistSearchItem *item, Page page, OffsetNumber offset, + Buffer buffer, bool isnull, bool isroot, bool *reportedSome, storeRes_func storeRes) @@ -799,7 +851,7 @@ spgTestLeafTuple(SpGistScanOpaque so, Assert(ItemPointerIsValid(&leafTuple->heapPtr)); - spgLeafTest(so, item, leafTuple, isnull, reportedSome, storeRes); + spgLeafTest(so, item, leafTuple, isnull, reportedSome, storeRes, buffer); return SGLT_GET_NEXTOFFSET(leafTuple); } @@ -835,7 +887,8 @@ redirect: Assert(so->numberOfNonNullOrderBys > 0); storeRes(so, &item->heapPtr, item->value, item->isNull, item->leafTuple, item->recheck, - item->recheckDistances, item->distances); + item->recheckDistances, item->distances, + item->buffer); reportedSome = true; } else @@ -873,7 +926,7 @@ redirect: /* When root is a leaf, examine all its tuples */ for (offset = FirstOffsetNumber; offset <= max; offset++) (void) spgTestLeafTuple(so, item, page, offset, - isnull, true, + buffer, isnull, true, &reportedSome, storeRes); } else @@ -883,7 +936,7 @@ redirect: { Assert(offset >= FirstOffsetNumber && offset <= max); offset = spgTestLeafTuple(so, item, page, offset, - isnull, false, + buffer, isnull, false, &reportedSome, storeRes); if (offset == SpGistRedirectOffsetNumber) goto redirect; @@ -929,9 +982,9 @@ static void storeBitmap(SpGistScanOpaque so, ItemPointer heapPtr, Datum leafValue, bool isnull, SpGistLeafTuple leafTuple, bool recheck, - bool recheckDistances, double *distances) + bool recheckDistances, double *distances, + Buffer pin) { - Assert(!recheckDistances && !distances); tbm_add_tuples(so->tbm, heapPtr, 1, recheck); so->ntids++; } @@ -954,10 +1007,9 @@ spggetbitmap(IndexScanDesc scan, TIDBitmap *tbm) /* storeRes subroutine for gettuple case */ static void -storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr, - Datum leafValue, bool isnull, - SpGistLeafTuple leafTuple, bool recheck, - bool recheckDistances, double *nonNullDistances) +storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr, Datum leafValue, + bool isnull, SpGistLeafTuple leafTuple, bool recheck, + bool recheckDistances, double *nonNullDistances, Buffer pin) { Assert(so->nPtrs < MaxIndexTuplesPerPage); so->heapPtrs[so->nPtrs] = *heapPtr; @@ -1016,6 +1068,25 @@ storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr, so->reconTups[so->nPtrs] = heap_form_tuple(so->reconTupDesc, leafDatums, leafIsnulls); + + /* + * IOS: Make sure we have one additional pin on the buffer + * from the tuple we are going to return. + * + * In the case buffer is changing - unpin previous buffer. + * + * We may switch buffers almost randomly in case of ordered + * scan - but in such case each item in queue holding its own + * pin. + */ + if (so->pagePin != pin) + { + if (BufferIsValid(so->pagePin)) + ReleaseBuffer(so->pagePin); + so->pagePin = pin; + if (BufferIsValid(so->pagePin)) + IncrBufferRefCount(so->pagePin); + } } so->nPtrs++; } @@ -1065,6 +1136,13 @@ spggettuple(IndexScanDesc scan, ScanDirection dir) for (i = 0; i < so->nPtrs; i++) pfree(so->reconTups[i]); + + /* Unpin page of last returned tuple if any */ + if (BufferIsValid(so->pagePin)) + { + ReleaseBuffer(so->pagePin); + so->pagePin = InvalidBuffer; + } } so->iPtr = so->nPtrs = 0; diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index 894aefa19e1..f02c270c5cc 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -629,7 +629,7 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno) buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, bds->info->strategy); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + LockBufferForCleanup(buffer); page = (Page) BufferGetPage(buffer); if (PageIsNew(page)) diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h index cb43a278f46..c886e51e996 100644 --- a/src/include/access/spgist_private.h +++ b/src/include/access/spgist_private.h @@ -175,6 +175,7 @@ typedef struct SpGistSearchItem bool isLeaf; /* SearchItem is heap item */ bool recheck; /* qual recheck is needed */ bool recheckDistances; /* distance recheck is needed */ + Buffer buffer; /* buffer pinned for this leaf tuple (IOS-only) */ /* array with numberOfOrderBys entries */ double distances[FLEXIBLE_ARRAY_MEMBER]; @@ -226,6 +227,7 @@ typedef struct SpGistScanOpaqueData TupleDesc reconTupDesc; /* if so, descriptor for reconstructed tuples */ int nPtrs; /* number of TIDs found on current page */ int iPtr; /* index for scanning through same */ + Buffer pagePin; /* output tuple's pinned buffer, if IOS */ ItemPointerData heapPtrs[MaxIndexTuplesPerPage]; /* TIDs from cur page */ bool recheck[MaxIndexTuplesPerPage]; /* their recheck flags */ bool recheckDistances[MaxIndexTuplesPerPage]; /* distance recheck @@ -488,6 +490,9 @@ typedef SpGistDeadTupleData *SpGistDeadTuple; #define GBUF_REQ_LEAF(flags) (((flags) & GBUF_PARITY_MASK) == GBUF_LEAF) #define GBUF_REQ_NULLS(flags) ((flags) & GBUF_NULLS) +/* spgscan.c */ +void spgScanEndDropAllPagePins(IndexScanDesc scan, SpGistScanOpaque so); + /* spgutils.c */ /* reloption parameters */ -- 2.43.0