From 6211b0e943ec317c163212f856cf4acaf638cd5d Mon Sep 17 00:00:00 2001 From: nkey Date: Fri, 7 Feb 2025 21:41:50 +0100 Subject: [PATCH v8 2/4] Fix index-only scan race condition in GiST implementation Prevent incorrect results in index-only scans caused by concurrent VACUUM. 1. Use LockBufferForCleanup instead of GIST_EXCLUSIVE in gistvacuum.c for more consistent naming 2. Add buffer pinning to ensure index pages remain pinned according to correct locking protocol 3. Update documentation in gist/README to explain the new locking scheme and note potential buffer exhaustion concerns with ordered scans. Author: Matthias van de Meent , Michail Nikolaev Discussion: https://www.postgresql.org/message-id/flat/CANtu0oi0rkR%2BFsgyLXnGZ-uW2950-urApAWLhy-%2BV1WJD%3D_ZXA%40mail.gmail.com --- src/backend/access/gist/README | 16 ++++ src/backend/access/gist/gistget.c | 32 ++++++++ src/backend/access/gist/gistscan.c | 115 ++++++++++++++++++++++++--- src/backend/access/gist/gistvacuum.c | 6 +- src/include/access/gist_private.h | 2 + 5 files changed, 158 insertions(+), 13 deletions(-) diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README index 8015ff19f05..c7c2afad088 100644 --- a/src/backend/access/gist/README +++ b/src/backend/access/gist/README @@ -287,6 +287,22 @@ would complicate the insertion algorithm. So when an insertion sees a page with F_FOLLOW_RIGHT set, it immediately tries to bring the split that crashed in the middle to completion by adding the downlink in the parent. +Index-only scans and VACUUM +--------------------------- + +Index-only scans require that any tuple returned by the index scan has not +been removed from the index by a call to ambulkdelete through VACUUM. +To ensure this invariant, bulkdelete now requires a buffer cleanup lock, and +every Index-only scan (IOS) will keep a pin on each page that it is returning +tuples from. For ordered scans, we keep one pin for each matching leaf tuple, +for unordered scans we just keep an additional pin while we're still working +on the page's tuples. This ensures that pages seen by the scan won't be +cleaned up until after the tuples have been returned. + +These longer pin lifetimes can cause buffer exhaustion with messages like "no +unpinned buffers available" when the index has many pages that have similar +ordering; but future work can figure out how to best work that out. + Buffering build algorithm ------------------------- diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c index cc40e928e0a..3788a855c50 100644 --- a/src/backend/access/gist/gistget.c +++ b/src/backend/access/gist/gistget.c @@ -395,6 +395,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, } so->nPageData = so->curPageData = 0; + Assert(so->pagePin == InvalidBuffer); scan->xs_hitup = NULL; /* might point into pageDataCxt */ if (so->pageDataCxt) MemoryContextReset(so->pageDataCxt); @@ -460,6 +461,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, so->pageData[so->nPageData].heapPtr = it->t_tid; so->pageData[so->nPageData].recheck = recheck; so->pageData[so->nPageData].offnum = i; + so->pageData[so->nPageData].buffer = InvalidBuffer; /* * In an index-only scan, also fetch the data from the tuple. The @@ -471,6 +473,16 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, so->pageData[so->nPageData].recontup = gistFetchTuple(giststate, r, it); MemoryContextSwitchTo(oldcxt); + + /* + * Only maintain a single additional buffer pin for unordered + * IOS scans; as we have all data already in one place. + */ + if (so->nPageData == 0) + { + so->pagePin = buffer; + IncrBufferRefCount(buffer); + } } so->nPageData++; } @@ -501,7 +513,11 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, * In an index-only scan, also fetch the data from the tuple. */ if (scan->xs_want_itup) + { item->data.heap.recontup = gistFetchTuple(giststate, r, it); + item->data.heap.buffer = buffer; + IncrBufferRefCount(buffer); + } } else { @@ -567,6 +583,10 @@ getNextNearest(IndexScanDesc scan) /* free previously returned tuple */ pfree(scan->xs_hitup); scan->xs_hitup = NULL; + + Assert(BufferIsValid(so->pagePin)); + ReleaseBuffer(so->pagePin); + so->pagePin = InvalidBuffer; } do @@ -588,7 +608,11 @@ getNextNearest(IndexScanDesc scan) /* in an index-only scan, also return the reconstructed tuple. */ if (scan->xs_want_itup) + { + Assert(BufferIsValid(item->data.heap.buffer)); scan->xs_hitup = item->data.heap.recontup; + so->pagePin = item->data.heap.buffer; + } res = true; } else @@ -704,6 +728,14 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) so->killedItems[so->numKilled++] = so->pageData[so->curPageData - 1].offnum; } + + if (scan->xs_want_itup && so->nPageData > 0) + { + Assert(BufferIsValid(so->pagePin)); + ReleaseBuffer(so->pagePin); + so->pagePin = InvalidBuffer; + } + /* find and process the next index page */ do { diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c index 700fa959d03..932c2271510 100644 --- a/src/backend/access/gist/gistscan.c +++ b/src/backend/access/gist/gistscan.c @@ -110,6 +110,7 @@ gistbeginscan(Relation r, int nkeys, int norderbys) so->numKilled = 0; so->curBlkno = InvalidBlockNumber; so->curPageLSN = InvalidXLogRecPtr; + so->pagePin = InvalidBuffer; scan->opaque = so; @@ -151,18 +152,73 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, Assert(so->queueCxt == so->giststate->scanCxt); first_time = true; } - else if (so->queueCxt == so->giststate->scanCxt) - { - /* second time through */ - so->queueCxt = AllocSetContextCreate(so->giststate->scanCxt, - "GiST queue context", - ALLOCSET_DEFAULT_SIZES); - first_time = false; - } else { - /* third or later time through */ - MemoryContextReset(so->queueCxt); + /* + * In the first scan of a query we allocate IOS items in the scan + * context, which is never reset. To not leak this memory, we + * manually free the queue entries. + */ + const bool freequeue = so->queueCxt == so->giststate->scanCxt; + /* + * Index-only scans require that vacuum can't clean up entries that + * we're still planning to return, so we hold a pin on the buffer until + * we're past the returned item (1 pin count for every index tuple). + * When rescan is called, however, we need to clean up the pins that + * we still hold, lest we leak them and lose a buffer entry to that + * page. + */ + const bool unpinqueue = scan->xs_want_itup; + + if (freequeue || unpinqueue) + { + while (!pairingheap_is_empty(so->queue)) + { + pairingheap_node *node; + GISTSearchItem *item; + + node = pairingheap_remove_first(so->queue); + item = pairingheap_container(GISTSearchItem, phNode, node); + + /* + * If we need to unpin a buffer for IOS' heap items, do so + * now. + */ + if (unpinqueue && item->blkno == InvalidBlockNumber) + { + Assert(BufferIsValid(item->data.heap.buffer)); + ReleaseBuffer(item->data.heap.buffer); + } + + /* + * item->data.heap.recontup is stored in the separate memory + * context so->pageDataCxt, which is always reset; so we don't + * need to free that. + * "item" itself is allocated into the queue context, which is + * generally reset in rescan. + * However, only in the first scan, we allocate these items + * into the main scan context, which isn't reset; so we must + * free these items, or else we'd leak the memory for the + * duration of the query. + */ + if (freequeue) + pfree(item); + } + } + + if (so->queueCxt == so->giststate->scanCxt) + { + /* second time through */ + so->queueCxt = AllocSetContextCreate(so->giststate->scanCxt, + "GiST queue context", + ALLOCSET_DEFAULT_SIZES); + } + else + { + /* third or later time through */ + MemoryContextReset(so->queueCxt); + } + first_time = false; } @@ -341,6 +397,15 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, /* any previous xs_hitup will have been pfree'd in context resets above */ scan->xs_hitup = NULL; + + if (scan->xs_want_itup) + { + if (BufferIsValid(so->pagePin)) + { + ReleaseBuffer(so->pagePin); + so->pagePin = InvalidBuffer; + } + } } void @@ -348,6 +413,36 @@ gistendscan(IndexScanDesc scan) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + if (scan->xs_want_itup) + { + if (BufferIsValid(so->pagePin)) + { + ReleaseBuffer(so->pagePin); + so->pagePin = InvalidBuffer; + } + + /* unpin any leftover buffers */ + while (!pairingheap_is_empty(so->queue)) + { + pairingheap_node *node; + GISTSearchItem *item; + + /* + * Note: unlike gistrescan, there is no need to actually free the + * items here, as that's handled by memory context reset in the + * call to freeGISTstate() below. + */ + node = pairingheap_remove_first(so->queue); + item = pairingheap_container(GISTSearchItem, phNode, node); + + if (item->blkno == InvalidBlockNumber) + { + Assert(BufferIsValid(item->data.heap.buffer)); + ReleaseBuffer(item->data.heap.buffer); + } + } + } + /* * freeGISTstate is enough to clean up everything made by gistbeginscan, * as well as the queueCxt if there is a separate context for it. diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index fe0bfb781ca..840b3d586ed 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -289,10 +289,10 @@ restart: info->strategy); /* - * We are not going to stay here for a long time, aggressively grab an - * exclusive lock. + * We are not going to stay here for a long time, aggressively grab a + * cleanup lock. */ - LockBuffer(buffer, GIST_EXCLUSIVE); + LockBufferForCleanup(buffer); page = (Page) BufferGetPage(buffer); if (gistPageRecyclable(page)) diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 39404ec7cdb..e559117e7d7 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -124,6 +124,7 @@ typedef struct GISTSearchHeapItem * index-only scans */ OffsetNumber offnum; /* track offset in page to mark tuple as * LP_DEAD */ + Buffer buffer; /* buffer to unpin, when IOS */ } GISTSearchHeapItem; /* Unvisited item, either index page or heap tuple */ @@ -176,6 +177,7 @@ typedef struct GISTScanOpaqueData OffsetNumber curPageData; /* next item to return */ MemoryContext pageDataCxt; /* context holding the fetched tuples, for * index-only scans */ + Buffer pagePin; /* buffer of page, if pinned */ } GISTScanOpaqueData; typedef GISTScanOpaqueData *GISTScanOpaque; -- 2.43.0