From d864b395f0c0012312d518c28787ae40dcdbe5b1 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Tue, 8 Oct 2024 11:40:54 -0400 Subject: [PATCH v4] Optimize and simplify nbtree backwards scans. --- src/include/access/nbtree.h | 11 +- src/backend/access/nbtree/nbtree.c | 40 ++- src/backend/access/nbtree/nbtsearch.c | 463 +++++++++++++------------- src/backend/access/nbtree/nbtutils.c | 2 +- 4 files changed, 267 insertions(+), 249 deletions(-) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index d64300fb9..c082bdf64 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -954,6 +954,7 @@ typedef struct BTScanPosData XLogRecPtr lsn; /* pos in the WAL stream when page was read */ BlockNumber currPage; /* page referenced by items array */ + BlockNumber prevPage; /* page's left link when we scanned it */ BlockNumber nextPage; /* page's right link when we scanned it */ /* @@ -1022,6 +1023,7 @@ typedef BTScanPosData *BTScanPos; #define BTScanPosInvalidate(scanpos) \ do { \ (scanpos).currPage = InvalidBlockNumber; \ + (scanpos).prevPage = InvalidBlockNumber; \ (scanpos).nextPage = InvalidBlockNumber; \ (scanpos).buf = InvalidBuffer; \ (scanpos).lsn = InvalidXLogRecPtr; \ @@ -1090,7 +1092,6 @@ typedef struct BTReadPageState OffsetNumber minoff; /* Lowest non-pivot tuple's offset */ OffsetNumber maxoff; /* Highest non-pivot tuple's offset */ IndexTuple finaltup; /* Needed by scans with array keys */ - BlockNumber prev_scan_page; /* previous _bt_parallel_release block */ Page page; /* Page being read */ /* Per-tuple input parameters, set by _bt_readpage for _bt_checkkeys */ @@ -1192,11 +1193,13 @@ extern int btgettreeheight(Relation rel); * prototypes for internal functions in nbtree.c */ extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, - bool first); -extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page); + BlockNumber *currblkno, bool first); +extern void _bt_parallel_release(IndexScanDesc scan, + BlockNumber next_scan_page, + BlockNumber curr_page); extern void _bt_parallel_done(IndexScanDesc scan); extern void _bt_parallel_primscan_schedule(IndexScanDesc scan, - BlockNumber prev_scan_page); + BlockNumber curr_page); /* * prototypes for functions in nbtdedup.c diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 56e502c4f..b926b99ba 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -66,7 +66,9 @@ typedef enum */ typedef struct BTParallelScanDescData { - BlockNumber btps_scanPage; /* latest or next page to be scanned */ + BlockNumber btps_nextScanPage; /* next page to be scanned */ + BlockNumber btps_lastCurrPage; /* page whose sibling link was copied into + * btps_scanPage */ BTPS_State btps_pageStatus; /* indicates whether next page is * available for scan. see above for * possible states of parallel scan. */ @@ -548,7 +550,8 @@ btinitparallelscan(void *target) BTParallelScanDesc bt_target = (BTParallelScanDesc) target; SpinLockInit(&bt_target->btps_mutex); - bt_target->btps_scanPage = InvalidBlockNumber; + bt_target->btps_nextScanPage = InvalidBlockNumber; + bt_target->btps_lastCurrPage = InvalidBlockNumber; bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; ConditionVariableInit(&bt_target->btps_cv); } @@ -573,7 +576,8 @@ btparallelrescan(IndexScanDesc scan) * consistency. */ SpinLockAcquire(&btscan->btps_mutex); - btscan->btps_scanPage = InvalidBlockNumber; + btscan->btps_nextScanPage = InvalidBlockNumber; + btscan->btps_lastCurrPage = InvalidBlockNumber; btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; SpinLockRelease(&btscan->btps_mutex); } @@ -600,7 +604,8 @@ btparallelrescan(IndexScanDesc scan) * Callers should ignore the value of pageno if the return value is false. */ bool -_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first) +_bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, + BlockNumber *currblkno, bool first) { BTScanOpaque so = (BTScanOpaque) scan->opaque; bool exit_loop = false; @@ -609,6 +614,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first) BTParallelScanDesc btscan; *pageno = P_NONE; + *currblkno = InvalidBlockNumber; if (first) { @@ -684,7 +690,8 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first) * of advancing it to a new page! */ btscan->btps_pageStatus = BTPARALLEL_ADVANCING; - *pageno = btscan->btps_scanPage; + *pageno = btscan->btps_nextScanPage; + *currblkno = btscan->btps_lastCurrPage; exit_loop = true; } SpinLockRelease(&btscan->btps_mutex); @@ -699,17 +706,22 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first) /* * _bt_parallel_release() -- Complete the process of advancing the scan to a - * new page. We now have the new value btps_scanPage; some other backend + * new page. We now have the new value btps_nextScanPage; another backend * can now begin advancing the scan. * - * Callers whose scan uses array keys must save their scan_page argument so + * Callers whose scan uses array keys must save their curr_page argument so * that it can be passed to _bt_parallel_primscan_schedule, should caller * determine that another primitive index scan is required. If that happens, * scan_page won't be scanned by any backend (unless the next primitive index * scan lands on scan_page). + * + * Note: unlike the serial case, parallel scans don't need to remember both + * sibling links. Caller can just pass us whichever link is next for the + * current scan direction because it cannot change within a parallel scan. */ void -_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page) +_bt_parallel_release(IndexScanDesc scan, BlockNumber next_scan_page, + BlockNumber curr_page) { ParallelIndexScanDesc parallel_scan = scan->parallel_scan; BTParallelScanDesc btscan; @@ -718,7 +730,8 @@ _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page) parallel_scan->ps_offset); SpinLockAcquire(&btscan->btps_mutex); - btscan->btps_scanPage = scan_page; + btscan->btps_nextScanPage = next_scan_page; + btscan->btps_lastCurrPage = curr_page; btscan->btps_pageStatus = BTPARALLEL_IDLE; SpinLockRelease(&btscan->btps_mutex); ConditionVariableSignal(&btscan->btps_cv); @@ -774,13 +787,13 @@ _bt_parallel_done(IndexScanDesc scan) /* * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan. * - * Caller passes the block number most recently passed to _bt_parallel_release + * Caller passes the curr_page most recently passed to _bt_parallel_release * by its backend. Caller successfully schedules the next primitive index scan * if the shared parallel state hasn't been seized since caller's backend last * advanced the scan. */ void -_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page) +_bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber curr_page) { BTScanOpaque so = (BTScanOpaque) scan->opaque; ParallelIndexScanDesc parallel_scan = scan->parallel_scan; @@ -792,10 +805,11 @@ _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page) parallel_scan->ps_offset); SpinLockAcquire(&btscan->btps_mutex); - if (btscan->btps_scanPage == prev_scan_page && + if (btscan->btps_lastCurrPage == curr_page && btscan->btps_pageStatus == BTPARALLEL_IDLE) { - btscan->btps_scanPage = InvalidBlockNumber; + btscan->btps_nextScanPage = InvalidBlockNumber; + btscan->btps_lastCurrPage = InvalidBlockNumber; btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN; /* Serialize scan's current array keys */ diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index fff7c89ea..047c891d8 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -43,10 +43,12 @@ static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset); static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); -static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); -static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, - ScanDirection dir); -static Buffer _bt_walk_left(Relation rel, Buffer buf); +static bool _bt_readfirstpage(IndexScanDesc scan, ScanDirection dir, + OffsetNumber offnum); +static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, + BlockNumber lastcurrblkno, ScanDirection dir); +static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, + BlockNumber lastcurrblkno); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir); @@ -924,7 +926,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) */ if (scan->parallel_scan != NULL) { - status = _bt_parallel_seize(scan, &blkno, true); + BlockNumber lastcurrblkno; + + status = _bt_parallel_seize(scan, &blkno, &lastcurrblkno, true); /* * Initialize arrays (when _bt_parallel_seize didn't already set up @@ -942,7 +946,11 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } else if (blkno != InvalidBlockNumber) { - if (!_bt_parallel_readpage(scan, blkno, dir)) + Assert(!so->needPrimScan); + + _bt_initialize_more_data(so, dir); + + if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir)) return false; goto readcomplete; } @@ -1427,8 +1435,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } } - PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot); - _bt_initialize_more_data(so, dir); /* position to the precise item on the page */ @@ -1455,21 +1461,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * for the page. For example, when inskey is both < the leaf page's high * key and > all of its non-pivot tuples, offnum will be "maxoff + 1". */ - if (!_bt_readpage(scan, dir, offnum, true)) - { - /* - * There's no actually-matching data on this page. Try to advance to - * the next page. Return false if there's no matching data at all. - */ - _bt_unlockbuf(scan->indexRelation, so->currPos.buf); - if (!_bt_steppage(scan, dir)) - return false; - } - else - { - /* We have at least one item to return as scan's first item */ - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); - } + if (!_bt_readfirstpage(scan, dir, offnum)) + return false; readcomplete: /* OK, itemIndex says what to return */ @@ -1582,17 +1575,6 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, page = BufferGetPage(so->currPos.buf); opaque = BTPageGetOpaque(page); - /* allow next page be processed by parallel worker */ - if (scan->parallel_scan) - { - if (ScanDirectionIsForward(dir)) - pstate.prev_scan_page = opaque->btpo_next; - else - pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf); - - _bt_parallel_release(scan, pstate.prev_scan_page); - } - indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation); arrayKeys = so->numArrayKeys != 0; minoff = P_FIRSTDATAKEY(opaque); @@ -1626,10 +1608,10 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); /* - * we must save the page's right-link while scanning it; this tells us - * where to step right to after we're done with these items. There is no - * corresponding need for the left-link, since splits always go right. + * we must save the page's right-link and left-link; this tells us where + * to step right/left to after we're done with these items */ + so->currPos.prevPage = opaque->btpo_prev; so->currPos.nextPage = opaque->btpo_next; /* initialize tuple workspace to empty */ @@ -1640,6 +1622,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * good. */ Assert(BTScanPosIsPinned(so->currPos)); + Assert(!P_IGNORE(opaque)); + + /* allow next page to be processed by parallel worker */ + if (scan->parallel_scan) + { + if (ScanDirectionIsForward(dir)) + _bt_parallel_release(scan, so->currPos.nextPage, + so->currPos.currPage); + else + _bt_parallel_release(scan, so->currPos.prevPage, + so->currPos.currPage); + } /* * Prechecking the value of the continuescan flag for the last item on the @@ -1934,7 +1928,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, * We don't need to visit page to the left when no more matches will * be found there */ - if (!pstate.continuescan || P_LEFTMOST(opaque)) + if (!pstate.continuescan) so->currPos.moreLeft = false; Assert(itemIndex >= 0); @@ -2035,19 +2029,21 @@ _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, /* * _bt_steppage() -- Step to next page containing valid data for scan * - * On entry, if so->currPos.buf is valid the buffer is pinned but not locked; - * if pinned, we'll drop the pin before moving to next page. The buffer is - * not locked on entry. + * On entry, if so->currPos.buf is valid the buffer is pinned but not locked. + * This is a wrapper on _bt_readnextpage that performs final steps for the + * current page (including dropping its pin). It sets up the _bt_readnextpage + * call using either local state saved by the last _bt_readpage call, or using + * shared state consumed by seizing the parallel scan. * - * For success on a scan using a non-MVCC snapshot we hold a pin, but not a - * read lock, on that page. If we do not hold the pin, we set so->currPos.buf - * to InvalidBuffer. We return true to indicate success. + * Parallel scan callers that have already seized the scan should directly + * call _bt_readnextpage, rather than calling here. */ static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; BlockNumber blkno = InvalidBlockNumber; + BlockNumber lastcurrblkno; bool status; Assert(BTScanPosIsValid(so->currPos)); @@ -2098,85 +2094,139 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) else so->markPos.moreLeft = true; } + + /* mark/restore not supported by parallel scans */ + Assert(!scan->parallel_scan); } - if (ScanDirectionIsForward(dir)) + /* release the previously pinned leaf page buffer, if any */ + BTScanPosUnpinIfPinned(so->currPos); + + /* Walk to the next page with data */ + if (!scan->parallel_scan) { - /* Walk right to the next page with data */ - if (scan->parallel_scan != NULL) + /* Not parallel, so use nextPage/prevPage from local scan state */ + if (ScanDirectionIsForward(dir)) { - /* - * Seize the scan to get the next block number; if the scan has - * ended already, bail out. - */ - status = _bt_parallel_seize(scan, &blkno, false); - if (!status) - { - /* release the previous buffer, if pinned */ - BTScanPosUnpinIfPinned(so->currPos); - BTScanPosInvalidate(so->currPos); - return false; - } + blkno = so->currPos.nextPage; + /* Remember we left a page with data */ + so->currPos.moreLeft = true; } else { - /* Not parallel, so use the previously-saved nextPage link. */ - blkno = so->currPos.nextPage; + blkno = so->currPos.prevPage; + /* Remember we left a page with data */ + so->currPos.moreRight = true; } - - /* Remember we left a page with data */ - so->currPos.moreLeft = true; - - /* release the previous buffer, if pinned */ - BTScanPosUnpinIfPinned(so->currPos); + lastcurrblkno = so->currPos.currPage; } else { - /* Remember we left a page with data */ - so->currPos.moreRight = true; + /* + * Seize the scan to get the nextPage and currPage from shared + * parallel state + */ + status = _bt_parallel_seize(scan, &blkno, &lastcurrblkno, false); + if (!status) + { + BTScanPosInvalidate(so->currPos); + return false; + } - if (scan->parallel_scan != NULL) - { - /* - * Seize the scan to get the current block number; if the scan has - * ended already, bail out. - */ - status = _bt_parallel_seize(scan, &blkno, false); - BTScanPosUnpinIfPinned(so->currPos); - if (!status) - { - BTScanPosInvalidate(so->currPos); - return false; - } - } - else - { - /* Not parallel, so just use our own notion of the current page */ - blkno = so->currPos.currPage; - } + /* no point in remembering we left a page with data here */ } - if (!_bt_readnextpage(scan, blkno, dir)) + if (!_bt_readnextpage(scan, blkno, lastcurrblkno, dir)) return false; - /* We have at least one item to return as scan's next item */ - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + return true; +} +/* + * _bt_readfirstpage() -- Read first page containing valid data for scan + * + * Caller passes us a page offset number that's the first tuple that'll be + * read by _bt_readpage. This offset typically comes from _bt_binsrch. + * + * On entry, so->currPos.buf must be pinned and locked. Also, parallel scan + * callers must have seized the scan before calling here. + * + * On success exit, so->currPos is updated to contain data from the next + * interesting page, and we return true. We hold a pin on the buffer on + * success exit, except for scans that _bt_drop_lock_and_maybe_pin indicates + * can safely have their pin dropped to avoid blocking progress by VACUUM. + * + * If there are no more matching records in the given direction, we set + * so->currPos.buf to InvalidBuffer, and return false. Note that this could + * just signal the end of the current primitive index scan. + * + * We always release the scan for a parallel scan caller, regardless of + * success or failure; we do this by calling _bt_parallel_release at the + * earliest opportunity. + */ +static bool +_bt_readfirstpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) +{ + Relation rel = scan->indexRelation; + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + Assert(BufferIsValid(so->currPos.buf)); + Assert(!P_IGNORE(BTPageGetOpaque(BufferGetPage(so->currPos.buf)))); + + PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), + scan->xs_snapshot); + + if (_bt_readpage(scan, dir, offnum, true)) + { + /* We have at least one item to return as scan's first item */ + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + return true; + } + + /* + * There's no actually-matching data on this page. Try to advance to the + * next page. + */ + _bt_unlockbuf(scan->indexRelation, so->currPos.buf); + if (!_bt_steppage(scan, dir)) + return false; + + /* + * We have at least one item to return as scan's first item (just not from + * the first page it read). No need to call _bt_drop_lock_and_maybe_pin + * here; _bt_readnextpage already did that for us. + */ return true; } /* * _bt_readnextpage() -- Read next page containing valid data for scan * - * On success exit, so->currPos is updated to contain data from the next - * interesting page, and we return true. Caller must release the lock (and - * maybe the pin) on the buffer on success exit. + * caller's blkno is the next interesting page's link, taken from either the + * previously-saved right link or left link. lastcurrblkno is the page that + * was current at the point where the blkno link was saved, which we somtimes + * use to reason about concurrent page splits/page deletions (this is only + * required by backwards scans, but we always have it, just to be consistent). * - * If there are no more matching records in the given direction, we drop all - * locks and pins, set so->currPos.buf to InvalidBuffer, and return false. + * On entry, so->currPos.buf must not be pinned or locked. Also, parallel + * scan callers must have seized the scan before calling here. + * + * On success exit, so->currPos is updated to contain data from the next + * interesting page, and we return true. We hold a pin on the buffer on + * success exit, except for scans that _bt_drop_lock_and_maybe_pin indicates + * can safely have their pin dropped to avoid blocking progress by VACUUM. + * + * If there are no more matching records in the given direction, we set + * so->currPos.buf to InvalidBuffer, and return false. Note that this could + * just signal the end of the current primitive index scan. + * + * We always release the scan for a parallel scan caller, regardless of + * success or failure; we do this by calling _bt_parallel_release at the + * earliest opportunity. */ static bool -_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) +_bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, + BlockNumber lastcurrblkno, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; Relation rel; @@ -2186,6 +2236,8 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) rel = scan->indexRelation; + Assert(!BTScanPosIsPinned(so->currPos)); + if (ScanDirectionIsForward(dir)) { for (;;) @@ -2217,84 +2269,54 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) } else if (scan->parallel_scan != NULL) { - /* allow next page be processed by parallel worker */ - _bt_parallel_release(scan, opaque->btpo_next); + /* _bt_readpage not called, so do this for ourselves */ + _bt_parallel_release(scan, opaque->btpo_next, blkno); } - /* nope, keep going */ if (scan->parallel_scan != NULL) { + /* no matching tuples, so seize a page to the right */ _bt_relbuf(rel, so->currPos.buf); - status = _bt_parallel_seize(scan, &blkno, false); + status = _bt_parallel_seize(scan, &blkno, &lastcurrblkno, false); if (!status) { + /* don't call _bt_parallel_done */ BTScanPosInvalidate(so->currPos); return false; } } else { + /* no matching tuples, so consider following right link */ + lastcurrblkno = blkno; blkno = opaque->btpo_next; _bt_relbuf(rel, so->currPos.buf); + so->currPos.buf = InvalidBuffer; } } } else { - /* - * Should only happen in parallel cases, when some other backend - * advanced the scan. - */ - if (so->currPos.currPage != blkno) - { - BTScanPosUnpinIfPinned(so->currPos); - so->currPos.currPage = blkno; - } - - /* Done if we know that the left sibling link isn't of interest */ - if (!so->currPos.moreLeft) - { - BTScanPosUnpinIfPinned(so->currPos); - _bt_parallel_done(scan); - BTScanPosInvalidate(so->currPos); - return false; - } - - /* - * Walk left to the next page with data. This is much more complex - * than the walk-right case because of the possibility that the page - * to our left splits while we are in flight to it, plus the - * possibility that the page we were on gets deleted after we leave - * it. See nbtree/README for details. - * - * It might be possible to rearrange this code to have less overhead - * in pinning and locking, but that would require capturing the left - * sibling block number when the page is initially read, and then - * optimistically starting there (rather than pinning the page twice). - * It is not clear that this would be worth the complexity. - */ - if (BTScanPosIsPinned(so->currPos)) - _bt_lockbuf(rel, so->currPos.buf, BT_READ); - else - so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ); - for (;;) { - /* Done if we know that the left sibling link isn't of interest */ - if (!so->currPos.moreLeft) + /* + * if we're at end of scan, give up and mark parallel scan as + * done, so that all the workers can finish their scan + */ + if (blkno == P_NONE || !so->currPos.moreLeft) { - _bt_relbuf(rel, so->currPos.buf); _bt_parallel_done(scan); BTScanPosInvalidate(so->currPos); return false; } - /* Step to next physical page */ - so->currPos.buf = _bt_walk_left(rel, so->currPos.buf); + /* move left at least one page (checks for interrupts, too) */ + so->currPos.buf = _bt_lock_and_validate_left(rel, &blkno, + lastcurrblkno); - /* if we're physically at end of index, return failure */ if (so->currPos.buf == InvalidBuffer) { + /* must have been a concurrent deletion of leftmost page */ _bt_parallel_done(scan); BTScanPosInvalidate(so->currPos); return false; @@ -2309,7 +2331,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) opaque = BTPageGetOpaque(page); if (!P_IGNORE(opaque)) { - PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot); + PredicateLockPage(rel, blkno, scan->xs_snapshot); /* see if there are any matches on this page */ /* note that this will clear moreLeft if we can stop */ if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page), false)) @@ -2317,51 +2339,34 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) } else if (scan->parallel_scan != NULL) { - /* allow next page be processed by parallel worker */ - _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf)); + /* _bt_readpage not called, so do this for ourselves */ + Assert(P_ISHALFDEAD(opaque)); + _bt_parallel_release(scan, opaque->btpo_prev, blkno); } - /* - * For parallel scans, get the last page scanned as it is quite - * possible that by the time we try to seize the scan, some other - * worker has already advanced the scan to a different page. We - * must continue based on the latest page scanned by any worker. - */ if (scan->parallel_scan != NULL) { + /* no matching tuples, so seize a page to the left */ _bt_relbuf(rel, so->currPos.buf); - status = _bt_parallel_seize(scan, &blkno, false); + status = _bt_parallel_seize(scan, &blkno, &lastcurrblkno, false); if (!status) { + /* don't call _bt_parallel_done */ BTScanPosInvalidate(so->currPos); return false; } - so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); + } + else + { + /* no matching tuples, so consider following left link */ + lastcurrblkno = blkno; + blkno = opaque->btpo_prev; + _bt_relbuf(rel, so->currPos.buf); + so->currPos.buf = InvalidBuffer; } } } - return true; -} - -/* - * _bt_parallel_readpage() -- Read current page containing valid data for scan - * - * On success, release lock and maybe pin on buffer. We return true to - * indicate success. - */ -static bool -_bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) -{ - BTScanOpaque so = (BTScanOpaque) scan->opaque; - - Assert(!so->needPrimScan); - - _bt_initialize_more_data(so, dir); - - if (!_bt_readnextpage(scan, blkno, dir)) - return false; - /* We have at least one item to return as scan's next item */ _bt_drop_lock_and_maybe_pin(scan, &so->currPos); @@ -2369,48 +2374,41 @@ _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) } /* - * _bt_walk_left() -- step left one page, if possible + * _bt_lock_and_validate_left() -- lock caller's left sibling blkno, + * recovering from concurrent page splits/page deletions when necessary * - * The given buffer must be pinned and read-locked. This will be dropped - * before stepping left. On return, we have pin and read lock on the - * returned page, instead. + * Called during backwards scans, to deal with their unique concurrency rules. * - * Returns InvalidBuffer if there is no page to the left (no lock is held - * in that case). + * blkno points to the block number of the page that we expect to move the + * scan to. We'll successfully move the scan there when we find that its + * right sibling link points to lastcurrblkno, the page that we just finished + * reading. Otherwise, we have to figure out which page is the correct one to + * visit next the hard way, which involves reasoning about both concurrent + * page splits and concurrent page deletions. See nbtree/README for details. + * + * On return, we have both a pin and a read lock on the returned page, whose + * block number will be set in *blkno. Returns InvalidBuffer if there is no + * page to the left (no lock is held in that case). * * It is possible for the returned leaf page to be half-dead; caller must * check that condition and step left again when required. */ static Buffer -_bt_walk_left(Relation rel, Buffer buf) +_bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, + BlockNumber lastcurrblkno) { - Page page; - BTPageOpaque opaque; - - page = BufferGetPage(buf); - opaque = BTPageGetOpaque(page); + BlockNumber origblkno = *blkno; /* detects circular links */ for (;;) { - BlockNumber obknum; - BlockNumber lblkno; - BlockNumber blkno; + Buffer buf; + Page page; + BTPageOpaque opaque; int tries; - /* if we're at end of tree, release buf and return failure */ - if (P_LEFTMOST(opaque)) - { - _bt_relbuf(rel, buf); - break; - } - /* remember original page we are stepping left from */ - obknum = BufferGetBlockNumber(buf); - /* step left */ - blkno = lblkno = opaque->btpo_prev; - _bt_relbuf(rel, buf); /* check for interrupts while we're not holding any buffer lock */ CHECK_FOR_INTERRUPTS(); - buf = _bt_getbuf(rel, blkno, BT_READ); + buf = _bt_getbuf(rel, *blkno, BT_READ); page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); @@ -2427,21 +2425,26 @@ _bt_walk_left(Relation rel, Buffer buf) tries = 0; for (;;) { - if (!P_ISDELETED(opaque) && opaque->btpo_next == obknum) + if (!P_ISDELETED(opaque) && opaque->btpo_next == lastcurrblkno) { /* Found desired page, return it */ return buf; } if (P_RIGHTMOST(opaque) || ++tries > 4) break; - blkno = opaque->btpo_next; - buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); + /* step right */ + *blkno = opaque->btpo_next; + buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ); page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); } - /* Return to the original page to see what's up */ - buf = _bt_relandgetbuf(rel, buf, obknum, BT_READ); + /* + * Return to the original page (usually the page most recently read by + * _bt_readpage, which is passed by caller as lastcurrblkno) to see + * what's up with sibling links + */ + buf = _bt_relandgetbuf(rel, buf, lastcurrblkno, BT_READ); page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); if (P_ISDELETED(opaque)) @@ -2457,8 +2460,8 @@ _bt_walk_left(Relation rel, Buffer buf) if (P_RIGHTMOST(opaque)) elog(ERROR, "fell off the end of index \"%s\"", RelationGetRelationName(rel)); - blkno = opaque->btpo_next; - buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ); + *blkno = opaque->btpo_next; + buf = _bt_relandgetbuf(rel, buf, *blkno, BT_READ); page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); if (!P_ISDELETED(opaque)) @@ -2466,9 +2469,10 @@ _bt_walk_left(Relation rel, Buffer buf) } /* - * Now return to top of loop, resetting obknum to point to this - * nondeleted page, and try again. + * remember that this page is actually now the leftmost one whose + * key space we've already read all tuples from */ + lastcurrblkno = BufferGetBlockNumber(buf); } else { @@ -2477,11 +2481,22 @@ _bt_walk_left(Relation rel, Buffer buf) * to the left got split or deleted. Without this check, we'd go * into an infinite loop if there's anything wrong. */ - if (opaque->btpo_prev == lblkno) + if (opaque->btpo_prev == origblkno) elog(ERROR, "could not find left sibling of block %u in index \"%s\"", - obknum, RelationGetRelationName(rel)); - /* Okay to try again with new lblkno value */ + lastcurrblkno, RelationGetRelationName(rel)); + /* Okay to try again, since left sibling link changed */ } + + if (P_LEFTMOST(opaque)) + { + /* previous leftmost page must have been concurrently deleted */ + _bt_relbuf(rel, buf); + break; + } + + /* next loop iteration will start over with a clean slate */ + *blkno = origblkno = opaque->btpo_prev; + _bt_relbuf(rel, buf); } return InvalidBuffer; @@ -2605,7 +2620,6 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) return false; } - PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot); page = BufferGetPage(buf); opaque = BTPageGetOpaque(page); Assert(P_ISLEAF(opaque)); @@ -2637,21 +2651,8 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) /* * Now load data from the first page of the scan. */ - if (!_bt_readpage(scan, dir, start, true)) - { - /* - * There's no actually-matching data on this page. Try to advance to - * the next page. Return false if there's no matching data at all. - */ - _bt_unlockbuf(scan->indexRelation, so->currPos.buf); - if (!_bt_steppage(scan, dir)) - return false; - } - else - { - /* We have at least one item to return as scan's first item */ - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); - } + if (!_bt_readfirstpage(scan, dir, start)) + return false; /* OK, itemIndex says what to return */ currItem = &so->currPos.items[so->currPos.itemIndex]; diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index b4ba51357..03d5c671f 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -2442,7 +2442,7 @@ new_prim_scan: so->needPrimScan = true; /* ...but call _bt_first again */ if (scan->parallel_scan) - _bt_parallel_primscan_schedule(scan, pstate->prev_scan_page); + _bt_parallel_primscan_schedule(scan, so->currPos.currPage); /* Caller's tuple doesn't match the new qual */ return false; -- 2.45.2