From 54127e3be41ac51ff3640960c08b4e65aa4a4892 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Fri, 11 Oct 2024 15:57:27 +0200 Subject: [PATCH v1] Avoid full btree index scans when skipping is possible Previously, we could ignore the skip signal until the end of the range of values producible by the index scan key. Now, we can fail to start a new primscan only for up to the number of parallel workers + 1 buffers, at the cost of doing a bit more before releasing the scan while we process the 'we may need a new primitive scan' signal. If we detect that a parallel worker in the same primscan range thinks this is the right moment to start a new primitive scan, we don't release the parallel scan immediately, but instead only release it after reading the page's contents to find out if we really should start a new primitive scan. If so, we start that new primitive scan, and if instead we find we've already skidded into the range of pages we would've arrived on with the skip scan, we instead mark that the primitive scan has reached a new primscan range, do some cleanup, and then continue the scan as usual. 
--- src/include/access/nbtree.h | 10 ++- src/backend/access/nbtree/nbtree.c | 112 +++++++++++++++++++++++--- src/backend/access/nbtree/nbtsearch.c | 15 +++- 3 files changed, 121 insertions(+), 16 deletions(-) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index d64300fb97..efe6728dd3 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1052,6 +1052,11 @@ typedef struct BTScanOpaqueData FmgrInfo *orderProcs; /* ORDER procs for required equality keys */ MemoryContext arrayContext; /* scan-lifespan context for array data */ + /* local state for coordinating skips in parallel scans */ + bool testPrimScan; /* Are we trying to do a new primitive scan */ + uint32 arrElemsGen; /* Generation number of prim scan we want to + * improve on */ + /* info about killed items if any (killedItems is NULL if never used) */ int *killedItems; /* currPos.items indexes of killed items */ int numKilled; /* number of currently stored items */ @@ -1193,7 +1198,10 @@ extern int btgettreeheight(Relation rel); */ extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first); -extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page); +extern void _bt_parallel_opt_release_early(IndexScanDesc scan, + BlockNumber scan_page); +extern void _bt_parallel_opt_release_late(IndexScanDesc scan, + BlockNumber scan_page); extern void _bt_parallel_done(IndexScanDesc scan); extern void _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 56e502c4fc..04c8d6e786 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -70,12 +70,15 @@ typedef struct BTParallelScanDescData BTPS_State btps_pageStatus; /* indicates whether next page is * available for scan. see above for * possible states of parallel scan. 
*/ + uint32 btps_arrElemsGen; /* number of new prim scan opportunities */ + bool btps_checkPrimScan; /* did we skid past the most opportune + * endpoint of a primitive scan? */ slock_t btps_mutex; /* protects above variables, btps_arrElems */ ConditionVariable btps_cv; /* used to synchronize parallel scan */ /* * btps_arrElems is used when scans need to schedule another primitive - * index scan. Holds BTArrayKeyInfo.cur_elem offsets for scan keys. + * index scan. Holds the values for BTScanOpaque->arrayKeys[.].cur_elem. */ int btps_arrElems[FLEXIBLE_ARRAY_MEMBER]; } BTParallelScanDescData; @@ -335,6 +338,9 @@ btbeginscan(Relation rel, int nkeys, int norderbys) so->arrayKeys = NULL; so->orderProcs = NULL; so->arrayContext = NULL; + + so->testPrimScan = false; + so->arrElemsGen = 0; so->killedItems = NULL; /* until needed */ so->numKilled = 0; @@ -550,6 +556,8 @@ btinitparallelscan(void *target) SpinLockInit(&bt_target->btps_mutex); bt_target->btps_scanPage = InvalidBlockNumber; bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; + bt_target->btps_arrElemsGen = 0; + bt_target->btps_checkPrimScan = false; ConditionVariableInit(&bt_target->btps_cv); } @@ -575,13 +583,15 @@ btparallelrescan(IndexScanDesc scan) SpinLockAcquire(&btscan->btps_mutex); btscan->btps_scanPage = InvalidBlockNumber; btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; + btscan->btps_arrElemsGen = 0; + btscan->btps_checkPrimScan = false; SpinLockRelease(&btscan->btps_mutex); } /* * _bt_parallel_seize() -- Begin the process of advancing the scan to a new - * page. Other scans must wait until we call _bt_parallel_release() - * or _bt_parallel_done(). + * page. Other scans must wait until we call _bt_parallel_done(), + * [_btp]_opt_release_early/late(), or [_btp]_primscan_schedule(). * * The return value is true if we successfully seized the scan and false * if we did not. 
The latter case occurs when no pages remain, or when @@ -640,6 +650,8 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first) { /* We're done with this parallel index scan */ status = false; + so->testPrimScan = false; + so->arrElemsGen = 0; } else if (btscan->btps_pageStatus == BTPARALLEL_NEED_PRIMSCAN) { @@ -659,6 +671,8 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first) } *pageno = InvalidBlockNumber; exit_loop = true; + so->arrElemsGen = btscan->btps_arrElemsGen; + so->testPrimScan = false; } else { @@ -685,6 +699,9 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first) */ btscan->btps_pageStatus = BTPARALLEL_ADVANCING; *pageno = btscan->btps_scanPage; + + so->arrElemsGen = btscan->btps_arrElemsGen; + exit_loop = true; } SpinLockRelease(&btscan->btps_mutex); @@ -698,7 +715,7 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first) } /* - * _bt_parallel_release() -- Complete the process of advancing the scan to a + * _bt_parallel_opt_release_early() -- Complete the process of advancing the scan to a * new page. We now have the new value btps_scanPage; some other backend * can now begin advancing the scan. * @@ -709,19 +726,76 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first) * scan lands on scan_page). */ void -_bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page) +_bt_parallel_opt_release_early(IndexScanDesc scan, BlockNumber scan_page) +{ + ParallelIndexScanDesc parallel_scan = scan->parallel_scan; + BTParallelScanDesc btscan; + + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, + parallel_scan->ps_offset); + + SpinLockAcquire(&btscan->btps_mutex); + /* + * If a parallel worker noticed that it had skipped past the end of a + * primitive scan after another backend already acquired the parallel scan + * status, we don't release the scan before reading the page's contents. 
+ * Instead, we only note in so->testPrimScan that this backend still has to decide, once the page has been read, whether a new primitive scan is needed. + */ + if (likely(!btscan->btps_checkPrimScan)) + { + btscan->btps_scanPage = scan_page; + btscan->btps_pageStatus = BTPARALLEL_IDLE; + SpinLockRelease(&btscan->btps_mutex); + ConditionVariableSignal(&btscan->btps_cv); + } + else + { + BTScanOpaque so = (BTScanOpaque) scan->opaque; + so->testPrimScan = true; + SpinLockRelease(&btscan->btps_mutex); + } +} + +/* + * _bt_parallel_opt_release_late() -- Complete the process of advancing the + * scan to a new page. + * + * We're only called when a concurrent backend wanted to schedule a skip scan, + * but failed to do so because the parallel scan already advanced past its + * own page. + */ +void +_bt_parallel_opt_release_late(IndexScanDesc scan, BlockNumber scan_page) { ParallelIndexScanDesc parallel_scan = scan->parallel_scan; + BTScanOpaque so = (BTScanOpaque) scan->opaque; BTParallelScanDesc btscan; + + if (!so->testPrimScan) + return; btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, parallel_scan->ps_offset); SpinLockAcquire(&btscan->btps_mutex); + Assert(btscan->btps_checkPrimScan); btscan->btps_scanPage = scan_page; btscan->btps_pageStatus = BTPARALLEL_IDLE; + + /* + * A late release implies that 1) a concurrent backend noticed we + * should've started a new primitive scan, and that 2) the current scan + * position is already at-or-past the point where that scan would've + * started. So, we do what a new primitive scan would've done with the + * shared state: we increase the generation number, and unset + * checkPrimScan. 
+ */ + btscan->btps_checkPrimScan = false; + btscan->btps_arrElemsGen += 1; SpinLockRelease(&btscan->btps_mutex); ConditionVariableSignal(&btscan->btps_cv); + + so->testPrimScan = false; } /* @@ -738,6 +812,7 @@ _bt_parallel_done(IndexScanDesc scan) ParallelIndexScanDesc parallel_scan = scan->parallel_scan; BTParallelScanDesc btscan; bool status_changed = false; + so->testPrimScan = false; /* Do nothing, for non-parallel scans */ if (parallel_scan == NULL) @@ -774,10 +849,10 @@ _bt_parallel_done(IndexScanDesc scan) /* * _bt_parallel_primscan_schedule() -- Schedule another primitive index scan. * - * Caller passes the block number most recently passed to _bt_parallel_release - * by its backend. Caller successfully schedules the next primitive index scan - * if the shared parallel state hasn't been seized since caller's backend last - * advanced the scan. + * Caller passes the block number most recently passed to + * _bt_parallel_opt_release_early by its backend. Caller successfully + * schedules the next primitive index scan if the shared parallel state hasn't + * been seized since caller's backend last advanced the scan. 
*/ void _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page) @@ -792,11 +867,13 @@ _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page) parallel_scan->ps_offset); SpinLockAcquire(&btscan->btps_mutex); - if (btscan->btps_scanPage == prev_scan_page && - btscan->btps_pageStatus == BTPARALLEL_IDLE) + if ((btscan->btps_scanPage == prev_scan_page && + btscan->btps_pageStatus == BTPARALLEL_IDLE) || + unlikely(so->testPrimScan)) { btscan->btps_scanPage = InvalidBlockNumber; btscan->btps_pageStatus = BTPARALLEL_NEED_PRIMSCAN; + btscan->btps_arrElemsGen += 1; /* Serialize scan's current array keys */ for (int i = 0; i < so->numArrayKeys; i++) @@ -806,7 +883,20 @@ _bt_parallel_primscan_schedule(IndexScanDesc scan, BlockNumber prev_scan_page) btscan->btps_arrElems[i] = array->cur_elem; } } + /* + * If the shared array keys are still those of the primitive scan we used + * to access prev_scan_page, make a note that the next page may be a good + * opportunity to start a new primitive scan. Once marked, a worker will + * not release the scan until it has processed its page and knows for + * sure whether a new prim scan is needed. + */ + else if (btscan->btps_arrElemsGen == so->arrElemsGen) + { + btscan->btps_checkPrimScan = true; + } SpinLockRelease(&btscan->btps_mutex); + + so->testPrimScan = false; } /* diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index fff7c89ead..8a9a0e6626 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -1555,7 +1555,7 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) * * In the case of a parallel scan, caller must have called _bt_parallel_seize * prior to calling this function; this function will invoke - * _bt_parallel_release before returning. + * _bt_parallel_opt_release_early before returning. * * Returns true if any matching items found on the page, false if none. 
*/ @@ -1590,7 +1590,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, else pstate.prev_scan_page = BufferGetBlockNumber(so->currPos.buf); - _bt_parallel_release(scan, pstate.prev_scan_page); + _bt_parallel_opt_release_early(scan, pstate.prev_scan_page); } indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation); @@ -1943,6 +1943,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, so->currPos.itemIndex = MaxTIDsPerBTreePage - 1; } + /* + * If !continuescan, releasing will be or has been done by either + * [_btp]_done or [_btp]_primscan_schedule. + */ + if (scan->parallel_scan && pstate.continuescan) + _bt_parallel_opt_release_late(scan, pstate.prev_scan_page); + return (so->currPos.firstItem <= so->currPos.lastItem); } @@ -2218,7 +2225,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) else if (scan->parallel_scan != NULL) { /* allow next page be processed by parallel worker */ - _bt_parallel_release(scan, opaque->btpo_next); + _bt_parallel_opt_release_early(scan, opaque->btpo_next); } /* nope, keep going */ @@ -2318,7 +2325,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) else if (scan->parallel_scan != NULL) { /* allow next page be processed by parallel worker */ - _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf)); + _bt_parallel_opt_release_early(scan, BufferGetBlockNumber(so->currPos.buf)); } /* -- 2.45.2