From 1ae97d26aa5a1fb3e7dafc4160960bc144e4be9e Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sat, 17 Jun 2023 17:03:36 -0700 Subject: [PATCH v6] Enhance nbtree ScalarArrayOp execution. Commit 9e8da0f7 taught nbtree to handle ScalarArrayOpExpr quals natively. This works by pushing additional context about the arrays down into the nbtree index AM, as index quals. This information enabled nbtree to execute multiple primitive index scans as part of an index scan executor node that was treated as one continuous index scan. The motivation behind this earlier work was enabling index-only scans with ScalarArrayOpExpr clauses (SAOP quals are traditionally executed via BitmapOr nodes, which is largely index-AM-agnostic, but always requires heap access). The general idea of giving the index AM this additional context can be pushed a lot further, though. Teach nbtree SAOP index scans to advance array scan keys by applying information about the physical characteristics of the index at runtime. The array key state machine advances the current array keys using the next index tuple in line to be scanned, at the point where the scan reaches the end of index tuples matching its current array keys. We dynamically decide whether to perform another primitive index scan (or whether to stick with the ongoing leaf level traversal) using a set of heuristics that aim to minimize repeat index descents. This approach can be far more efficient: many cases that previously required thousands of primitive index scans now require as few as one single primitive index scan. All duplicative index page accesses are now avoided. nbtree can now execute required and non-required array/SAOP scan keys in the most efficient way possible. Naturally, only required SAOP keys (i.e. those that can terminate the top-level scan) are capable of triggering a new primitive index scan; non-required keys never affect the scan's position. 
Consequently, index scans on a composite index with (say) a high-order inequality key and a low-order SAOP key (which nbtree will make into a non-required scan key) will now reliably output rows in index order. The scan is always executed as one large index scan under the hood, which is obviously the fastest way to do it, for the usual reasons: it avoids useless repeat index page accesses across successive primitive index scans. More importantly, nbtree's very general approach removes any question of index scan nodes outputting rows in an order that doesn't match the index. This enables the removal of various special cases from the planner -- which in turn makes the nbtree enhancements more effective and more widely applicable. Bugfix commit 807a40c5 taught the planner to avoid generating unsafe path keys: path keys on a multicolumn index path, with a SAOP clause on any attribute beyond the first/most significant attribute. These cases are now all safe, so we go back to generating path keys without regard for the presence of SAOP clauses (just like with any other clause type). Also undo changes from follow-up bugfix commit a4523c5a, which taught the planner to produce alternative index paths without low-order ScalarArrayOpExpr quals (paths where the quals appear as filter quals instead). Now there is never any need to make a cost-based choice between an index scan that can be trusted to return tuples in index order (but has SAOP filter quals), and a more selective index scan that can apply true SAOP index quals for one or more low-order index columns (but cannot be trusted to produce tuples in index order). Many of the queries sped up by the enhancements added by this commit won't benefit much from avoiding repeat index page accesses. The most compelling cases are those where query execution _completely_ avoids many heap page accesses that filter quals would have otherwise required, just to eliminate one or more non-matching rows from each heap page. 
(In general, index scan filter quals always need "extra" heap accesses to eliminate non-matching rows, since expression evaluation is only deemed safe with visible rows. Whereas index quals never require inline visibility checks; they can just eliminate non-matching rows up front.) Author: Peter Geoghegan Discussion: https://postgr.es/m/CAH2-Wz=ksvN_sjcnD1+Bt-WtifRA5ok48aDYnq3pkKhxgMQpcw@mail.gmail.com --- src/include/access/nbtree.h | 42 +- src/backend/access/nbtree/nbtree.c | 63 +- src/backend/access/nbtree/nbtsearch.c | 92 +- src/backend/access/nbtree/nbtutils.c | 1472 +++++++++++++++++++- src/backend/optimizer/path/indxpath.c | 86 +- src/backend/utils/adt/selfuncs.c | 122 +- doc/src/sgml/monitoring.sgml | 13 + src/test/regress/expected/create_index.out | 61 +- src/test/regress/expected/join.out | 5 +- src/test/regress/sql/create_index.sql | 20 +- 10 files changed, 1700 insertions(+), 276 deletions(-) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 7bfbf3086..566e1c15d 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -965,7 +965,7 @@ typedef struct BTScanPosData * moreLeft and moreRight track whether we think there may be matching * index entries to the left and right of the current page, respectively. * We can clear the appropriate one of these flags when _bt_checkkeys() - * returns continuescan = false. + * sets BTReadPageState.continuescan = false. */ bool moreLeft; bool moreRight; @@ -1043,13 +1043,13 @@ typedef struct BTScanOpaqueData /* workspace for SK_SEARCHARRAY support */ ScanKey arrayKeyData; /* modified copy of scan->keyData */ - bool arraysStarted; /* Started array keys, but have yet to "reach - * past the end" of all arrays? */ int numArrayKeys; /* number of equality-type array keys (-1 if * there are any unsatisfiable array keys) */ - int arrayKeyCount; /* count indicating number of array scan keys - * processed */ + bool needPrimScan; /* Perform another primitive scan? 
*/ BTArrayKeyInfo *arrayKeys; /* info about each equality-type array key */ + FmgrInfo *orderProcs; /* ORDER procs for equality constraint keys */ + int numPrimScans; /* Running tally of # primitive index scans + * (used to coordinate parallel workers) */ MemoryContext arrayContext; /* scan-lifespan context for array data */ /* info about killed items if any (killedItems is NULL if never used) */ @@ -1083,6 +1083,29 @@ typedef struct BTScanOpaqueData typedef BTScanOpaqueData *BTScanOpaque; +/* + * _bt_readpage state used across _bt_checkkeys calls for a page + * + * When _bt_readpage is called during a forward scan that has one or more + * equality-type SK_SEARCHARRAY scan keys, it has an extra responsibility: to + * set up information about the final tuple from the page. This must happen + * before the first call to _bt_checkkeys. _bt_checkkeys uses the final tuple + * to manage advancement of the scan's array keys more efficiently. + */ +typedef struct BTReadPageState +{ + /* Input parameters, set by _bt_readpage */ + ScanDirection dir; /* current scan direction */ + IndexTuple finaltup; /* final tuple (high key for forward scans) */ + + /* Output parameters, set by _bt_checkkeys */ + bool continuescan; /* Terminate ongoing (primitive) index scan? */ + + /* Private _bt_checkkeys-managed state */ + bool finaltupchecked; /* final tuple checked against current + * SK_SEARCHARRAY array keys? */ +} BTReadPageState; + /* * We use some private sk_flags bits in preprocessed scan keys. We're allowed * to use bits 16-31 (see skey.h). 
The uppermost bits are copied from the @@ -1090,6 +1113,7 @@ typedef BTScanOpaqueData *BTScanOpaque; */ #define SK_BT_REQFWD 0x00010000 /* required to continue forward scan */ #define SK_BT_REQBKWD 0x00020000 /* required to continue backward scan */ +#define SK_BT_RDDNARRAY 0x00040000 /* redundant in array preprocessing */ #define SK_BT_INDOPTION_SHIFT 24 /* must clear the above bits */ #define SK_BT_DESC (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT) #define SK_BT_NULLS_FIRST (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT) @@ -1160,7 +1184,7 @@ extern bool btcanreturn(Relation index, int attno); extern bool _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno); extern void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page); extern void _bt_parallel_done(IndexScanDesc scan); -extern void _bt_parallel_advance_array_keys(IndexScanDesc scan); +extern void _bt_parallel_next_primitive_scan(IndexScanDesc scan); /* * prototypes for functions in nbtdedup.c @@ -1253,12 +1277,12 @@ extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup); extern void _bt_freestack(BTStack stack); extern void _bt_preprocess_array_keys(IndexScanDesc scan); extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir); -extern bool _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir); +extern bool _bt_array_keys_remain(IndexScanDesc scan, ScanDirection dir); extern void _bt_mark_array_keys(IndexScanDesc scan); extern void _bt_restore_array_keys(IndexScanDesc scan); extern void _bt_preprocess_keys(IndexScanDesc scan); -extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, - int tupnatts, ScanDirection dir, bool *continuescan, +extern bool _bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, + IndexTuple tuple, bool finaltup, bool requiredMatchedByPrecheck); extern void _bt_killitems(IndexScanDesc scan); extern BTCycleId _bt_vacuum_cycleid(Relation rel); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c 
index a88b36a58..6328a8a63 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -48,8 +48,8 @@ * BTPARALLEL_IDLE indicates that no backend is currently advancing the scan * to a new page; some process can start doing that. * - * BTPARALLEL_DONE indicates that the scan is complete (including error exit). - * We reach this state once for every distinct combination of array keys. + * BTPARALLEL_DONE indicates that the primitive index scan is complete + * (including error exit). Reached once per primitive index scan. */ typedef enum { @@ -69,8 +69,8 @@ typedef struct BTParallelScanDescData BTPS_State btps_pageStatus; /* indicates whether next page is * available for scan. see above for * possible states of parallel scan. */ - int btps_arrayKeyCount; /* count indicating number of array scan - * keys processed by parallel scan */ + int btps_numPrimScans; /* count indicating number of primitive + * index scans (used with array keys) */ slock_t btps_mutex; /* protects above variables */ ConditionVariable btps_cv; /* used to synchronize parallel scan */ } BTParallelScanDescData; @@ -275,8 +275,8 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) /* If we have a tuple, return it ... */ if (res) break; - /* ... otherwise see if we have more array keys to deal with */ - } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir)); + /* ... 
otherwise see if we need another primitive index scan */ + } while (so->numArrayKeys && _bt_array_keys_remain(scan, dir)); return res; } @@ -333,8 +333,8 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) ntids++; } } - /* Now see if we have more array keys to deal with */ - } while (so->numArrayKeys && _bt_advance_array_keys(scan, ForwardScanDirection)); + /* Now see if we need another primitive index scan */ + } while (so->numArrayKeys && _bt_array_keys_remain(scan, ForwardScanDirection)); return ntids; } @@ -364,9 +364,10 @@ btbeginscan(Relation rel, int nkeys, int norderbys) so->keyData = NULL; so->arrayKeyData = NULL; /* assume no array keys for now */ - so->arraysStarted = false; so->numArrayKeys = 0; + so->needPrimScan = false; so->arrayKeys = NULL; + so->orderProcs = NULL; so->arrayContext = NULL; so->killedItems = NULL; /* until needed */ @@ -406,7 +407,8 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, } so->markItemIndex = -1; - so->arrayKeyCount = 0; + so->needPrimScan = false; + so->numPrimScans = 0; so->firstPage = false; BTScanPosUnpinIfPinned(so->markPos); BTScanPosInvalidate(so->markPos); @@ -588,7 +590,7 @@ btinitparallelscan(void *target) SpinLockInit(&bt_target->btps_mutex); bt_target->btps_scanPage = InvalidBlockNumber; bt_target->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; - bt_target->btps_arrayKeyCount = 0; + bt_target->btps_numPrimScans = 0; ConditionVariableInit(&bt_target->btps_cv); } @@ -614,7 +616,7 @@ btparallelrescan(IndexScanDesc scan) SpinLockAcquire(&btscan->btps_mutex); btscan->btps_scanPage = InvalidBlockNumber; btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; - btscan->btps_arrayKeyCount = 0; + btscan->btps_numPrimScans = 0; SpinLockRelease(&btscan->btps_mutex); } @@ -625,7 +627,11 @@ btparallelrescan(IndexScanDesc scan) * * The return value is true if we successfully seized the scan and false * if we did not. The latter case occurs if no pages remain for the current - * set of scankeys. 
+ * primitive index scan. + * + * When array scan keys are in use, each worker process independently advances + * its array keys. It's crucial that each worker process never be allowed to + * scan a page from before the current scan position. * * If the return value is true, *pageno returns the next or current page * of the scan (depending on the scan direction). An invalid block number @@ -656,16 +662,17 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno) SpinLockAcquire(&btscan->btps_mutex); pageStatus = btscan->btps_pageStatus; - if (so->arrayKeyCount < btscan->btps_arrayKeyCount) + if (so->numPrimScans < btscan->btps_numPrimScans) { - /* Parallel scan has already advanced to a new set of scankeys. */ + /* Top-level scan already moved on to next primitive index scan */ status = false; } else if (pageStatus == BTPARALLEL_DONE) { /* - * We're done with this set of scankeys. This may be the end, or - * there could be more sets to try. + * We're done with this primitive index scan. This might have + * been the final primitive index scan required, or the top-level + * index scan might require additional primitive scans. */ status = false; } @@ -697,9 +704,12 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno) void _bt_parallel_release(IndexScanDesc scan, BlockNumber scan_page) { + BTScanOpaque so PG_USED_FOR_ASSERTS_ONLY = (BTScanOpaque) scan->opaque; ParallelIndexScanDesc parallel_scan = scan->parallel_scan; BTParallelScanDesc btscan; + Assert(!so->needPrimScan); + btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, parallel_scan->ps_offset); @@ -733,12 +743,11 @@ _bt_parallel_done(IndexScanDesc scan) parallel_scan->ps_offset); /* - * Mark the parallel scan as done for this combination of scan keys, - * unless some other process already did so. See also - * _bt_advance_array_keys. + * Mark the primitive index scan as done, unless some other process + * already did so. See also _bt_array_keys_remain. 
*/ SpinLockAcquire(&btscan->btps_mutex); - if (so->arrayKeyCount >= btscan->btps_arrayKeyCount && + if (so->numPrimScans >= btscan->btps_numPrimScans && btscan->btps_pageStatus != BTPARALLEL_DONE) { btscan->btps_pageStatus = BTPARALLEL_DONE; @@ -752,14 +761,14 @@ _bt_parallel_done(IndexScanDesc scan) } /* - * _bt_parallel_advance_array_keys() -- Advances the parallel scan for array - * keys. + * _bt_parallel_next_primitive_scan() -- Advances parallel primitive scan + * counter when array keys are in use. * - * Updates the count of array keys processed for both local and parallel + * Updates the count of primitive index scans for both local and parallel * scans. */ void -_bt_parallel_advance_array_keys(IndexScanDesc scan) +_bt_parallel_next_primitive_scan(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; ParallelIndexScanDesc parallel_scan = scan->parallel_scan; @@ -768,13 +777,13 @@ _bt_parallel_advance_array_keys(IndexScanDesc scan) btscan = (BTParallelScanDesc) OffsetToPointer((void *) parallel_scan, parallel_scan->ps_offset); - so->arrayKeyCount++; + so->numPrimScans++; SpinLockAcquire(&btscan->btps_mutex); if (btscan->btps_pageStatus == BTPARALLEL_DONE) { btscan->btps_scanPage = InvalidBlockNumber; btscan->btps_pageStatus = BTPARALLEL_NOT_INITIALIZED; - btscan->btps_arrayKeyCount++; + btscan->btps_numPrimScans++; } SpinLockRelease(&btscan->btps_mutex); } diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index efc5284e5..b2addd714 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -893,7 +893,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) */ if (!so->qual_ok) { - /* Notify any other workers that we're done with this scan key. 
+ /* Notify any other workers that this primitive scan is done */ _bt_parallel_done(scan); return false; } @@ -952,6 +952,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) * one we use --- by definition, they are either redundant or * contradictory. * + * When SK_SEARCHARRAY keys are in use, _bt_tuple_before_array_skeys is + * used to avoid prematurely stopping the scan when an array equality qual + * has its array keys advanced. + * * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier. * If the index stores nulls at the end of the index we'll be starting * from, and we have no boundary key for the column (which means the key @@ -1537,9 +1541,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) BTPageOpaque opaque; OffsetNumber minoff; OffsetNumber maxoff; + BTReadPageState pstate; int itemIndex; - bool continuescan; - int indnatts; bool requiredMatchedByPrecheck; /* @@ -1560,8 +1563,11 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf)); } - continuescan = true; /* default assumption */ - indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation); + pstate.dir = dir; + pstate.finaltup = NULL; + pstate.continuescan = true; /* default assumption */ + pstate.finaltupchecked = false; + minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); @@ -1609,9 +1615,11 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) * the last item on the page would give a more precise answer. * * We skip this for the first page in the scan to evade the possible - * slowdown of the point queries. + * slowdown of the point queries. Do the same with scans with array keys, + * since that makes the optimization unsafe (our search-type scan keys can + * change during any call to _bt_checkkeys whenever array keys are used). 
*/ - if (!so->firstPage && minoff < maxoff) + if (!so->firstPage && minoff < maxoff && !so->numArrayKeys) { ItemId iid; IndexTuple itup; @@ -1625,8 +1633,9 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) * set flag to true if all required keys are satisfied and false * otherwise. */ - (void) _bt_checkkeys(scan, itup, indnatts, dir, - &requiredMatchedByPrecheck, false); + _bt_checkkeys(scan, &pstate, itup, false, false); + requiredMatchedByPrecheck = pstate.continuescan; + pstate.continuescan = true; /* reset */ } else { @@ -1636,6 +1645,14 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (ScanDirectionIsForward(dir)) { + /* SK_SEARCHARRAY forward scans must provide high key up front */ + if (so->numArrayKeys && !P_RIGHTMOST(opaque)) + { + ItemId iid = PageGetItemId(page, P_HIKEY); + + pstate.finaltup = (IndexTuple) PageGetItem(page, iid); + } + /* load items[] in ascending order */ itemIndex = 0; @@ -1659,8 +1676,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) itup = (IndexTuple) PageGetItem(page, iid); - passes_quals = _bt_checkkeys(scan, itup, indnatts, dir, - &continuescan, requiredMatchedByPrecheck); + passes_quals = _bt_checkkeys(scan, &pstate, itup, false, + requiredMatchedByPrecheck); /* * If the result of prechecking required keys was true, then in @@ -1668,8 +1685,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) * result is the same. 
*/ Assert(!requiredMatchedByPrecheck || - passes_quals == _bt_checkkeys(scan, itup, indnatts, dir, - &continuescan, false)); + passes_quals == _bt_checkkeys(scan, &pstate, itup, false, + false)); if (passes_quals) { /* tuple passes all scan key conditions */ @@ -1703,7 +1720,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) } } /* When !continuescan, there can't be any more matches, so stop */ - if (!continuescan) + if (!pstate.continuescan) break; offnum = OffsetNumberNext(offnum); @@ -1720,17 +1737,23 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) * only appear on non-pivot tuples on the right sibling page are * common. */ - if (continuescan && !P_RIGHTMOST(opaque)) + if (pstate.continuescan && !P_RIGHTMOST(opaque)) { - ItemId iid = PageGetItemId(page, P_HIKEY); - IndexTuple itup = (IndexTuple) PageGetItem(page, iid); - int truncatt; + IndexTuple itup; - truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation); - _bt_checkkeys(scan, itup, truncatt, dir, &continuescan, false); + if (pstate.finaltup) + itup = pstate.finaltup; + else + { + ItemId iid = PageGetItemId(page, P_HIKEY); + + itup = (IndexTuple) PageGetItem(page, iid); + } + + _bt_checkkeys(scan, &pstate, itup, true, false); } - if (!continuescan) + if (!pstate.continuescan) so->currPos.moreRight = false; Assert(itemIndex <= MaxTIDsPerBTreePage); @@ -1740,6 +1763,14 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) } else { + /* SK_SEARCHARRAY backward scans must provide final tuple up front */ + if (so->numArrayKeys && minoff < maxoff) + { + ItemId iid = PageGetItemId(page, minoff); + + pstate.finaltup = (IndexTuple) PageGetItem(page, iid); + } + /* load items[] in descending order */ itemIndex = MaxTIDsPerBTreePage; @@ -1751,6 +1782,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) IndexTuple itup; bool tuple_alive; bool passes_quals; + bool finaltup = (offnum == minoff); /* * If the scan 
specifies not to return killed tuples, then we @@ -1761,12 +1793,18 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) * tuple on the page, we do check the index keys, to prevent * uselessly advancing to the page to the left. This is similar * to the high key optimization used by forward scans. + * + * Separately, _bt_checkkeys actually requires that we call it + * with the final non-pivot tuple from the page, if there's one + * (final processed tuple, or first tuple in offset number terms). + * We must indicate which particular tuple comes last, too. */ if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) { Assert(offnum >= P_FIRSTDATAKEY(opaque)); - if (offnum > P_FIRSTDATAKEY(opaque)) + if (!finaltup) { + Assert(offnum > minoff); offnum = OffsetNumberPrev(offnum); continue; } @@ -1778,8 +1816,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) itup = (IndexTuple) PageGetItem(page, iid); - passes_quals = _bt_checkkeys(scan, itup, indnatts, dir, - &continuescan, requiredMatchedByPrecheck); + passes_quals = _bt_checkkeys(scan, &pstate, itup, finaltup, + requiredMatchedByPrecheck); /* * If the result of prechecking required keys was true, then in @@ -1787,8 +1825,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) * result is the same. 
*/ Assert(!requiredMatchedByPrecheck || - passes_quals == _bt_checkkeys(scan, itup, indnatts, dir, - &continuescan, false)); + passes_quals == _bt_checkkeys(scan, &pstate, itup, + finaltup, false)); if (passes_quals && tuple_alive) { /* tuple passes all scan key conditions */ @@ -1827,7 +1865,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) } } } - if (!continuescan) + if (!pstate.continuescan) { /* there can't be any more matches, so stop */ so->currPos.moreLeft = false; diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 1510b97fb..8318e6250 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -33,7 +33,7 @@ typedef struct BTSortArrayContext { - FmgrInfo flinfo; + FmgrInfo *orderproc; Oid collation; bool reverse; } BTSortArrayContext; @@ -41,15 +41,41 @@ typedef struct BTSortArrayContext static Datum _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, StrategyNumber strat, Datum *elems, int nelems); +static void _bt_sort_cmp_func_setup(IndexScanDesc scan, ScanKey skey); static int _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey, bool reverse, Datum *elems, int nelems); +static int _bt_merge_arrays(IndexScanDesc scan, ScanKey skey, bool reverse, + Datum *elems_orig, int nelems_orig, + Datum *elems_next, int nelems_next); static int _bt_compare_array_elements(const void *a, const void *b, void *arg); +static inline int32 _bt_compare_array_skey(FmgrInfo *orderproc, + Datum tupdatum, bool tupnull, + Datum arrdatum, ScanKey cur); +static int _bt_binsrch_array_skey(FmgrInfo *orderproc, + bool cur_elem_start, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *final_result); +static bool _bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir); +static bool _bt_tuple_before_array_skeys(IndexScanDesc scan, + BTReadPageState *pstate, + IndexTuple tuple); +static bool 
_bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, + IndexTuple tuple, bool skrequiredtrigger); +static void _bt_preprocess_keys_leafbuf(IndexScanDesc scan); +#ifdef USE_ASSERT_CHECKING +static bool _bt_verify_array_scankeys(IndexScanDesc scan); +#endif static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, ScanKey leftarg, ScanKey rightarg, bool *result); static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption); static void _bt_mark_scankey_required(ScanKey skey); +static bool _bt_check_compare(ScanDirection dir, BTScanOpaque so, + IndexTuple tuple, int tupnatts, TupleDesc tupdesc, + bool *continuescan, bool *skrequiredtrigger, + bool requiredMatchedByPrecheck); static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, ScanDirection dir, bool *continuescan); @@ -198,13 +224,48 @@ _bt_freestack(BTStack stack) * If there are any SK_SEARCHARRAY scan keys, deconstruct the array(s) and * set up BTArrayKeyInfo info for each one that is an equality-type key. * Prepare modified scan keys in so->arrayKeyData, which will hold the current - * array elements during each primitive indexscan operation. For inequality - * array keys, it's sufficient to find the extreme element value and replace - * the whole array with that scalar value. + * array elements. + * + * _bt_preprocess_keys treats each primitive scan as an independent piece of + * work. That structure pushes the responsibility for preprocessing that must + * work "across array keys" onto us. This division of labor makes sense once + * you consider that we're typically called no more than once per btrescan, + * whereas _bt_preprocess_keys is always called once per primitive index scan. + * + * Currently we perform two kinds of preprocessing to deal with redundancies. + * For inequality array keys, it's sufficient to find the extreme element + * value and replace the whole array with that scalar value. 
This eliminates + * all but one array key as redundant. Similarly, we are capable of "merging + * together" multiple equality array keys from two or more input scan keys + * into a single output scan key that contains only the intersecting array + * elements. This can eliminate many redundant array elements, as well as + * eliminating whole array scan keys as redundant. + * + * Note: _bt_start_array_keys actually sets up the cur_elem counters later on, + * once the scan direction is known. * * Note: the reason we need so->arrayKeyData, rather than just scribbling * on scan->keyData, is that callers are permitted to call btrescan without * supplying a new set of scankey data. + * + * Note: _bt_preprocess_keys is responsible for creating the so->keyData scan + * keys used by _bt_checkkeys. Index scans that don't use equality array keys + * will have _bt_preprocess_keys treat scan->keyData as input and so->keyData + * as output. Scans that use equality array keys have _bt_preprocess_keys + * treat so->arrayKeyData (which is our output) as their input, while (as per + * usual) outputting so->keyData for _bt_checkkeys. This function adds an + * additional layer of indirection that allows _bt_preprocess_keys to more or + * less avoid dealing with SK_SEARCHARRAY as a special case. + * + * Note: _bt_preprocess_keys_leafbuf works by updating already-processed + * output keys (so->keyData) in-place. It cannot eliminate redundant or + * contradictory scan keys. This necessitates having _bt_preprocess_keys + * understand that it is unsafe to eliminate "redundant" SK_SEARCHARRAY + * equality scan keys on the basis of what is actually just the current array + * key values -- it must conservatively assume that such a scan key might no + * longer be redundant after the next _bt_preprocess_keys_leafbuf call. + * Ideally we'd be able to deal with that by eliminating a subset of truly + * redundant array keys up-front, but it doesn't seem worth the trouble. 
*/ void _bt_preprocess_array_keys(IndexScanDesc scan) @@ -212,7 +273,9 @@ _bt_preprocess_array_keys(IndexScanDesc scan) BTScanOpaque so = (BTScanOpaque) scan->opaque; int numberOfKeys = scan->numberOfKeys; int16 *indoption = scan->indexRelation->rd_indoption; + int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(scan->indexRelation); int numArrayKeys; + int lastEqualityArrayAtt = -1; ScanKey cur; int i; MemoryContext oldContext; @@ -265,6 +328,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan) /* Allocate space for per-array data in the workspace context */ so->arrayKeys = (BTArrayKeyInfo *) palloc0(numArrayKeys * sizeof(BTArrayKeyInfo)); + so->orderProcs = (FmgrInfo *) palloc0(nkeyatts * sizeof(FmgrInfo)); /* Now process each array key */ numArrayKeys = 0; @@ -281,6 +345,16 @@ _bt_preprocess_array_keys(IndexScanDesc scan) int j; cur = &so->arrayKeyData[i]; + + /* + * Attributes with equality-type scan keys (including but not limited + * to array scan keys) will need a 3-way comparison function. Set + * that up now. (Avoids repeating work for the same attribute.) 
+ */ + if (cur->sk_strategy == BTEqualStrategyNumber && + !OidIsValid(so->orderProcs[cur->sk_attno - 1].fn_oid)) + _bt_sort_cmp_func_setup(scan, cur); + if (!(cur->sk_flags & SK_SEARCHARRAY)) continue; @@ -357,6 +431,46 @@ _bt_preprocess_array_keys(IndexScanDesc scan) (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0, elem_values, num_nonnulls); + /* + * If this scan key is semantically equivalent to a previous equality + * operator array scan key, merge the two arrays together to eliminate + * redundant non-intersecting elements (and redundant whole scan keys) + */ + if (lastEqualityArrayAtt == cur->sk_attno) + { + BTArrayKeyInfo *prev = &so->arrayKeys[numArrayKeys - 1]; + + Assert(so->arrayKeyData[prev->scan_key].sk_func.fn_oid == + cur->sk_func.fn_oid); + Assert(so->arrayKeyData[prev->scan_key].sk_subtype == + cur->sk_subtype); + + /* We could pfree(elem_values) after, but not worth the cycles */ + num_elems = _bt_merge_arrays(scan, cur, + (indoption[cur->sk_attno - 1] & INDOPTION_DESC) != 0, + prev->elem_values, prev->num_elems, + elem_values, num_elems); + + /* + * If there are no intersecting elements left from merging this + * array into the previous array on the same attribute, the scan + * qual is unsatisfiable + */ + if (num_elems == 0) + { + numArrayKeys = -1; + break; + } + + /* + * Lower the number of elements from the previous array, and mark + * this scan key/array as redundant for every primitive index scan + */ + prev->num_elems = num_elems; + cur->sk_flags |= SK_BT_RDDNARRAY; + continue; + } + /* * And set up the BTArrayKeyInfo data. 
*/ @@ -364,6 +478,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan) so->arrayKeys[numArrayKeys].num_elems = num_elems; so->arrayKeys[numArrayKeys].elem_values = elem_values; numArrayKeys++; + lastEqualityArrayAtt = cur->sk_attno; } so->numArrayKeys = numArrayKeys; @@ -437,26 +552,20 @@ _bt_find_extreme_element(IndexScanDesc scan, ScanKey skey, } /* - * _bt_sort_array_elements() -- sort and de-dup array elements + * Look up the appropriate comparison function in the opfamily. * - * The array elements are sorted in-place, and the new number of elements - * after duplicate removal is returned. - * - * scan and skey identify the index column, whose opfamily determines the - * comparison semantics. If reverse is true, we sort in descending order. + * Note: it's possible that this would fail, if the opfamily is incomplete, + * but it seems quite unlikely that an opfamily would omit non-cross-type + * support functions for any datatype that it supports at all. */ -static int -_bt_sort_array_elements(IndexScanDesc scan, ScanKey skey, - bool reverse, - Datum *elems, int nelems) +static void +_bt_sort_cmp_func_setup(IndexScanDesc scan, ScanKey skey) { + BTScanOpaque so = (BTScanOpaque) scan->opaque; Relation rel = scan->indexRelation; Oid elemtype; RegProcedure cmp_proc; - BTSortArrayContext cxt; - - if (nelems <= 1) - return nelems; /* no work to do */ + FmgrInfo *orderproc = &so->orderProcs[skey->sk_attno - 1]; /* * Determine the nominal datatype of the array elements. We have to @@ -471,12 +580,10 @@ _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey, * Look up the appropriate comparison function in the opfamily. * * Note: it's possible that this would fail, if the opfamily is - * incomplete, but it seems quite unlikely that an opfamily would omit - * non-cross-type support functions for any datatype that it supports at - * all. + * incomplete. 
*/ cmp_proc = get_opfamily_proc(rel->rd_opfamily[skey->sk_attno - 1], - elemtype, + rel->rd_opcintype[skey->sk_attno - 1], elemtype, BTORDER_PROC); if (!RegProcedureIsValid(cmp_proc)) @@ -484,8 +591,32 @@ _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey, BTORDER_PROC, elemtype, elemtype, rel->rd_opfamily[skey->sk_attno - 1]); + /* Save in orderproc entry for attribute */ + fmgr_info_cxt(cmp_proc, orderproc, so->arrayContext); +} + +/* + * _bt_sort_array_elements() -- sort and de-dup array elements + * + * The array elements are sorted in-place, and the new number of elements + * after duplicate removal is returned. + * + * scan and skey identify the index column, whose opfamily determines the + * comparison semantics. If reverse is true, we sort in descending order. + */ +static int +_bt_sort_array_elements(IndexScanDesc scan, ScanKey skey, + bool reverse, + Datum *elems, int nelems) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTSortArrayContext cxt; + + if (nelems <= 1) + return nelems; /* no work to do */ + /* Sort the array elements */ - fmgr_info(cmp_proc, &cxt.flinfo); + cxt.orderproc = &so->orderProcs[skey->sk_attno - 1]; cxt.collation = skey->sk_collation; cxt.reverse = reverse; qsort_arg(elems, nelems, sizeof(Datum), @@ -496,6 +627,48 @@ _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey, _bt_compare_array_elements, &cxt); } +/* + * _bt_merge_arrays() -- merge together duplicate array keys + * + * Both scan key's have array elements that have already been sorted and + * deduplicated. 
+ */ +static int +_bt_merge_arrays(IndexScanDesc scan, ScanKey skey, bool reverse, + Datum *elems_orig, int nelems_orig, + Datum *elems_next, int nelems_next) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTSortArrayContext cxt; + Datum *merged = palloc(sizeof(Datum) * nelems_orig); + int merged_nelems = 0; + + /* + * Incrementally copy the original array into a temp buffer, skipping over + * any items that are missing from the "next" array + */ + cxt.orderproc = &so->orderProcs[skey->sk_attno - 1]; + cxt.collation = skey->sk_collation; + cxt.reverse = reverse; + for (int i = 0; i < nelems_orig; i++) + { + Datum *elem = elems_orig + i; + + if (bsearch_arg(elem, elems_next, nelems_next, sizeof(Datum), + _bt_compare_array_elements, &cxt)) + merged[merged_nelems++] = *elem; + } + + /* + * Overwrite the original array with temp buffer so that we're only left + * with intersecting array elements + */ + memcpy(elems_orig, merged, merged_nelems * sizeof(Datum)); + pfree(merged); + + return merged_nelems; +} + /* * qsort_arg comparator for sorting array elements */ @@ -507,7 +680,7 @@ _bt_compare_array_elements(const void *a, const void *b, void *arg) BTSortArrayContext *cxt = (BTSortArrayContext *) arg; int32 compare; - compare = DatumGetInt32(FunctionCall2Coll(&cxt->flinfo, + compare = DatumGetInt32(FunctionCall2Coll(cxt->orderproc, cxt->collation, da, db)); if (cxt->reverse) @@ -515,6 +688,161 @@ _bt_compare_array_elements(const void *a, const void *b, void *arg) return compare; } +/* + * Comparator used to search for the next array element when array keys need + * to be advanced via one or more binary searches + * + * This routine returns: + * <0 if tupdatum < arrdatum; + * 0 if tupdatum == arrdatum; + * >0 if tupdatum > arrdatum. + * + * This is essentially the same interface as _bt_compare: both functions + * compare the value that they're searching for to a binary search pivot. 
+ * However, unlike _bt_compare, this function's "tuple argument" comes first, + * while its "array/scankey argument" comes second. +*/ +static inline int32 +_bt_compare_array_skey(FmgrInfo *orderproc, + Datum tupdatum, bool tupnull, + Datum arrdatum, ScanKey cur) +{ + int32 result = 0; + + Assert(cur->sk_strategy == BTEqualStrategyNumber); + Assert((cur->sk_flags & SK_ROW_HEADER) == 0); + + if (cur->sk_flags & SK_ISNULL) /* array/scan key is NULL */ + { + if (tupnull) + result = 0; /* NULL "=" NULL */ + else if (cur->sk_flags & SK_BT_NULLS_FIRST) + result = 1; /* NULL "<" NOT_NULL */ + else + result = -1; /* NULL ">" NOT_NULL */ + } + else if (tupnull) /* array/scan key is NOT_NULL and tuple item + * is NULL */ + { + if (cur->sk_flags & SK_BT_NULLS_FIRST) + result = -1; /* NOT_NULL ">" NULL */ + else + result = 1; /* NOT_NULL "<" NULL */ + } + else + { + /* + * Like _bt_compare, we need to be careful of cross-type comparisons, + * so the left value has to be the value that came from an index + * tuple. (Array scan keys cannot be cross-type, but other required + * scan keys that use an equal operator can be.) + */ + result = DatumGetInt32(FunctionCall2Coll(orderproc, cur->sk_collation, + tupdatum, arrdatum)); + + /* + * Unlike _bt_compare, we flip the sign when column is a DESC column + * (and *not* when column is ASC). This matches the approach taken by + * _bt_check_rowcompare, which performs similar three-way comparisons. + */ + if (cur->sk_flags & SK_BT_DESC) + INVERT_COMPARE_RESULT(result); + } + + return result; +} + +/* + * _bt_binsrch_array_skey() -- Binary search for next matching array key + * + * cur_elem_start indicates if the binary search should begin at the array's + * current element (or have the current element as an upper bound if it's a + * backward scan). This (and information about the scan's direction) allows + * searches against required scan key arrays to reuse earlier search bounds as + * an optimization. 
+ * + * Returns an index to the first array element >= caller's tupdatum argument. + * Also sets *final_result to whatever _bt_compare_array_skey returned when we + * directly compared the returned array element to caller's tupdatum argument. + */ +static int +_bt_binsrch_array_skey(FmgrInfo *orderproc, + bool cur_elem_start, ScanDirection dir, + Datum tupdatum, bool tupnull, + BTArrayKeyInfo *array, ScanKey cur, + int32 *final_result) +{ + int low_elem, + mid_elem, + high_elem, + result = 0; + + Assert(cur->sk_flags & SK_SEARCHARRAY); + Assert(cur->sk_strategy == BTEqualStrategyNumber); + Assert(!cur_elem_start || + array->elem_values[array->cur_elem] == cur->sk_argument); + + if (ScanDirectionIsForward(dir)) + { + if (cur_elem_start) + low_elem = array->cur_elem; + else + low_elem = 0; + high_elem = array->num_elems - 1; + } + else + { + low_elem = 0; + if (cur_elem_start) + high_elem = array->cur_elem; + else + high_elem = array->num_elems - 1; + } + mid_elem = -1; + + while (high_elem > low_elem) + { + Datum arrdatum; + + mid_elem = low_elem + ((high_elem - low_elem) / 2); + arrdatum = array->elem_values[mid_elem]; + + result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, + arrdatum, cur); + + if (result == 0) + { + /* + * Each array was deduplicated during initial preprocessing, so + * it's safe to quit as soon as we see an equal array element. + * This often saves an extra comparison or two... + */ + low_elem = mid_elem; + break; + } + + if (result > 0) + low_elem = mid_elem + 1; + else + high_elem = mid_elem; + } + + /* + * ...but our caller also cares about how its searched-for tuple datum + * compares to the array element we'll return. We set *final_result with + * the result of that comparison specifically. + * + * Avoid setting *final_result to the wrong comparison's result. 
+ */ + if (low_elem != mid_elem) + result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, + array->elem_values[low_elem], cur); + + *final_result = result; + + return low_elem; +} + /* * _bt_start_array_keys() -- Initialize array keys at start of a scan * @@ -539,30 +867,35 @@ _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir) curArrayKey->cur_elem = 0; skey->sk_argument = curArrayKey->elem_values[curArrayKey->cur_elem]; } - - so->arraysStarted = true; } /* - * _bt_advance_array_keys() -- Advance to next set of array elements + * _bt_advance_array_keys_increment() -- Advance to next set of array elements + * + * Advances the array keys by a single increment in the current scan + * direction. When there are multiple array keys this can roll over from the + * lowest order array to higher order arrays. * * Returns true if there is another set of values to consider, false if not. * On true result, the scankeys are initialized with the next set of values. + * On false result, the scankeys stay the same, and the array keys are not + * advanced (every array is still at its final element for scan direction). */ -bool -_bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir) +static bool +_bt_advance_array_keys_increment(IndexScanDesc scan, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; bool found = false; - int i; + + Assert(!so->needPrimScan); /* * We must advance the last array key most quickly, since it will * correspond to the lowest-order index column among the available - * qualifications. This is necessary to ensure correct ordering of output - * when there are multiple array keys. + * qualifications. Rolling over like this is necessary to ensure correct + * ordering of output when there are multiple array keys. 
*/ - for (i = so->numArrayKeys - 1; i >= 0; i--) + for (int i = so->numArrayKeys - 1; i >= 0; i--) { BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; ScanKey skey = &so->arrayKeyData[curArrayKey->scan_key]; @@ -596,19 +929,31 @@ _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir) break; } - /* advance parallel scan */ - if (scan->parallel_scan != NULL) - _bt_parallel_advance_array_keys(scan); + if (found) + return true; /* - * When no new array keys were found, the scan is "past the end" of the - * array keys. _bt_start_array_keys can still "restart" the array keys if - * a rescan is required. + * Don't allow the entire set of array keys to roll over: restore the + * array keys to the state they were in before we were called. + * + * This ensures that the array keys only ratchet forward (or backwards in + * the case of backward scans). Our "so->arrayKeyData" scan keys should + * always match the current "so->keyData" search-type scan keys (except + * for a brief moment during array key advancement). */ - if (!found) - so->arraysStarted = false; + for (int i = 0; i < so->numArrayKeys; i++) + { + BTArrayKeyInfo *rollarray = &so->arrayKeys[i]; + ScanKey skey = &so->arrayKeyData[rollarray->scan_key]; - return found; + if (ScanDirectionIsBackward(dir)) + rollarray->cur_elem = 0; + else + rollarray->cur_elem = rollarray->num_elems - 1; + skey->sk_argument = rollarray->elem_values[rollarray->cur_elem]; + } + + return false; } /* @@ -622,6 +967,8 @@ _bt_mark_array_keys(IndexScanDesc scan) BTScanOpaque so = (BTScanOpaque) scan->opaque; int i; + Assert(_bt_verify_array_scankeys(scan)); + for (i = 0; i < so->numArrayKeys; i++) { BTArrayKeyInfo *curArrayKey = &so->arrayKeys[i]; @@ -661,20 +1008,691 @@ _bt_restore_array_keys(IndexScanDesc scan) * If we changed any keys, we must redo _bt_preprocess_keys. That might * sound like overkill, but in cases with multiple keys per index column * it seems necessary to do the full set of pushups. 
- * - * Also do this whenever the scan's set of array keys "wrapped around" at - * the end of the last primitive index scan. There won't have been a call - * to _bt_preprocess_keys from some other place following wrap around, so - * we do it for ourselves. */ - if (changed || !so->arraysStarted) + if (changed) { _bt_preprocess_keys(scan); /* The mark should have been set on a consistent set of keys... */ Assert(so->qual_ok); } + + Assert(_bt_verify_array_scankeys(scan)); } +/* + * Routine to determine if a continuescan=false tuple (set that way by an + * initial call to _bt_check_compare) must advance the scan's array keys. + * Only call here when _bt_check_compare already set continuescan=false. + * + * Returns true when caller passes a tuple that is < the current set of array + * keys for the most significant non-equal column/scan key (or > for backwards + * scans). This means that it cannot possibly be time to advance the array + * keys just yet. _bt_checkkeys caller should suppress its _bt_check_compare + * call, and return -- the tuple is treated as not satisfying our indexquals. + * + * Returns false when caller's tuple is >= the current array keys (or <=, in + * the case of backwards scans). This means that it is now time for our + * caller to advance the array keys (unless caller broke the rules by not + * checking with _bt_check_compare before calling here). + * + * Note: advancing the array keys may be required when every attribute value + * from caller's tuple is equal to corresponding scan key/array datums. See + * function header comments at the start of _bt_advance_array_keys for more. 
+ */ +static bool +_bt_tuple_before_array_skeys(IndexScanDesc scan, BTReadPageState *pstate, + IndexTuple tuple) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Relation rel = scan->indexRelation; + ScanDirection dir = pstate->dir; + TupleDesc itupdesc = RelationGetDescr(rel); + bool tuple_before_array_keys = false; + ScanKey cur; + int ntupatts = BTreeTupleGetNAtts(tuple, rel), + ikey; + + Assert(so->numArrayKeys > 0); + Assert(so->numberOfKeys > 0); + Assert(!so->needPrimScan); + + for (cur = so->keyData, ikey = 0; ikey < so->numberOfKeys; cur++, ikey++) + { + int attnum = cur->sk_attno; + FmgrInfo *orderproc; + Datum tupdatum; + bool tupnull, + skrequired; + int32 result; + + /* + * We only deal with equality strategy scan keys. We leave handling + * of inequalities up to _bt_check_compare. + */ + if (cur->sk_strategy != BTEqualStrategyNumber) + continue; + + /* + * Determine if this scan key is required. + * + * Equality strategy scan keys are either required in both directions + * or neither direction, so the current scan direction doesn't need to + * be tested here. + */ + skrequired = (cur->sk_flags & SK_BT_REQFWD); + Assert(!skrequired || (cur->sk_flags & SK_BT_REQBKWD)); + + /* + * Unlike _bt_advance_array_keys, we never deal with any non-required + * array keys. Cases where skrequiredtrigger is set to false by + * _bt_check_compare should never call here. We are only called after + * _bt_check_compare provisionally indicated that the scan should be + * terminated due to a _required_ scan key not being satisfied. + * + * We expect _bt_check_compare to notice and report required scan keys + * before non-required ones. _bt_advance_array_keys might still have + * to advance non-required array keys in passing for a tuple that we + * were called for, but it doesn't need advanced notice of that from + * us. 
+ */ + if (!skrequired) + break; + + if (attnum > ntupatts) + { + /* + * When we reach a high key's truncated attribute, assume that the + * tuple attribute's value is >= the scan's search-type scan keys + */ + break; + } + + tupdatum = index_getattr(tuple, attnum, itupdesc, &tupnull); + + orderproc = &so->orderProcs[attnum - 1]; + result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, + cur->sk_argument, cur); + + if (result != 0) + { + if (ScanDirectionIsForward(dir)) + tuple_before_array_keys = result < 0; + else + tuple_before_array_keys = result > 0; + + break; + } + } + + return tuple_before_array_keys; +} + +/* + * _bt_array_keys_remain() -- Start another primitive index scan? + * + * Returns true if _bt_checkkeys determined that another primitive index scan + * must take place by calling _bt_first. Otherwise returns false, indicating + * that caller's top-level scan is now past the point where further matching + * index tuples can be found (for the current scan direction). + * + * Only call here during scans with one or more equality type array scan keys. + * All other scans should just call _bt_first once, no matter what. + * + * Top-level index scans executed via multiple primitive index scans must not + * fail to output index tuples in the usual order for the index -- just like + * any other index scan would. The state machine that manages the scan's + * array keys must only start primitive index scans when they cover key space + * strictly greater than the key space for tuples that the scan has already + * returned (or strictly less in the backwards scan case). Otherwise the scan + * could output the same index tuples more than once, or in the wrong order. + * + * This is managed by limiting the cases that can trigger new primitive index + * scans to those involving required array scan keys and/or other required + * scan keys that use the equality strategy. 
In particular, the state machine + * must not allow high order required scan keys using an inequality strategy + * (which are only required in one scan direction) to directly trigger a new + * primitive index scan that advances low order non-required array scan keys. + * For example, a query such as "SELECT thousand, tenthous FROM tenk1 WHERE + * thousand < 2 AND tenthous IN (1001,3000) ORDER BY thousand" whose execution + * involves a scan of an index on "(thousand, tenthous)" must perform no more + * than a single primitive index scan. Otherwise we risk outputting tuples in + * the wrong order. Array key values for the non-required scan key on the + * "tenthous" column must not dictate top-level scan order. Primitive index + * scans mustn't scan tuples already scanned by some earlier primitive scan. + * + * In fact, nbtree makes a stronger guarantee than is strictly necessary here: + * it guarantees that the top-level scan won't repeat any leaf page reads. + * (Actually, that can still happen when the scan is repositioned, or the scan + * direction changes -- but that's just as true with other types of scans.) + */ +bool +_bt_array_keys_remain(IndexScanDesc scan, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + Assert(so->numArrayKeys); + + /* + * Array keys are advanced within _bt_checkkeys when the scan reaches the + * leaf level (more precisely, they're advanced when the scan reaches the + * end of each distinct set of array elements). This process avoids + * repeat access to leaf pages (across multiple primitive index scans) by + * opportunistically advancing the scan's array keys when it allows the + * primitive index scan to find nearby matching tuples (or to eliminate + * array keys with no matching tuples from further consideration). + * + * _bt_checkkeys sets a simple flag variable that we check here. This + * tells us if we need to perform another primitive index scan for the + * now-current array keys or not. 
We'll unset the flag once again to + * acknowledge having started a new primitive scan (or we'll see that it + * isn't set and end the top-level scan right away). + * + * We cannot rely on _bt_first always reaching _bt_checkkeys here. There + * are various scenarios where that won't happen. For example, if the + * index is completely empty, then _bt_first won't get as far as calling + * _bt_readpage/_bt_checkkeys. + * + * We also don't expect _bt_checkkeys to be reached when searching for a + * non-existent value that happens to be higher than any existing value in + * the index. No _bt_checkkeys are expected when _bt_readpage reads the + * rightmost page during such a scan -- even a _bt_checkkeys call against + * the high key won't happen. There is an analogous issue for backwards + * scans that search for a value lower than all existing index tuples. + * + * We don't actually require special handling for these cases -- we don't + * need to be explicitly instructed to _not_ perform another primitive + * index scan. This is correct for all of the cases we've listed so far, + * which all involve primitive index scans that access pages "near the + * boundaries of the key space" (the leftmost page, the rightmost page, or + * an imaginary empty leaf root page). If _bt_checkkeys cannot be reached + * by a primitive index scan for one set of array keys, it follows that it + * also won't be reached for any later set of array keys. + * + * There is one exception: the case where _bt_first's _bt_preprocess_keys + * call determined that the scan's input scan keys can never be satisfied. + * That might be true for one set of array keys, but not the next set. 
+ */ + if (!so->qual_ok) + { + /* + * Defensively check for interrupts -- the scan's next call to + * _bt_first won't be able to do so if the next set of keys also turns + * out to be unsatisfiable + */ + CHECK_FOR_INTERRUPTS(); + + /* Can't use _bt_advance_array_keys so use incremental advancement */ + so->needPrimScan = false; + if (_bt_advance_array_keys_increment(scan, dir)) + return true; + } + + /* Time for another primitive index scan? */ + if (so->needPrimScan) + { + /* Have our caller call _bt_first once more */ + so->needPrimScan = false; + if (scan->parallel_scan != NULL) + _bt_parallel_next_primitive_scan(scan); + + return true; + } + + if (scan->parallel_scan != NULL) + _bt_parallel_done(scan); + + /* + * No more primitive index scans. Terminate the top-level scan. + */ + return false; +} + +/* + * _bt_advance_array_keys() -- Advance array elements using a tuple + * + * Returns true if all required equality-type scan keys (in particular, those + * that are array keys) now have exact matching values to those from tuple. + * Returns false when the tuple isn't an exact match in this sense. + * + * Sets pstate.continuescan for caller when we return false. When we return + * true it's up to caller to call _bt_check_compare to recheck the tuple. The + * second call should be allowed to set pstate.continuescan=false without + * further intervention, since tuple must be <= the array keys after we're + * called (actually, that guarantee applies to all required equality-type scan + * keys, and does not apply to non-required array keys). + * + * When called with skrequiredtrigger=false, the call only expects to have to + * deal with non-required equality array keys. The rules are a little + * different during these calls. We'll always set pstate.continuescan=true, + * since (by definition) a non-required scan key never terminates the scan. 
+ * + * If we reach the end of all of the required array keys for the current scan + * direction, we will effectively end the top-level index scan. + * + * This function will always advance the array keys by at least one increment + * (except when it ends the top-level index scan having reached a tuple beyond + * the scan's final array key, and except during !skrequiredtrigger calls). + * + * _bt_tuple_before_array_skeys is responsible for determining if the current + * place in the scan is >= the current array keys. Calling here before that + * point will prematurely advance the array keys, leading to wrong query + * results (though this precondition is checked here via an assertion). + * + * We're responsible for ensuring that caller's tuple is <= current/newly + * advanced required array keys once we return (this postcondition is also + * checked via another assertion). We try to find an exact match, but failing + * that we'll advance the array keys to whatever set of keys comes next in the + * key space (among the keys that we actually have). In general, the scan's + * array keys can only ever "ratchet forwards", progressing in lock step with + * the scan. + * + * (The invariants are the same for backwards scans, except that the operators + * are flipped: just replace the precondition's >= operator with a <=, and the + * postcondition's <= operator with a >=. In other words, just swap the + * precondition with the postcondition.) + * + * Note that we may sometimes need to advance the array keys in spite of the + * existing array keys already being an exact match for every corresponding + * value from caller's tuple. We fall back on "incrementally" advancing the + * array keys in these cases, which all involve non-array scan keys. For + * example, with a composite index on (a, b) and a qual "WHERE a IN (3,5) AND + * b < 42", we'll be called for both "a" keys (i.e. keys 3 and 5) when the + * scan reaches tuples where "b >= 42". 
Even though "a" array keys continue + * to have exact matches for tuples "b >= 42" (for both array key groupings), + * we will still advance the array for "a" via our fallback on incremental + * advancement each time we're called. The first time we're called (when the + * scan reaches a tuple >= "(3, 42)"), we advance the array key (from 3 to 5). + * This gives our caller the option of starting a new primitive index scan + * that quickly locates the start of tuples > "(5, -inf)". The second time + * we're called (when the scan reaches a tuple >= "(5, 42)"), we incrementally + * advance the keys a second time. This second call ends the top-level scan. + * + * Note also that we deal with all required equality-type scan keys here; it's + * not limited to array scan keys. We need to handle non-array equality cases + * here because they're equality constraints for the scan, in the same way + * that array scan keys are. + */ +static bool +_bt_advance_array_keys(IndexScanDesc scan, BTReadPageState *pstate, + IndexTuple tuple, bool skrequiredtrigger) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Relation rel = scan->indexRelation; + ScanDirection dir = pstate->dir; + TupleDesc itupdesc = RelationGetDescr(rel); + ScanKey cur; + int ikey, + arrayidx = 0, + ntupatts = BTreeTupleGetNAtts(tuple, rel); + bool arrays_advanced = false, + arrays_exhausted, + beyond_end_advance = false, + all_eqtype_sk_equal = true, + all_required_eqtype_sk_equal PG_USED_FOR_ASSERTS_ONLY = true; + + /* + * Must only be called when tuple is >= current required array keys + * (except during backwards scans, when it must be <= the array keys) + */ + Assert(_bt_verify_array_scankeys(scan)); + Assert(!skrequiredtrigger || + !_bt_tuple_before_array_skeys(scan, pstate, tuple)); + + /* + * Try to advance array keys via a series of binary searches. 
+ * + * Loop iterates through the current scankeys (so->keyData, which were + * output by _bt_preprocess_keys earlier) and then sets input scan keys + * (so->arrayKeyData scan keys) to new array values. + */ + for (cur = so->keyData, ikey = 0; ikey < so->numberOfKeys; cur++, ikey++) + { + BTArrayKeyInfo *array = NULL; + ScanKey skeyarray = NULL; + FmgrInfo *orderproc; + int attnum = cur->sk_attno, + set_elem = 0; + Datum tupdatum; + bool skrequired, + tupnull; + int32 result; + + /* + * We only deal with equality strategy scan keys. We leave handling + * of inequalities up to _bt_check_compare. + */ + if (cur->sk_strategy != BTEqualStrategyNumber) + continue; + + /* + * Determine if this scan key is required. + * + * Equality strategy scan keys are either required in both directions + * or neither direction, so the current scan direction doesn't need to + * be tested here. + */ + skrequired = (cur->sk_flags & SK_BT_REQFWD); + Assert(!skrequired || (cur->sk_flags & SK_BT_REQBKWD)); + + /* + * Set up ORDER 3-way comparison function and array state + */ + orderproc = &so->orderProcs[attnum - 1]; + if (cur->sk_flags & SK_SEARCHARRAY) + { + Assert(arrayidx < so->numArrayKeys); + array = &so->arrayKeys[arrayidx++]; + skeyarray = &so->arrayKeyData[array->scan_key]; + Assert(skeyarray->sk_attno == attnum); + } + + /* + * Optimization: Skip over required scan keys when we know that + * they can't have changed (because _bt_check_compare triggered this + * call due to encountering an unsatisfied non-required array qual) + */ + if (skrequired && !skrequiredtrigger) + { + Assert(!beyond_end_advance && !arrays_advanced); + + continue; + } + + /* + * Here we perform steps for all array scan keys after a required + * array scan key whose binary search triggered "beyond end of array + * element" array advancement due to encountering a tuple attribute + * value > the closest matching array key (or < for backwards scans). 
+ * + * We help to make sure that the array keys are ultimately advanced + * such that caller's tuple is < final array keys (or > final keys). + * We're behind the scan right now, but we'll fully "catch up" once + * outside the loop (we'll be immediately ahead of this tuple). See + * below for a detailed explanation. + * + * NB: We must do this for all arrays -- not just required arrays. + * Otherwise the final incremental array advancement step (that takes + * place just outside the loop) won't "carry" in the way we expect. + */ + if (beyond_end_advance) + { + int final_elem_dir; + + Assert(skrequiredtrigger); + Assert(!all_eqtype_sk_equal && !all_required_eqtype_sk_equal); + + if (ScanDirectionIsBackward(dir) || !array) + final_elem_dir = 0; + else + final_elem_dir = array->num_elems - 1; + + if (array && array->cur_elem != final_elem_dir) + { + array->cur_elem = final_elem_dir; + skeyarray->sk_argument = array->elem_values[final_elem_dir]; + arrays_advanced = true; + } + + continue; + } + + /* + * Here we perform steps for any required scan keys after the first + * required scan key whose tuple attribute was < the closest matching + * array key when we dealt with it (or > for backwards scans). + * + * This earlier required array key already puts us ahead of caller's + * tuple in the key space (for the current scan direction). We must + * make sure that subsequent lower-order array keys do not put us too + * far ahead (ahead of tuples that have yet to be seen by our caller). + * For example, when a tuple "(a, b) = (42, 5)" advances the array + * keys on "a" from 40 to 45, we must also set "b" to whatever the + * first array element for "b" is. It would be wrong to allow "b" to + * be set to a value from the tuple, since the value is actually from + * a different part of the key space. + * + * Also perform the same steps with truncated high key attributes. + * You can think of this as a "binary search" for the element closest + * to the value -inf. 
This is another case where we have to avoid + * getting too far ahead of the scan. + */ + if (!all_eqtype_sk_equal || attnum > ntupatts) + { + int first_elem_dir; + + Assert((skrequiredtrigger && arrays_advanced) || + attnum > ntupatts); + Assert(!beyond_end_advance); + + if (ScanDirectionIsForward(dir) || !array) + first_elem_dir = 0; + else + first_elem_dir = array->num_elems - 1; + + if (array && array->cur_elem != first_elem_dir) + { + array->cur_elem = first_elem_dir; + skeyarray->sk_argument = array->elem_values[first_elem_dir]; + arrays_advanced = true; + } + + continue; + } + + /* + * Search in scankey's array for the corresponding tuple attribute + * value from caller's tuple + */ + tupdatum = index_getattr(tuple, attnum, itupdesc, &tupnull); + + if (!array) + { + if (!skrequired) + continue; + + /* + * This is a required non-array equality strategy scan key, which + * we'll treat as a degenerate single value array + */ + result = _bt_compare_array_skey(orderproc, tupdatum, tupnull, + cur->sk_argument, cur); + } + else + { + /* Determine if search bounds are reusable (optimization) */ + bool cur_elem_start = (skrequired && !arrays_advanced); + + /* + * Binary search for closest match that's available from the array + */ + set_elem = _bt_binsrch_array_skey(orderproc, cur_elem_start, dir, + tupdatum, tupnull, array, cur, + &result); + } + + /* Consider advancing array keys */ + Assert(!array || (set_elem >= 0 && set_elem < array->num_elems)); + if (array && array->cur_elem != set_elem) + { + array->cur_elem = set_elem; + skeyarray->sk_argument = array->elem_values[set_elem]; + arrays_advanced = true; + + /* + * We shouldn't have to advance a required array when called due + * to _bt_check_compare determining that a non-required array + * needs to be advanced. We expect _bt_check_compare to notice + * and report required scan keys before non-required ones. 
+ */ + Assert(skrequiredtrigger || !skrequired); + } + + /* + * Consider "beyond end of array element" array advancement. + * + * When the tuple attribute value is > the closest matching array key + * (or < in the backwards scan case), we need to ratchet the array + * forward (backward) by one position, so that the array is set to a + * value > the tuple attribute value instead (or to a value < tuple's + * value). + * + * This process has to work for all of the arrays, not just this one: + * it must "carry" to higher-order arrays when the set_elem that we + * just used for this array happens to have been the final element + * (final for the current scan direction). That's why we don't handle + * this issue by modifying this array's set_elem (that won't "carry"). + * + * Our approach is to set each subsequent lower-order array to its + * final element. We'll then advance the array keys incrementally, + * just outside the loop. That way earlier/higher order arrays + * (arrays before _this_ array) can advance as and when required. + * + * The array keys advance a little like the way that a mileage gauge + * advances. Imagine a mechanical display that rolls over from 999 to + * 000 every time we drive our car another 1,000 miles. Each decimal + * digit behaves a little like an array from the array state machine + * implemented by this function. + * + * Suppose we have 3 array keys a, b, and c. Each "digit"/array has + * 10 distinct elements that happen to match across each array: values + * 0 through to 9. Caller's tuple "(a, b, c) = (3, 7.9, 2)" might + * initially have its "b" array advanced up to the value 7 (7 being + * the closest match the "b" array has), and its "c" array advanced up + * to 9. The incremental advancement step (outside the loop) will + * then finish the process by "advancing" (actually, rolling over) the + * array on "c" to the value 0, which would immediately carry over to + * "b", which will then advance to the value 8 ("rounding up" from 7). 
+ * Under this scheme, the array keys only ever ratchet forward, and + * array key advancement by us takes place as infrequently as possible + * (see also: this function's postcondition assertions, below). + * + * Incremental advancement can also carry all the way past the most + * significant array, exhausting all of the scan's array keys in one + * step. Suppose, for example, that a later call here passes a tuple + * "(a, b, c) = (9, 9.9, 4)". Once again we can't find an exact match + * for "b", so we'll set beyond_end_advance. This time, incremental + * advancement rolls over all the way past "a", the most significant + * array. _bt_advance_array_keys_increment will return false when + * this happens, indicating that all array keys are now exhausted. + * This triggers the end of the top-level index scan below. + */ + Assert(!beyond_end_advance); + if (skrequired && + ((ScanDirectionIsForward(dir) && result > 0) || + (ScanDirectionIsBackward(dir) && result < 0))) + beyond_end_advance = true; + + /* + * Also track whether all attributes from the tuple are equal to the + * array keys that we'll be advancing to (or to existing array keys + * that didn't need to be advanced) + */ + if (result != 0) + { + all_eqtype_sk_equal = false; + if (skrequired) + all_required_eqtype_sk_equal = false; + + /* Just skip if triggered by a non-required scan key */ + if (!skrequiredtrigger) + break; + } + } + + /* + * Consider if we need to advance the array keys incrementally to finish + * off "beyond end of array element" array advancement. + * + * Also fall back on incremental advancement in cases where we couldn't + * advance the array keys any other way. See function header comments for + * an example of this, where inequality-type scan keys alone drive array + * key advancement. (We don't directly deal with inequality type scan + * keys here, but cases that use the fallback must involve inequalities.) 
+ */ + arrays_exhausted = false; + if ((beyond_end_advance || !arrays_advanced) && skrequiredtrigger) + { + /* Fallback case must have all-equal equality type scan keys */ + Assert(beyond_end_advance || all_required_eqtype_sk_equal); + + if (!_bt_advance_array_keys_increment(scan, dir)) + arrays_exhausted = true; + else + arrays_advanced = true; + + /* + * The newly advanced array keys won't be equal anymore, so remember + * that in order to avoid a second _bt_check_compare call for tuple + */ + all_eqtype_sk_equal = all_required_eqtype_sk_equal = false; + } + + Assert(arrays_exhausted || arrays_advanced || !skrequiredtrigger); + + /* + * If we haven't yet exhausted all required array scan keys, allow the + * ongoing primitive index scan to continue + */ + pstate->continuescan = !arrays_exhausted; + + /* Cannot set continuescan=false when called for non-required array */ + Assert(pstate->continuescan || skrequiredtrigger); + + if (arrays_advanced) + { + /* + * We advanced the array keys, and so must perform a targeted form of + * in-place preprocessing of the scan's search-type scan keys. + * + * If we missed this final step then any call to _bt_check_compare + * would use stale array keys until such time as _bt_preprocess_keys + * was once again called by _bt_first. But it's a good idea to do + * this even when there won't be another primitive index scan. + */ + _bt_preprocess_keys_leafbuf(scan); + + /* + * If any required array keys were advanced, be prepared to recheck + * the final tuple against the new array keys (as an optimization) + */ + if (skrequiredtrigger) + pstate->finaltupchecked = false; + } + + /* + * Postcondition assertions. + * + * Tuple must now be <= current/newly advanced required array keys. Same + * goes for other required equality type scan keys, which are "degenerate + * single value arrays" for our purposes. (As usual the rule is the same + * for backwards scans, but the operator is flipped: tuple must be >= new + * array keys.) 
+ * + * We're stricter than that in cases where the tuple was already equal to + * the previous array keys when we were called: tuple must now be < the + * new array keys (or > the array keys). This is a consequence of the + * fallback on incremental advancement used to indirectly handle cases + * where an inequality triggers array key advancement. (See function + * header comments for an example of this.) + * + * Our caller decides when to start primitive index scans based in part on + * the current array keys. It always needs to see a precise array-wise + * picture of the scan's progress. If we ever advanced the array keys by + * less than the exact maximum safe amount, our caller might go on to make + * subtly wrong decisions about when to quit the ongoing primitive scan. + * (These assertions won't reliably detect every case where the array keys + * haven't advanced by the expected/maximum amount, but they come close.) + */ + Assert(_bt_verify_array_scankeys(scan)); + Assert(_bt_tuple_before_array_skeys(scan, pstate, tuple) == + (!all_required_eqtype_sk_equal && !arrays_exhausted)); + + /* All-equal required equality keys shouldn't be from before this call */ + Assert(!all_required_eqtype_sk_equal || !skrequiredtrigger || + arrays_advanced || arrays_exhausted); + + return all_eqtype_sk_equal && pstate->continuescan; +} /* * _bt_preprocess_keys() -- Preprocess scan keys @@ -749,6 +1767,21 @@ _bt_restore_array_keys(IndexScanDesc scan) * Again, missing cross-type operators might cause us to fail to prove the * quals contradictory when they really are, but the scan will work correctly. * + * Index scans with array keys need to be able to advance each array's keys + * and make them the current search-type scan keys without calling here. They + * expect to be able to call _bt_preprocess_keys_leafbuf instead (a stripped + * down version of this function that's specialized to array key index scans).
+ * We need to be careful about that case here when we determine redundancy; + * equality quals must not be eliminated as redundant on the basis of array + * input keys that might change before another call here takes place. + * + * Note, however, that the presence of an array scan key doesn't affect how we + * determine if index quals are contradictory. Contradictory qual scans move + * on to the next primitive index scan right away, by incrementing the scan's + * array keys once control reaches _bt_array_keys_remain. There won't ever be + * a call to _bt_preprocess_keys_leafbuf before the next call here, so there + * is nothing for us to break. + * * Row comparison keys are currently also treated without any smarts: * we just transfer them into the preprocessed array without any * editorialization. We can treat them the same as an ordinary inequality @@ -895,8 +1928,11 @@ _bt_preprocess_keys(IndexScanDesc scan) so->qual_ok = false; return; } - /* else discard the redundant non-equality key */ - xform[j] = NULL; + else if (!(eq->sk_flags & SK_SEARCHARRAY)) + { + /* else discard the redundant non-equality key */ + xform[j] = NULL; + } } /* else, cannot determine redundancy, keep both keys */ } @@ -986,6 +2022,22 @@ _bt_preprocess_keys(IndexScanDesc scan) continue; } + /* + * Is this an array scan key that _bt_preprocess_array_keys merged + * with some earlier array key during its initial preprocessing pass? + */ + if (cur->sk_flags & SK_BT_RDDNARRAY) + { + /* + * key is redundant for this primitive index scan (and will be + * redundant during all subsequent primitive index scans) + */ + Assert(cur->sk_flags & SK_SEARCHARRAY); + Assert(j == (BTEqualStrategyNumber - 1)); + Assert(so->numArrayKeys > 0); + continue; + } + /* have we seen one of these before? 
*/ if (xform[j] == NULL) { @@ -999,7 +2051,26 @@ _bt_preprocess_keys(IndexScanDesc scan) &test_result)) { if (test_result) - xform[j] = cur; + { + if (j == (BTEqualStrategyNumber - 1) && + ((xform[j]->sk_flags & SK_SEARCHARRAY) || + (cur->sk_flags & SK_SEARCHARRAY))) + { + /* + * Must never replace an = array operator ourselves, + * nor can we ever fail to remember an = array + * operator. _bt_preprocess_keys_leafbuf expects + * this. + */ + ScanKey outkey = &outkeys[new_numberOfKeys++]; + + memcpy(outkey, cur, sizeof(ScanKeyData)); + if (numberOfEqualCols == attno - 1) + _bt_mark_scankey_required(outkey); + } + else + xform[j] = cur; + } else if (j == (BTEqualStrategyNumber - 1)) { /* key == a && key == b, but a != b */ @@ -1027,6 +2098,96 @@ _bt_preprocess_keys(IndexScanDesc scan) so->numberOfKeys = new_numberOfKeys; } +/* + * _bt_preprocess_keys_leafbuf() -- Preprocess array scan keys only + * + * Stripped down version of _bt_preprocess_keys that can be called with a + * buffer lock held. Reuses much of the work performed during the previous + * _bt_preprocess_keys call. + * + * This function just transfers newly advanced array keys that were set in + * "so->arrayKeyData" to corresponding "so->keyData" search-type scan keys. + * It does not independently detect redundant or contradictory scan keys.
+ */ +static void +_bt_preprocess_keys_leafbuf(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + ScanKey cur; + int ikey, + arrayidx = 0; + + Assert(so->qual_ok); + + for (cur = so->keyData, ikey = 0; ikey < so->numberOfKeys; cur++, ikey++) + { + BTArrayKeyInfo *array; + ScanKey skeyarray; + + Assert((cur->sk_flags & SK_BT_RDDNARRAY) == 0); + + /* Just update equality array scan keys */ + if (cur->sk_strategy != BTEqualStrategyNumber || + !(cur->sk_flags & SK_SEARCHARRAY)) + continue; + + array = &so->arrayKeys[arrayidx++]; + skeyarray = &so->arrayKeyData[array->scan_key]; + + /* Update the scan key's argument */ + Assert(cur->sk_attno == skeyarray->sk_attno); + cur->sk_argument = skeyarray->sk_argument; + } + + Assert(arrayidx == so->numArrayKeys); +} + +/* + * Verify that the scan's "so->arrayKeyData" scan keys are in agreement with + * the current "so->keyData" search-type scan keys. Used within assertions. + */ +#ifdef USE_ASSERT_CHECKING +static bool +_bt_verify_array_scankeys(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + ScanKey cur; + int ikey, + arrayidx = 0; + + if (!so->qual_ok) + return false; + + for (cur = so->keyData, ikey = 0; ikey < so->numberOfKeys; cur++, ikey++) + { + BTArrayKeyInfo *array; + ScanKey skeyarray; + + if (cur->sk_strategy != BTEqualStrategyNumber || + !(cur->sk_flags & SK_SEARCHARRAY)) + continue; + + array = &so->arrayKeys[arrayidx++]; + skeyarray = &so->arrayKeyData[array->scan_key]; + + /* Verify so->arrayKeyData input scan key has expected sk_argument */ + if (skeyarray->sk_argument != array->elem_values[array->cur_elem]) + return false; + + /* Verify so->arrayKeyData input scan key agrees with output key */ + if (cur->sk_attno != skeyarray->sk_attno) + return false; + if (cur->sk_argument != skeyarray->sk_argument) + return false; + } + + if (arrayidx != so->numArrayKeys) + return false; + + return true; +} +#endif + /* * Compare two scankey values using a specified operator. 
* @@ -1360,41 +2521,198 @@ _bt_mark_scankey_required(ScanKey skey) * * Return true if so, false if not. If the tuple fails to pass the qual, * we also determine whether there's any need to continue the scan beyond - * this tuple, and set *continuescan accordingly. See comments for + * this tuple, and set pstate.continuescan accordingly. See comments for * _bt_preprocess_keys(), above, about how this is done. * - * Forward scan callers can pass a high key tuple in the hopes of having - * us set *continuescan to false, and avoiding an unnecessary visit to - * the page to the right. + * Forward scan callers can pass a high key tuple in the hopes of having us + * set pstate.continuescan to false, and avoiding an unnecessary visit to the + * page to the right. + * + * Forwards scan callers with equality type array scan keys are obligated to + * set up page state in a way that makes it possible for us to check the final + * tuple (the high key for a forward scan) early, before we've expended too + * much effort on comparing tuples that cannot possibly be matches for any set + * of array keys. This is just an optimization. + * + * Advances the current set of array keys for SK_SEARCHARRAY scans where + * appropriate. These callers are required to initialize the page level high + * key in pstate before the first call here for the page (when the scan + * direction is forwards). Note that we rely on _bt_readpage calling here in + * page offset number order (for its scan direction). Any other order will + * lead to inconsistent array key state. * * scan: index scan descriptor (containing a search-type scankey) + * pstate: Page level input and output parameters * tuple: index tuple to test - * tupnatts: number of attributes in tupnatts (high key may be truncated) - * dir: direction we are scanning in - * continuescan: output parameter (will be set correctly in all cases) + * finaltup: Is tuple the final one we'll be called with for this page? 
* requiredMatchedByPrecheck: indicates that scan keys required for * direction scan are already matched */ bool -_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, - ScanDirection dir, bool *continuescan, +_bt_checkkeys(IndexScanDesc scan, BTReadPageState *pstate, + IndexTuple tuple, bool finaltup, bool requiredMatchedByPrecheck) { - TupleDesc tupdesc; - BTScanOpaque so; - int keysz; + TupleDesc tupdesc = RelationGetDescr(scan->indexRelation); + int natts = BTreeTupleGetNAtts(tuple, scan->indexRelation); + BTScanOpaque so = (BTScanOpaque) scan->opaque; + bool res; + bool skrequiredtrigger; + + Assert(pstate->continuescan); + Assert(!so->needPrimScan); + + res = _bt_check_compare(pstate->dir, so, tuple, natts, tupdesc, + &pstate->continuescan, &skrequiredtrigger, + requiredMatchedByPrecheck); + + /* + * Only one _bt_check_compare call is required in the common case where + * there are no equality-type array scan keys. + * + * When there are array scan keys then we can still accept the first + * answer we get from _bt_check_compare when continuescan wasn't unset. + */ + if (!so->numArrayKeys || pstate->continuescan) + return res; + + /* + * _bt_check_compare set continuescan=false in the presence of equality + * type array keys. It's possible that we haven't reached the start of + * the array keys just yet. It's also possible that we need to advance + * the array keys now. (Or perhaps we really do need to terminate the + * top-level scan.) + */ + pstate->continuescan = true; /* new initial assumption */ + + if (skrequiredtrigger && _bt_tuple_before_array_skeys(scan, pstate, tuple)) + { + /* + * Tuple is still < the current array scan key values (as well as + * other equality type scan keys) if this is a forward scan. + * (Backwards scans reach here with a tuple > equality constraints.) + * We must now consider how to proceed with the ongoing primitive + * index scan. 
+ * + * Should _bt_readpage continue with this page for now, in the hope of + * finding tuples whose key space is covered by the current array keys + * before too long? Or, should it give up and start a new primitive + * index scan instead? + * + * Our policy is to terminate the primitive index scan at the end of + * the current page if the current (most recently advanced) array keys + * don't cover the final tuple from the page. This policy is fairly + * conservative overall. Note, however, that our policy effectively + * infers what the next sibling page is likely to look like based on + * details from the current page (in particular its final tuple). + * + * It's possible that we'll gamble and lose: a grouping of tuples + * covered by the current array keys could be aligned with the key + * space boundaries of the current leaf page, without any later array + * keys having key space that is covered by the next sibling page. + */ + if (finaltup || (!pstate->finaltupchecked && pstate->finaltup && + _bt_tuple_before_array_skeys(scan, pstate, + pstate->finaltup))) + { + /* + * This is the final tuple (the high key for forward scans, or the + * tuple at the first offset number for backward scans), but it is + * still before the current array keys. As such, we're unwilling + * to allow the current primitive index scan to continue to the + * next leaf page. Start a new primitive index scan that will + * reposition the top-level scan to the first leaf page whose key + * space is covered by our _current_ array keys. We expect that + * this process will effectively make the scan "skip over" a group + * of leaf pages that cannot possibly contain any matching tuples. + * + * Note: _bt_readpage stashes the final tuple, which allows us to + * make this check early. We thereby avoid comparing very many + * extra tuples on the page. 
This is just an optimization; + * skipping these useless comparisons should never change our + * final conclusion about what the scan should do next. + */ + pstate->continuescan = false; + so->needPrimScan = true; + } + else if (!finaltup && pstate->finaltup) + { + /* + * Remember that the final tuple has been checked with this + * particular set of array keys. + * + * It might make sense to check the same tuple again at some point + * during the ongoing _bt_readpage-wise scan of this page. But it + * is definitely wasteful to repeat the same check before the + * array keys are advanced by some later non-final tuple. + */ + pstate->finaltupchecked = true; + } + + /* + * In any case, this indextuple doesn't match the qual + */ + return false; + } + + /* + * Caller's tuple is >= the current set of array keys and other equality + * constraint scan keys (or <= if this is a backwards scan). + * + * It is now time to advance the array keys based on the values from this + * tuple. Do that now, while determining in passing if the tuple matches + * the newly advanced set of array keys (if we've any left). + * + * This call will also set continuescan for us (or tells us to perform + * another _bt_check_compare call, which then sets continuescan for us). + */ + if (!_bt_advance_array_keys(scan, pstate, tuple, skrequiredtrigger)) + { + /* + * Tuple doesn't match any later array keys, either. Give up on this + * tuple being a match. + */ + return false; + } + + /* + * Advanced array keys to values that are exact matches for corresponding + * attribute values from the tuple. Check back with _bt_check_compare. + */ + return _bt_check_compare(pstate->dir, so, tuple, natts, tupdesc, + &pstate->continuescan, &skrequiredtrigger, + false); +} + +/* + * Test whether an indextuple satisfies current scan condition. + * + * Return true if so, false if not.
If not, also clear *continuescan if + * it's not possible for any future tuples in the current scan direction to + * pass the qual with the current set of array keys. + * + * This is a subroutine for _bt_checkkeys. It is written with the assumption + * that reaching the end of each distinct set of array keys terminates the + * ongoing primitive index scan. It is up to our caller (that has more + * context than we have available here) to override that initial determination + * when it makes more sense to advance the array keys and continue with + * further tuples from the same leaf page. + */ +static bool +_bt_check_compare(ScanDirection dir, BTScanOpaque so, + IndexTuple tuple, int tupnatts, TupleDesc tupdesc, + bool *continuescan, bool *skrequiredtrigger, + bool requiredMatchedByPrecheck) +{ int ikey; ScanKey key; - Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts); + Assert(!so->numArrayKeys || !requiredMatchedByPrecheck); *continuescan = true; /* default assumption */ + *skrequiredtrigger = true; /* default assumption */ - tupdesc = RelationGetDescr(scan->indexRelation); - so = (BTScanOpaque) scan->opaque; - keysz = so->numberOfKeys; - - for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++) + for (key = so->keyData, ikey = 0; ikey < so->numberOfKeys; key++, ikey++) { Datum datum; bool isNull; @@ -1526,7 +2844,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * _bt_first() except for the NULLs checking, which have already done * above. */ - if (!requiredOppositeDir) + if (!requiredOppositeDir || so->numArrayKeys) { test = FunctionCall2Coll(&key->sk_func, key->sk_collation, datum, key->sk_argument); @@ -1549,10 +2867,22 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * qual fails, it is critical that equality quals be used for the * initial positioning in _bt_first() when they are available. See * comments in _bt_first(). 
+ * + * Scans with equality-type array scan keys run into a similar + * problem whenever they advance the array keys. Our caller uses + * _bt_tuple_before_array_skeys to avoid the problem there. */ if (requiredSameDir) *continuescan = false; + if ((key->sk_flags & SK_SEARCHARRAY) && + key->sk_strategy == BTEqualStrategyNumber) + { + if (!requiredSameDir) + *skrequiredtrigger = false; + *continuescan = false; + } + /* * In any case, this indextuple doesn't match the qual. */ @@ -1571,7 +2901,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, * it's not possible for any future tuples in the current scan direction * to pass the qual. * - * This is a subroutine for _bt_checkkeys, which see for more info. + * This is a subroutine for _bt_checkkeys/_bt_check_compare. */ static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 03a5fbdc6..e37597c26 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -106,8 +106,7 @@ static List *build_index_paths(PlannerInfo *root, RelOptInfo *rel, IndexOptInfo *index, IndexClauseSet *clauses, bool useful_predicate, ScanTypeControl scantype, - bool *skip_nonnative_saop, - bool *skip_lower_saop); + bool *skip_nonnative_saop); static List *build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel, List *clauses, List *other_clauses); static List *generate_bitmap_or_paths(PlannerInfo *root, RelOptInfo *rel, @@ -706,8 +705,6 @@ eclass_already_used(EquivalenceClass *parent_ec, Relids oldrelids, * index AM supports them natively, we should just include them in simple * index paths. If not, we should exclude them while building simple index * paths, and then make a separate attempt to include them in bitmap paths. - * Furthermore, we should consider excluding lower-order ScalarArrayOpExpr - * quals so as to create ordered paths. 
*/ static void get_index_paths(PlannerInfo *root, RelOptInfo *rel, @@ -716,37 +713,17 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, { List *indexpaths; bool skip_nonnative_saop = false; - bool skip_lower_saop = false; ListCell *lc; /* * Build simple index paths using the clauses. Allow ScalarArrayOpExpr - * clauses only if the index AM supports them natively, and skip any such - * clauses for index columns after the first (so that we produce ordered - * paths if possible). + * clauses only if the index AM supports them natively. */ indexpaths = build_index_paths(root, rel, index, clauses, index->predOK, ST_ANYSCAN, - &skip_nonnative_saop, - &skip_lower_saop); - - /* - * If we skipped any lower-order ScalarArrayOpExprs on an index with an AM - * that supports them, then try again including those clauses. This will - * produce paths with more selectivity but no ordering. - */ - if (skip_lower_saop) - { - indexpaths = list_concat(indexpaths, - build_index_paths(root, rel, - index, clauses, - index->predOK, - ST_ANYSCAN, - &skip_nonnative_saop, - NULL)); - } + &skip_nonnative_saop); /* * Submit all the ones that can form plain IndexScan plans to add_path. (A @@ -784,7 +761,6 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, index, clauses, false, ST_BITMAPSCAN, - NULL, NULL); *bitindexpaths = list_concat(*bitindexpaths, indexpaths); } @@ -817,27 +793,19 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, * to true if we found any such clauses (caller must initialize the variable * to false). If it's NULL, we do not ignore ScalarArrayOpExpr clauses. * - * If skip_lower_saop is non-NULL, we ignore ScalarArrayOpExpr clauses for - * non-first index columns, and we set *skip_lower_saop to true if we found - * any such clauses (caller must initialize the variable to false). If it's - * NULL, we do not ignore non-first ScalarArrayOpExpr clauses, but they will - * result in considering the scan's output to be unordered. 
- * * 'rel' is the index's heap relation * 'index' is the index for which we want to generate paths * 'clauses' is the collection of indexable clauses (IndexClause nodes) * 'useful_predicate' indicates whether the index has a useful predicate * 'scantype' indicates whether we need plain or bitmap scan support * 'skip_nonnative_saop' indicates whether to accept SAOP if index AM doesn't - * 'skip_lower_saop' indicates whether to accept non-first-column SAOP */ static List * build_index_paths(PlannerInfo *root, RelOptInfo *rel, IndexOptInfo *index, IndexClauseSet *clauses, bool useful_predicate, ScanTypeControl scantype, - bool *skip_nonnative_saop, - bool *skip_lower_saop) + bool *skip_nonnative_saop) { List *result = NIL; IndexPath *ipath; @@ -848,7 +816,6 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, List *orderbyclausecols; List *index_pathkeys; List *useful_pathkeys; - bool found_lower_saop_clause; bool pathkeys_possibly_useful; bool index_is_ordered; bool index_only_scan; @@ -880,19 +847,11 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, * on by btree and possibly other places.) The list can be empty, if the * index AM allows that. * - * found_lower_saop_clause is set true if we accept a ScalarArrayOpExpr - * index clause for a non-first index column. This prevents us from - * assuming that the scan result is ordered. (Actually, the result is - * still ordered if there are equality constraints for all earlier - * columns, but it seems too expensive and non-modular for this code to be - * aware of that refinement.) - * * We also build a Relids set showing which outer rels are required by the * selected clauses. Any lateral_relids are included in that, but not * otherwise accounted for. 
*/ index_clauses = NIL; - found_lower_saop_clause = false; outer_relids = bms_copy(rel->lateral_relids); for (indexcol = 0; indexcol < index->nkeycolumns; indexcol++) { @@ -903,30 +862,20 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, IndexClause *iclause = (IndexClause *) lfirst(lc); RestrictInfo *rinfo = iclause->rinfo; - /* We might need to omit ScalarArrayOpExpr clauses */ - if (IsA(rinfo->clause, ScalarArrayOpExpr)) + /* + * We might need to omit ScalarArrayOpExpr clauses when index AM + * lacks native support + */ + if (!index->amsearcharray && IsA(rinfo->clause, ScalarArrayOpExpr)) { - if (!index->amsearcharray) + if (skip_nonnative_saop) { - if (skip_nonnative_saop) - { - /* Ignore because not supported by index */ - *skip_nonnative_saop = true; - continue; - } - /* Caller had better intend this only for bitmap scan */ - Assert(scantype == ST_BITMAPSCAN); - } - if (indexcol > 0) - { - if (skip_lower_saop) - { - /* Caller doesn't want to lose index ordering */ - *skip_lower_saop = true; - continue; - } - found_lower_saop_clause = true; + /* Ignore because not supported by index */ + *skip_nonnative_saop = true; + continue; } + /* Caller had better intend this only for bitmap scan */ + Assert(scantype == ST_BITMAPSCAN); } /* OK to include this clause */ @@ -956,11 +905,9 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, /* * 2. Compute pathkeys describing index's ordering, if any, then see how * many of them are actually useful for this query. This is not relevant - * if we are only trying to build bitmap indexscans, nor if we have to - * assume the scan is unordered. + * if we are only trying to build bitmap indexscans. 
*/ pathkeys_possibly_useful = (scantype != ST_BITMAPSCAN && - !found_lower_saop_clause && has_useful_pathkeys(root, rel)); index_is_ordered = (index->sortopfamily != NULL); if (index_is_ordered && pathkeys_possibly_useful) @@ -1212,7 +1159,6 @@ build_paths_for_OR(PlannerInfo *root, RelOptInfo *rel, index, &clauseset, useful_predicate, ST_BITMAPSCAN, - NULL, NULL); result = list_concat(result, indexpaths); } diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index c4fcd0076..1b899b2db 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6444,8 +6444,6 @@ genericcostestimate(PlannerInfo *root, double numIndexTuples; double spc_random_page_cost; double num_sa_scans; - double num_outer_scans; - double num_scans; double qual_op_cost; double qual_arg_cost; List *selectivityQuals; @@ -6460,7 +6458,7 @@ genericcostestimate(PlannerInfo *root, /* * Check for ScalarArrayOpExpr index quals, and estimate the number of - * index scans that will be performed. + * primitive index scans that will be performed for caller */ num_sa_scans = 1; foreach(l, indexQuals) @@ -6490,19 +6488,8 @@ genericcostestimate(PlannerInfo *root, */ numIndexTuples = costs->numIndexTuples; if (numIndexTuples <= 0.0) - { numIndexTuples = indexSelectivity * index->rel->tuples; - /* - * The above calculation counts all the tuples visited across all - * scans induced by ScalarArrayOpExpr nodes. We want to consider the - * average per-indexscan number, so adjust. This is a handy place to - * round to integer, too. (If caller supplied tuple estimate, it's - * responsible for handling these considerations.) - */ - numIndexTuples = rint(numIndexTuples / num_sa_scans); - } - /* * We can bound the number of tuples by the index size in any case. Also, * always estimate at least one tuple is touched, even when @@ -6540,27 +6527,31 @@ genericcostestimate(PlannerInfo *root, * * The above calculations are all per-index-scan. 
However, if we are in a * nestloop inner scan, we can expect the scan to be repeated (with - * different search keys) for each row of the outer relation. Likewise, - * ScalarArrayOpExpr quals result in multiple index scans. This creates - * the potential for cache effects to reduce the number of disk page - * fetches needed. We want to estimate the average per-scan I/O cost in - * the presence of caching. + * different search keys) for each row of the outer relation. This + * creates the potential for cache effects to reduce the number of disk + * page fetches needed. We want to estimate the average per-scan I/O cost + * in the presence of caching. * * We use the Mackert-Lohman formula (see costsize.c for details) to * estimate the total number of page fetches that occur. While this * wasn't what it was designed for, it seems a reasonable model anyway. * Note that we are counting pages not tuples anymore, so we take N = T = * index size, as if there were one "tuple" per page. + * + * Note: we assume that there will be no repeat index page fetches across + * ScalarArrayOpExpr primitive scans from the same logical index scan. + * This is guaranteed to be true for btree indexes, but is very optimistic + * with index AMs that cannot natively execute ScalarArrayOpExpr quals. + * However, these same index AMs also accept our default pessimistic + * approach to counting num_sa_scans (btree caller caps this), so we don't + * expect the final indexTotalCost to be wildly over-optimistic. 
*/ - num_outer_scans = loop_count; - num_scans = num_sa_scans * num_outer_scans; - - if (num_scans > 1) + if (loop_count > 1) { double pages_fetched; /* total page fetches ignoring cache effects */ - pages_fetched = numIndexPages * num_scans; + pages_fetched = numIndexPages * loop_count; /* use Mackert and Lohman formula to adjust for cache effects */ pages_fetched = index_pages_fetched(pages_fetched, @@ -6570,11 +6561,9 @@ genericcostestimate(PlannerInfo *root, /* * Now compute the total disk access cost, and then report a pro-rated - * share for each outer scan. (Don't pro-rate for ScalarArrayOpExpr, - * since that's internal to the indexscan.) + * share for each outer scan */ - indexTotalCost = (pages_fetched * spc_random_page_cost) - / num_outer_scans; + indexTotalCost = (pages_fetched * spc_random_page_cost) / loop_count; } else { @@ -6590,10 +6579,8 @@ genericcostestimate(PlannerInfo *root, * evaluated once at the start of the scan to reduce them to runtime keys * to pass to the index AM (see nodeIndexscan.c). We model the per-tuple * CPU costs as cpu_index_tuple_cost plus one cpu_operator_cost per - * indexqual operator. Because we have numIndexTuples as a per-scan - * number, we have to multiply by num_sa_scans to get the correct result - * for ScalarArrayOpExpr cases. Similarly add in costs for any index - * ORDER BY expressions. + * indexqual operator. Similarly add in costs for any index ORDER BY + * expressions. * * Note: this neglects the possible costs of rechecking lossy operators. * Detecting that that might be needed seems more expensive than it's @@ -6606,7 +6593,7 @@ genericcostestimate(PlannerInfo *root, indexStartupCost = qual_arg_cost; indexTotalCost += qual_arg_cost; - indexTotalCost += numIndexTuples * num_sa_scans * (cpu_index_tuple_cost + qual_op_cost); + indexTotalCost += numIndexTuples * (cpu_index_tuple_cost + qual_op_cost); /* * Generic assumption about index correlation: there isn't any. 
@@ -6684,7 +6671,6 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, bool eqQualHere; bool found_saop; bool found_is_null_op; - double num_sa_scans; ListCell *lc; /* @@ -6699,17 +6685,12 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, * * For a RowCompareExpr, we consider only the first column, just as * rowcomparesel() does. - * - * If there's a ScalarArrayOpExpr in the quals, we'll actually perform N - * index scans not one, but the ScalarArrayOpExpr's operator can be - * considered to act the same as it normally does. */ indexBoundQuals = NIL; indexcol = 0; eqQualHere = false; found_saop = false; found_is_null_op = false; - num_sa_scans = 1; foreach(lc, path->indexclauses) { IndexClause *iclause = lfirst_node(IndexClause, lc); @@ -6749,14 +6730,9 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, else if (IsA(clause, ScalarArrayOpExpr)) { ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause; - Node *other_operand = (Node *) lsecond(saop->args); - int alength = estimate_array_length(other_operand); clause_op = saop->opno; found_saop = true; - /* count number of SA scans induced by indexBoundQuals only */ - if (alength > 1) - num_sa_scans *= alength; } else if (IsA(clause, NullTest)) { @@ -6816,13 +6792,6 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, JOIN_INNER, NULL); numIndexTuples = btreeSelectivity * index->rel->tuples; - - /* - * As in genericcostestimate(), we have to adjust for any - * ScalarArrayOpExpr quals included in indexBoundQuals, and then round - * to integer. - */ - numIndexTuples = rint(numIndexTuples / num_sa_scans); } /* @@ -6832,6 +6801,48 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, genericcostestimate(root, path, loop_count, &costs); + /* + * Now compensate for btree's ability to efficiently execute scans with + * SAOP clauses. 
+ * + * btree automatically combines individual ScalarArrayOpExpr primitive + * index scans whenever the tuples covered by the next set of array keys + * are close to tuples covered by the current set. This makes the final + * number of descents particularly difficult to estimate. However, btree + * scans never visit any single leaf page more than once. That puts a + * natural floor under the worst case number of descents. + * + * It's particularly important that we not wildly overestimate the number + * of descents needed for a clause list with several SAOPs -- the costs + * really aren't multiplicative in the way genericcostestimate expects. In + * general, most distinct combinations of SAOP keys will tend to not find + * any matching tuples. Furthermore, btree scans search for the next set + * of array keys using the next tuple in line, and so won't even need a + * direct comparison to eliminate most non-matching sets of array keys. + * + * Clamp the number of descents to the estimated number of leaf page + * visits. This is still fairly pessimistic, but tends to result in more + * accurate costing of scans with several SAOP clauses -- especially when + * each array has more than a few elements. The cost of adding additional + * array constants to a low-order SAOP column should saturate past a + * certain point (except where selectivity estimates continue to shift). + * + * Also clamp the number of descents to 1/3 the number of index pages. + * This avoids implausibly high estimates with low selectivity paths, + * where scans frequently require no more than one or two descents. + * + * XXX Ideally, we'd also account for the fact that non-boundary SAOP + * clause quals (which the B-Tree code uses "non-required" scan keys for) + * won't actually contribute to the total number of descents of the index. + * This would require pushing down more context into genericcostestimate. 
+ */ + if (costs.num_sa_scans > 1) + { + costs.num_sa_scans = Min(costs.num_sa_scans, costs.numIndexPages); + costs.num_sa_scans = Min(costs.num_sa_scans, index->pages / 3); + costs.num_sa_scans = Max(costs.num_sa_scans, 1); + } + /* * Add a CPU-cost component to represent the costs of initial btree * descent. We don't charge any I/O cost for touching upper btree levels, @@ -6839,9 +6850,9 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, * comparisons to descend a btree of N leaf tuples. We charge one * cpu_operator_cost per comparison. * - * If there are ScalarArrayOpExprs, charge this once per SA scan. The - * ones after the first one are not startup cost so far as the overall - * plan is concerned, so add them only to "total" cost. + * If there are ScalarArrayOpExprs, charge this once per estimated + * primitive SA scan. The ones after the first one are not startup cost + * so far as the overall plan goes, so just add them to "total" cost. */ if (index->tuples > 1) /* avoid computing log(0) */ { @@ -6858,7 +6869,8 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, * in cases where only a single leaf page is expected to be visited. This * cost is somewhat arbitrarily set at 50x cpu_operator_cost per page * touched. The number of such pages is btree tree height plus one (ie, - * we charge for the leaf page too). As above, charge once per SA scan. + * we charge for the leaf page too). As above, charge once per estimated + * primitive SA scan. 
*/ descentCost = (index->tree_height + 1) * DEFAULT_PAGE_CPU_MULTIPLIER * cpu_operator_cost; costs.indexStartupCost += descentCost; diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index e068f7e24..da90412d5 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -4035,6 +4035,19 @@ description | Waiting for a newly initialized WAL file to reach durable storage + <note> + <para> + Every time an index is searched, the index's + <structname>pg_stat_all_indexes</structname>.<structfield>idx_scan</structfield> + field is incremented. This usually happens once per index scan node + execution, but might take place several times during execution of a scan + that searches for multiple values together. Only queries that use certain + <acronym>SQL</acronym> constructs to search for rows matching any value + out of a list (or an array) of multiple scalar values are affected. See + <xref linkend="functions-comparisons"/> for details. + </para> + </note> + diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index acfd9d1f4..84c068ae3 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -1910,7 +1910,7 @@ SELECT count(*) FROM dupindexcols (1 row) -- --- Check ordering of =ANY indexqual results (bug in 9.2.0) +-- Check that index scans with =ANY indexquals return rows in index order -- explain (costs off) SELECT unique1 FROM tenk1 @@ -1936,12 +1936,11 @@ explain (costs off) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) ORDER BY thousand; - QUERY PLAN -------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------- Index Only Scan using tenk1_thous_tenthous on tenk1 - Index Cond: (thousand < 2) - Filter: (tenthous = ANY ('{1001,3000}'::integer[])) -(3 rows) + Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) +(2 rows) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) @@ -1952,18 +1951,35 
@@ ORDER BY thousand; 1 | 1001 (2 rows) +explain (costs off) +SELECT thousand, tenthous FROM tenk1 +WHERE thousand < 2 AND tenthous IN (1001,3000) +ORDER BY thousand DESC, tenthous DESC; + QUERY PLAN +-------------------------------------------------------------------------------- + Index Only Scan Backward using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) +(2 rows) + +SELECT thousand, tenthous FROM tenk1 +WHERE thousand < 2 AND tenthous IN (1001,3000) +ORDER BY thousand DESC, tenthous DESC; + thousand | tenthous +----------+---------- + 1 | 1001 + 0 | 3000 +(2 rows) + SET enable_indexonlyscan = OFF; explain (costs off) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) ORDER BY thousand; - QUERY PLAN --------------------------------------------------------------------------------------- - Sort - Sort Key: thousand - -> Index Scan using tenk1_thous_tenthous on tenk1 - Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) -(4 rows) + QUERY PLAN +-------------------------------------------------------------------------------- + Index Scan using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) +(2 rows) SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) @@ -1974,6 +1990,25 @@ ORDER BY thousand; 1 | 1001 (2 rows) +explain (costs off) +SELECT thousand, tenthous FROM tenk1 +WHERE thousand < 2 AND tenthous IN (1001,3000) +ORDER BY thousand DESC, tenthous DESC; + QUERY PLAN +-------------------------------------------------------------------------------- + Index Scan Backward using tenk1_thous_tenthous on tenk1 + Index Cond: ((thousand < 2) AND (tenthous = ANY ('{1001,3000}'::integer[]))) +(2 rows) + +SELECT thousand, tenthous FROM tenk1 +WHERE thousand < 2 AND tenthous IN (1001,3000) +ORDER BY thousand DESC, tenthous DESC; + thousand | tenthous 
+----------+---------- + 1 | 1001 + 0 | 3000 +(2 rows) + RESET enable_indexonlyscan; -- -- Check elimination of constant-NULL subexpressions diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 892ea5f17..f4939cd74 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -8620,10 +8620,9 @@ where j1.id1 % 1000 = 1 and j2.id1 % 1000 = 1 and j2.id1 >= any (array[1,5]); Merge Cond: (j1.id1 = j2.id1) Join Filter: (j2.id2 = j1.id2) -> Index Scan using j1_id1_idx on j1 - -> Index Only Scan using j2_pkey on j2 + -> Index Scan using j2_id1_idx on j2 Index Cond: (id1 >= ANY ('{1,5}'::integer[])) - Filter: ((id1 % 1000) = 1) -(7 rows) +(6 rows) select * from j1 inner join j2 on j1.id1 = j2.id1 and j1.id2 = j2.id2 diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index d49ce9f30..41b955a27 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -753,7 +753,7 @@ SELECT count(*) FROM dupindexcols WHERE f1 BETWEEN 'WA' AND 'ZZZ' and id < 1000 and f1 ~<~ 'YX'; -- --- Check ordering of =ANY indexqual results (bug in 9.2.0) +-- Check that index scans with =ANY indexquals return rows in index order -- explain (costs off) @@ -774,6 +774,15 @@ SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) ORDER BY thousand; +explain (costs off) +SELECT thousand, tenthous FROM tenk1 +WHERE thousand < 2 AND tenthous IN (1001,3000) +ORDER BY thousand DESC, tenthous DESC; + +SELECT thousand, tenthous FROM tenk1 +WHERE thousand < 2 AND tenthous IN (1001,3000) +ORDER BY thousand DESC, tenthous DESC; + SET enable_indexonlyscan = OFF; explain (costs off) @@ -785,6 +794,15 @@ SELECT thousand, tenthous FROM tenk1 WHERE thousand < 2 AND tenthous IN (1001,3000) ORDER BY thousand; +explain (costs off) +SELECT thousand, tenthous FROM tenk1 +WHERE thousand < 2 AND tenthous IN (1001,3000) +ORDER BY thousand DESC, tenthous 
DESC; + +SELECT thousand, tenthous FROM tenk1 +WHERE thousand < 2 AND tenthous IN (1001,3000) +ORDER BY thousand DESC, tenthous DESC; + RESET enable_indexonlyscan; -- -- 2.42.0