From 31217a6e401861e25620720132addfddd2a2f5b5 Mon Sep 17 00:00:00 2001 From: Floris van Nee Date: Fri, 15 Nov 2019 09:46:53 -0500 Subject: [PATCH 4/5] Index skip scan Implementation of Index Skip Scan (see Loose Index Scan in the wiki [1]) as part of the IndexOnlyScan, IndexScan and BitmapIndexScan for nbtree. This patch improves performance of two main types of queries significantly: - SELECT DISTINCT, SELECT DISTINCT ON - Regular SELECTs with WHERE-clauses on non-leading index attributes For example, given an nbtree index on three columns (a,b,c), the following queries may now be significantly faster: - SELECT DISTINCT ON (a) * FROM t1 - SELECT * FROM t1 WHERE b=2 - SELECT * FROM t1 WHERE b IN (10,40) - SELECT DISTINCT ON (a,b) * FROM t1 WHERE c BETWEEN 10 AND 100 ORDER BY a,b,c Original patch and design were proposed by Thomas Munro [2], revived and improved by Dmitry Dolgov and Jesper Pedersen. Further enhanced functionality added by Floris van Nee regarding a more general and performant skip implementation. [1] https://wiki.postgresql.org/wiki/Loose_indexscan [2] https://www.postgresql.org/message-id/flat/CADLWmXXbTSBxP-MzJuPAYSsL_2f0iPm5VWPbCvDbVvfX93FKkw%40mail.gmail.com Author: Floris van Nee, Jesper Pedersen, Dmitry Dolgov Reviewed-by: Thomas Munro, David Rowley, Kyotaro Horiguchi, Tomas Vondra, Peter Geoghegan --- contrib/amcheck/verify_nbtree.c | 4 +- contrib/bloom/blutils.c | 3 + doc/src/sgml/config.sgml | 15 + doc/src/sgml/indexam.sgml | 121 +- doc/src/sgml/indices.sgml | 28 + src/backend/access/brin/brin.c | 3 + src/backend/access/gin/ginutil.c | 3 + src/backend/access/gist/gist.c | 3 + src/backend/access/hash/hash.c | 3 + src/backend/access/index/indexam.c | 163 ++ src/backend/access/nbtree/Makefile | 1 + src/backend/access/nbtree/nbtinsert.c | 2 +- src/backend/access/nbtree/nbtpage.c | 2 +- src/backend/access/nbtree/nbtree.c | 58 +- src/backend/access/nbtree/nbtsearch.c | 790 ++++----- src/backend/access/nbtree/nbtskip.c | 1455 +++++++++++++++++ src/backend/access/nbtree/nbtsort.c | 2 +- src/backend/access/nbtree/nbtutils.c | 850 +++++++++- src/backend/access/spgist/spgutils.c | 3 + src/backend/commands/explain.c | 29 + src/backend/executor/execScan.c | 37 +- src/backend/executor/nodeBitmapIndexscan.c | 22 +- src/backend/executor/nodeIndexonlyscan.c | 69 +- src/backend/executor/nodeIndexscan.c | 72 +- src/backend/nodes/copyfuncs.c | 5 + src/backend/nodes/outfuncs.c | 6 + src/backend/nodes/readfuncs.c | 5 + src/backend/optimizer/path/allpaths.c | 54 +- src/backend/optimizer/path/costsize.c | 1 + src/backend/optimizer/path/indxpath.c | 68 + src/backend/optimizer/path/pathkeys.c | 72 + src/backend/optimizer/plan/createplan.c | 38 +- src/backend/optimizer/plan/planner.c | 16 +- src/backend/optimizer/util/pathnode.c | 78 + src/backend/optimizer/util/plancat.c | 3 + src/backend/utils/misc/guc.c | 9 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/backend/utils/sort/tuplesort.c | 4 +- src/include/access/amapi.h | 19 + src/include/access/genam.h | 16 + src/include/access/nbtree.h | 143 +- src/include/executor/executor.h | 4 + src/include/nodes/execnodes.h | 7 + src/include/nodes/pathnodes.h | 6 + src/include/nodes/plannodes.h | 5 + src/include/optimizer/cost.h | 1 + src/include/optimizer/pathnode.h | 4 + src/include/optimizer/paths.h | 4 + src/interfaces/libpq/encnames.c | 1 + src/interfaces/libpq/wchar.c | 1 + src/test/regress/expected/select_distinct.out | 599 +++++++ src/test/regress/expected/sysviews.out | 3 +- src/test/regress/sql/select_distinct.sql | 248 +++ 53 files changed, 4613 insertions(+), 546 deletions(-) create mode 100644 src/backend/access/nbtree/nbtskip.c create mode 120000 src/interfaces/libpq/encnames.c create mode 120000 src/interfaces/libpq/wchar.c diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 42a830c33b..2849e4bc72 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -2652,7 +2652,7 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup) Buffer lbuf; bool exists; - key = _bt_mkscankey(state->rel, itup); + key = _bt_mkscankey(state->rel, itup, NULL); Assert(key->heapkeyspace && key->scantid != NULL); /* @@ -3108,7 +3108,7 @@ bt_mkscankey_pivotsearch(Relation rel, IndexTuple itup) { BTScanInsert skey; - skey = _bt_mkscankey(rel, itup); + skey = _bt_mkscankey(rel, itup, NULL); skey->pivotsearch = true; return skey; diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c index 754de008d4..d35edd1d09 100644 --- a/contrib/bloom/blutils.c +++ b/contrib/bloom/blutils.c @@ -134,6 +134,9 @@ blhandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = blbulkdelete; amroutine->amvacuumcleanup = blvacuumcleanup; amroutine->amcanreturn = NULL; + amroutine->amskip = NULL; + amroutine->ambeginskipscan = NULL; + amroutine->amgetskiptuple = NULL; amroutine->amcostestimate = blcostestimate; amroutine->amoptions = bloptions; amroutine->amproperty = NULL; diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 0bcc6fd322..5c377fea08 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -5007,6 +5007,21 @@ ANY num_sync ( + enable_indexskipscan (boolean) + + enable_indexskipscan configuration parameter + + + + + Enables or disables the query planner's use of index-skip-scan plan + types (see ). The default is + on. + + + + enable_material (boolean) diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index cf359fa9ff..1ced71a45f 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -151,6 +151,9 @@ typedef struct IndexAmRoutine amendscan_function amendscan; ammarkpos_function ammarkpos; /* can be NULL */ amrestrpos_function amrestrpos; /* can be NULL */ + amskip_function amskip; /* can be NULL */ + ambeginscan_skip_function ambeginskipscan; /* can be NULL */ + amgettuple_with_skip_function amgetskiptuple; /* can be NULL */ /* interface functions to support parallel index scans */ amestimateparallelscan_function amestimateparallelscan; /* can be NULL */ @@ -751,6 +754,122 @@ amrestrpos (IndexScanDesc scan); struct may be set to NULL. + + +bool +amskip (IndexScanDesc scan, + ScanDirection prefixDir, + ScanDirection postfixDir); + + Skip past all tuples where the first 'prefix' columns have the same value as + the last tuple returned in the current scan. The arguments are: + + + + scan + + + Index scan information + + + + + + prefixDir + + + The direction in which the prefix part of the tuple is advancing. + + + + + + postfixDir + + + The direction in which the postfix (everything after the prefix) of the tuple is advancing. + + + + + + + + +IndexScanDesc +ambeginscan_skip (Relation indexRelation, + int nkeys, + int norderbys, + int prefix); + + Prepare for an index scan. The nkeys and norderbys + parameters indicate the number of quals and ordering operators that will be + used in the scan; these may be useful for space allocation purposes. + Note that the actual values of the scan keys aren't provided yet. + The result must be a palloc'd struct. + For implementation reasons the index access method + must create this struct by calling + RelationGetIndexScan(). In most cases + ambeginscan does little beyond making that call and perhaps + acquiring locks; + the interesting parts of index-scan startup are in amrescan. + If this is a skip scan, prefix must indicate the length of the prefix that can be + skipped over. Prefix can be set to -1 to disable skipping, which will yield an + identical scan to a regular call to ambeginscan. + + + + boolean + amgettuple_skip (IndexScanDesc scan, + ScanDirection prefixDir, + ScanDirection postfixDir); + + Fetch the next tuple in the given scan, moving in the given + directions. Directions are specified by the direction of the prefix we're moving in, + of which the size of the prefix has been specified in the btbeginscan_skip + call. This direction can be different in DISTINCT scans when fetching backwards + from a cursor. + Returns true if a tuple was + obtained, false if no matching tuples remain. In the true case the tuple + TID is stored into the scan structure. Note that + success means only that the index contains an entry that matches + the scan keys, not that the tuple necessarily still exists in the heap or + will pass the caller's snapshot test. On success, amgettuple + must also set scan->xs_recheck to true or false. + False means it is certain that the index entry matches the scan keys. + true means this is not certain, and the conditions represented by the + scan keys must be rechecked against the heap tuple after fetching it. + This provision supports lossy index operators. + Note that rechecking will extend only to the scan conditions; a partial + index predicate (if any) is never rechecked by amgettuple + callers. + + + + If the index supports index-only + scans (i.e., amcanreturn returns true for it), + then on success the AM must also check scan->xs_want_itup, + and if that is true it must return the originally indexed data for the + index entry. The data can be returned in the form of an + IndexTuple pointer stored at scan->xs_itup, + with tuple descriptor scan->xs_itupdesc; or in the form of + a HeapTuple pointer stored at scan->xs_hitup, + with tuple descriptor scan->xs_hitupdesc. (The latter + format should be used when reconstructing data that might possibly not fit + into an IndexTuple.) In either case, + management of the data referenced by the pointer is the access method's + responsibility. The data must remain good at least until the next + amgettuple, amrescan, or amendscan + call for the scan. + + + + The amgettuple function need only be provided if the access + method supports plain index scans. If it doesn't, the + amgettuple field in its IndexAmRoutine + struct must be set to NULL. + + In addition to supporting ordinary index scans, some types of index may wish to support parallel index scans, which allow @@ -766,7 +885,7 @@ amrestrpos (IndexScanDesc scan); functions may be implemented to support parallel index scans: - + Size amestimateparallelscan (void); diff --git a/doc/src/sgml/indices.sgml b/doc/src/sgml/indices.sgml index 56fbd45178..38d0bfa4d9 100644 --- a/doc/src/sgml/indices.sgml +++ b/doc/src/sgml/indices.sgml @@ -1297,6 +1297,34 @@ SELECT target FROM tests WHERE subject = 'some-subject' AND success; and later will recognize such cases and allow index-only scans to be generated, but older versions will not. + + + Index Skip Scans + + + index + index-skip scans + + + index-skip scan + + + + When the rows retrieved from an index scan are then deduplicated by + eliminating rows matching on a prefix of index keys (e.g. when using + SELECT DISTINCT), the planner will consider + skipping groups of rows with a matching key prefix. When a row with + a particular prefix is found, remaining rows with the same key prefix + are skipped. The larger the number of rows with the same key prefix + rows (i.e. the lower the number of distinct key prefixes in the index), + the more efficient this is. + + + Additionally, a skip scan can be considered in regular SELECT + queries. When filtering on an non-leading attribute of an index, the planner + may choose a skip scan. + + diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index ccc9fa0959..7efce94edb 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -118,6 +118,9 @@ brinhandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = brinbulkdelete; amroutine->amvacuumcleanup = brinvacuumcleanup; amroutine->amcanreturn = NULL; + amroutine->amskip = NULL; + amroutine->ambeginskipscan = NULL; + amroutine->amgetskiptuple = NULL; amroutine->amcostestimate = brincostestimate; amroutine->amoptions = brinoptions; amroutine->amproperty = NULL; diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index 6d2d71be32..ed5d5040ee 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -66,6 +66,9 @@ ginhandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = ginbulkdelete; amroutine->amvacuumcleanup = ginvacuumcleanup; amroutine->amcanreturn = NULL; + amroutine->amskip = NULL; + amroutine->ambeginskipscan = NULL; + amroutine->amgetskiptuple = NULL; amroutine->amcostestimate = gincostestimate; amroutine->amoptions = ginoptions; amroutine->amproperty = NULL; diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 0683f42c25..3748dd30b6 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -87,6 +87,9 @@ gisthandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = gistbulkdelete; amroutine->amvacuumcleanup = gistvacuumcleanup; amroutine->amcanreturn = gistcanreturn; + amroutine->amskip = NULL; + amroutine->ambeginskipscan = NULL; + amroutine->amgetskiptuple = NULL; amroutine->amcostestimate = gistcostestimate; amroutine->amoptions = gistoptions; amroutine->amproperty = gistproperty; diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index eb3810494f..eaf5431a72 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -84,6 +84,9 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = hashbulkdelete; amroutine->amvacuumcleanup = hashvacuumcleanup; amroutine->amcanreturn = NULL; + amroutine->amskip = NULL; + amroutine->ambeginskipscan = NULL; + amroutine->amgetskiptuple = NULL; amroutine->amcostestimate = hashcostestimate; amroutine->amoptions = hashoptions; amroutine->amproperty = NULL; diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 5e22479b7a..bd54e2ff64 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -14,7 +14,9 @@ * index_open - open an index relation by relation OID * index_close - close an index relation * index_beginscan - start a scan of an index with amgettuple + * index_beginscan_skip - start a scan of an index with amgettuple and skipping * index_beginscan_bitmap - start a scan of an index with amgetbitmap + * index_beginscan_bitmap_skip - start a skip scan of an index with amgetbitmap * index_rescan - restart a scan of an index * index_endscan - end a scan * index_insert - insert an index tuple into a relation @@ -25,14 +27,17 @@ * index_parallelrescan - (re)start a parallel scan of an index * index_beginscan_parallel - join parallel index scan * index_getnext_tid - get the next TID from a scan + * index_getnext_tid_skip - get the next TID from a skip scan * index_fetch_heap - get the scan's next heap tuple * index_getnext_slot - get the next tuple from a scan + * index_getnext_slot - get the next tuple from a skip scan * index_getbitmap - get all tuples from a scan * index_bulk_delete - bulk deletion of index tuples * index_vacuum_cleanup - post-deletion cleanup of an index * index_can_return - does index support index-only scans? * index_getprocid - get a support procedure OID * index_getprocinfo - get a support procedure's lookup info + * index_skip - advance past duplicate key values in a scan * * NOTES * This file contains the index_ routines which used @@ -224,6 +229,78 @@ index_beginscan(Relation heapRelation, return scan; } +static IndexScanDesc +index_beginscan_internal_skip(Relation indexRelation, + int nkeys, int norderbys, int prefix, Snapshot snapshot, + ParallelIndexScanDesc pscan, bool temp_snap) +{ + IndexScanDesc scan; + + RELATION_CHECKS; + CHECK_REL_PROCEDURE(ambeginskipscan); + + if (!(indexRelation->rd_indam->ampredlocks)) + PredicateLockRelation(indexRelation, snapshot); + + /* + * We hold a reference count to the relcache entry throughout the scan. + */ + RelationIncrementReferenceCount(indexRelation); + + /* + * Tell the AM to open a scan. + */ + scan = indexRelation->rd_indam->ambeginskipscan(indexRelation, nkeys, + norderbys, prefix); + /* Initialize information for parallel scan. */ + scan->parallel_scan = pscan; + scan->xs_temp_snap = temp_snap; + + return scan; +} + +IndexScanDesc +index_beginscan_skip(Relation heapRelation, + Relation indexRelation, + Snapshot snapshot, + int nkeys, int norderbys, int prefix) +{ + IndexScanDesc scan; + + scan = index_beginscan_internal_skip(indexRelation, nkeys, norderbys, prefix, snapshot, NULL, false); + + /* + * Save additional parameters into the scandesc. Everything else was set + * up by RelationGetIndexScan. + */ + scan->heapRelation = heapRelation; + scan->xs_snapshot = snapshot; + + /* prepare to fetch index matches from table */ + scan->xs_heapfetch = table_index_fetch_begin(heapRelation); + + return scan; +} + +IndexScanDesc +index_beginscan_bitmap_skip(Relation indexRelation, + Snapshot snapshot, + int nkeys, + int prefix) +{ + IndexScanDesc scan; + + scan = index_beginscan_internal_skip(indexRelation, nkeys, 0, prefix, snapshot, NULL, false); + + /* + * Save additional parameters into the scandesc. Everything else was set + * up by RelationGetIndexScan. + */ + scan->xs_snapshot = snapshot; + + return scan; +} + /* * index_beginscan_bitmap - start a scan of an index with amgetbitmap * @@ -553,6 +630,45 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) return &scan->xs_heaptid; } +ItemPointer +index_getnext_tid_skip(IndexScanDesc scan, ScanDirection prefixDir, ScanDirection postfixDir) +{ + bool found; + + SCAN_CHECKS; + CHECK_SCAN_PROCEDURE(amgetskiptuple); + + Assert(TransactionIdIsValid(RecentXmin)); + + /* + * The AM's amgettuple proc finds the next index entry matching the scan + * keys, and puts the TID into scan->xs_heaptid. It should also set + * scan->xs_recheck and possibly scan->xs_itup/scan->xs_hitup, though we + * pay no attention to those fields here. + */ + found = scan->indexRelation->rd_indam->amgetskiptuple(scan, prefixDir, postfixDir); + + /* Reset kill flag immediately for safety */ + scan->kill_prior_tuple = false; + scan->xs_heap_continue = false; + + /* If we're out of index entries, we're done */ + if (!found) + { + /* release resources (like buffer pins) from table accesses */ + if (scan->xs_heapfetch) + table_index_fetch_reset(scan->xs_heapfetch); + + return NULL; + } + Assert(ItemPointerIsValid(&scan->xs_heaptid)); + + pgstat_count_index_tuples(scan->indexRelation, 1); + + /* Return the TID of the tuple we found. */ + return &scan->xs_heaptid; +} + /* ---------------- * index_fetch_heap - get the scan's next heap tuple * @@ -644,6 +760,38 @@ index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot * return false; } +bool +index_getnext_slot_skip(IndexScanDesc scan, ScanDirection prefixDir, ScanDirection postfixDir, TupleTableSlot *slot) +{ + for (;;) + { + if (!scan->xs_heap_continue) + { + ItemPointer tid; + + /* Time to fetch the next TID from the index */ + tid = index_getnext_tid_skip(scan, prefixDir, postfixDir); + + /* If we're out of index entries, we're done */ + if (tid == NULL) + break; + + Assert(ItemPointerEquals(tid, &scan->xs_heaptid)); + } + + /* + * Fetch the next (or only) visible heap tuple for this index entry. + * If we don't find anything, loop around and grab the next TID from + * the index. + */ + Assert(ItemPointerIsValid(&scan->xs_heaptid)); + if (index_fetch_heap(scan, slot)) + return true; + } + + return false; +} + /* ---------------- * index_getbitmap - get all tuples at once from an index scan * @@ -739,6 +887,21 @@ index_can_return(Relation indexRelation, int attno) return indexRelation->rd_indam->amcanreturn(indexRelation, attno); } +/* ---------------- + * index_skip + * + * Skip past all tuples where the first 'prefix' columns have the + * same value as the last tuple returned in the current scan. + * ---------------- + */ +bool +index_skip(IndexScanDesc scan, ScanDirection prefixDir, ScanDirection postfixDir) +{ + SCAN_CHECKS; + + return scan->indexRelation->rd_indam->amskip(scan, prefixDir, postfixDir); +} + /* ---------------- * index_getprocid * diff --git a/src/backend/access/nbtree/Makefile b/src/backend/access/nbtree/Makefile index d69808e78c..da96ac00a6 100644 --- a/src/backend/access/nbtree/Makefile +++ b/src/backend/access/nbtree/Makefile @@ -19,6 +19,7 @@ OBJS = \ nbtpage.o \ nbtree.o \ nbtsearch.o \ + nbtskip.o \ nbtsort.o \ nbtsplitloc.o \ nbtutils.o \ diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 7355e1dba1..483818e1e1 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -107,7 +107,7 @@ _bt_doinsert(Relation rel, IndexTuple itup, bool checkingunique = (checkUnique != UNIQUE_CHECK_NO); /* we need an insertion scan key to do our search, so build one */ - itup_key = _bt_mkscankey(rel, itup); + itup_key = _bt_mkscankey(rel, itup, NULL); if (checkingunique) { diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 5bc7c3616a..361417c685 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -1968,7 +1968,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate) } /* we need an insertion scan key for the search, so build one */ - itup_key = _bt_mkscankey(rel, targetkey); + itup_key = _bt_mkscankey(rel, targetkey, NULL); /* find the leftmost leaf page with matching pivot/high key */ itup_key->pivotsearch = true; stack = _bt_search(rel, itup_key, &sleafbuf, BT_READ, NULL); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 40ad0956e0..0afaa3e34c 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -123,6 +123,7 @@ bthandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = btbulkdelete; amroutine->amvacuumcleanup = btvacuumcleanup; amroutine->amcanreturn = btcanreturn; + amroutine->amskip = btskip; amroutine->amcostestimate = btcostestimate; amroutine->amoptions = btoptions; amroutine->amproperty = btproperty; @@ -130,8 +131,10 @@ bthandler(PG_FUNCTION_ARGS) amroutine->amvalidate = btvalidate; amroutine->amadjustmembers = btadjustmembers; amroutine->ambeginscan = btbeginscan; + amroutine->ambeginskipscan = btbeginscan_skip; amroutine->amrescan = btrescan; amroutine->amgettuple = btgettuple; + amroutine->amgetskiptuple = btgettuple_skip; amroutine->amgetbitmap = btgetbitmap; amroutine->amendscan = btendscan; amroutine->ammarkpos = btmarkpos; @@ -208,6 +211,15 @@ btinsert(Relation rel, Datum *values, bool *isnull, */ bool btgettuple(IndexScanDesc scan, ScanDirection dir) +{ + return btgettuple_skip(scan, dir, dir); +} + +/* + * btgettuple() -- Get the next tuple in the scan. + */ +bool +btgettuple_skip(IndexScanDesc scan, ScanDirection prefixDir, ScanDirection postfixDir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; bool res; @@ -226,7 +238,7 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) if (so->numArrayKeys < 0) return false; - _bt_start_array_keys(scan, dir); + _bt_start_array_keys(scan, prefixDir); } /* This loop handles advancing to the next array elements, if any */ @@ -238,7 +250,7 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) * _bt_first() to get the first item in the scan. */ if (!BTScanPosIsValid(so->currPos)) - res = _bt_first(scan, dir); + res = _bt_first(scan, prefixDir, postfixDir); else { /* @@ -265,14 +277,14 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) /* * Now continue the scan. */ - res = _bt_next(scan, dir); + res = _bt_next(scan, prefixDir, postfixDir); } /* If we have a tuple, return it ... */ if (res) break; /* ... otherwise see if we have more array keys to deal with */ - } while (so->numArrayKeys && _bt_advance_array_keys(scan, dir)); + } while (so->numArrayKeys && _bt_advance_array_keys(scan, prefixDir)); return res; } @@ -303,7 +315,7 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) do { /* Fetch the first page & tuple */ - if (_bt_first(scan, ForwardScanDirection)) + if (_bt_first(scan, ForwardScanDirection, ForwardScanDirection)) { /* Save tuple ID, and continue scanning */ heapTid = &scan->xs_heaptid; @@ -319,7 +331,7 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) if (++so->currPos.itemIndex > so->currPos.lastItem) { /* let _bt_next do the heavy lifting */ - if (!_bt_next(scan, ForwardScanDirection)) + if (!_bt_next(scan, ForwardScanDirection, ForwardScanDirection)) break; } @@ -340,6 +352,16 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) */ IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys) +{ + return btbeginscan_skip(rel, nkeys, norderbys, -1); +} + + +/* + * btbeginscan() -- start a scan on a btree index + */ +IndexScanDesc +btbeginscan_skip(Relation rel, int nkeys, int norderbys, int skipPrefix) { IndexScanDesc scan; BTScanOpaque so; @@ -374,10 +396,20 @@ btbeginscan(Relation rel, int nkeys, int norderbys) */ so->currTuples = so->markTuples = NULL; + so->skipData = NULL; + scan->xs_itupdesc = RelationGetDescr(rel); scan->opaque = so; + if (skipPrefix > 0) + { + so->skipData = (BTSkip) palloc0(sizeof(BTSkipData)); + so->skipData->prefix = skipPrefix; + + elog(DEBUG1, "skip prefix: %d", skipPrefix); + } + return scan; } @@ -440,6 +472,15 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, _bt_preprocess_array_keys(scan); } +/* + * btskip() -- skip to the beginning of the next key prefix + */ +bool +btskip(IndexScanDesc scan, ScanDirection prefixDir, ScanDirection postfixDir) +{ + return _bt_skip(scan, prefixDir, postfixDir); +} + /* * btendscan() -- close down a scan */ @@ -473,6 +514,8 @@ btendscan(IndexScanDesc scan) if (so->currTuples != NULL) pfree(so->currTuples); /* so->markTuples should not be pfree'd, see btrescan */ + if (_bt_skip_enabled(so)) + pfree(so->skipData); pfree(so); } @@ -556,6 +599,9 @@ btrestrpos(IndexScanDesc scan) if (so->currTuples) memcpy(so->currTuples, so->markTuples, so->markPos.nextTupleOffset); + if (so->skipData) + memcpy(&so->skipData->curPos, &so->skipData->markPos, + sizeof(BTSkipPosData)); } else BTScanPosInvalidate(so->currPos); diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index d1177d8772..faff9d8652 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -17,19 +17,17 @@ #include "access/nbtree.h" #include "access/relscan.h" +#include "catalog/catalog.h" #include "miscadmin.h" #include "pgstat.h" #include "storage/predicate.h" +#include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/rel.h" -static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); -static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum); -static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, - OffsetNumber offnum); static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup); static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, @@ -38,14 +36,12 @@ static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, int tupleOffset); -static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); -static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); -static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); -static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir); - +static inline bool _bt_checkkeys_extended(IndexScanDesc scan, IndexTuple tuple, int tupnatts, + ScanDirection dir, bool isRegularMode, + bool *continuescan, int *prefixskipindex); /* * _bt_drop_lock_and_maybe_pin() @@ -61,7 +57,7 @@ static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir); * will remain in shared memory for as long as it takes to scan the index * buffer page. */ -static void +void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) { _bt_unlockbuf(scan->indexRelation, sp->buf); @@ -339,7 +335,7 @@ _bt_moveright(Relation rel, * the given page. _bt_binsrch() has no lock or refcount side effects * on the buffer. */ -static OffsetNumber +OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf) @@ -845,25 +841,23 @@ _bt_compare(Relation rel, * in locating the scan start position. */ bool -_bt_first(IndexScanDesc scan, ScanDirection dir) +_bt_first(IndexScanDesc scan, ScanDirection prefixDir, ScanDirection postfixDir) { Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; Buffer buf; BTStack stack; OffsetNumber offnum; - StrategyNumber strat; - bool nextkey; bool goback; BTScanInsertData inskey; ScanKey startKeys[INDEX_MAX_KEYS]; ScanKeyData notnullkeys[INDEX_MAX_KEYS]; int keysCount = 0; - int i; bool status; StrategyNumber strat_total; BTScanPosItem *currItem; BlockNumber blkno; + IndexTuple itup; Assert(!BTScanPosIsValid(so->currPos)); @@ -904,184 +898,13 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) } else if (blkno != InvalidBlockNumber) { - if (!_bt_parallel_readpage(scan, blkno, dir)) + if (!_bt_parallel_readpage(scan, blkno, prefixDir)) return false; goto readcomplete; } } - /*---------- - * Examine the scan keys to discover where we need to start the scan. - * - * We want to identify the keys that can be used as starting boundaries; - * these are =, >, or >= keys for a forward scan or =, <, <= keys for - * a backwards scan. We can use keys for multiple attributes so long as - * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept - * a > or < boundary or find an attribute with no boundary (which can be - * thought of as the same as "> -infinity"), we can't use keys for any - * attributes to its right, because it would break our simplistic notion - * of what initial positioning strategy to use. - * - * When the scan keys include cross-type operators, _bt_preprocess_keys - * may not be able to eliminate redundant keys; in such cases we will - * arbitrarily pick a usable one for each attribute. This is correct - * but possibly not optimal behavior. (For example, with keys like - * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when - * x=5 would be more efficient.) Since the situation only arises given - * a poorly-worded query plus an incomplete opfamily, live with it. - * - * When both equality and inequality keys appear for a single attribute - * (again, only possible when cross-type operators appear), we *must* - * select one of the equality keys for the starting point, because - * _bt_checkkeys() will stop the scan as soon as an equality qual fails. - * For example, if we have keys like "x >= 4 AND x = 10" and we elect to - * start at x=4, we will fail and stop before reaching x=10. If multiple - * equality quals survive preprocessing, however, it doesn't matter which - * one we use --- by definition, they are either redundant or - * contradictory. - * - * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier. - * If the index stores nulls at the end of the index we'll be starting - * from, and we have no boundary key for the column (which means the key - * we deduced NOT NULL from is an inequality key that constrains the other - * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to - * use as a boundary key. If we didn't do this, we might find ourselves - * traversing a lot of null entries at the start of the scan. - * - * In this loop, row-comparison keys are treated the same as keys on their - * first (leftmost) columns. We'll add on lower-order columns of the row - * comparison below, if possible. - * - * The selected scan keys (at most one per index column) are remembered by - * storing their addresses into the local startKeys[] array. - *---------- - */ - strat_total = BTEqualStrategyNumber; - if (so->numberOfKeys > 0) - { - AttrNumber curattr; - ScanKey chosen; - ScanKey impliesNN; - ScanKey cur; - - /* - * chosen is the so-far-chosen key for the current attribute, if any. - * We don't cast the decision in stone until we reach keys for the - * next attribute. - */ - curattr = 1; - chosen = NULL; - /* Also remember any scankey that implies a NOT NULL constraint */ - impliesNN = NULL; - - /* - * Loop iterates from 0 to numberOfKeys inclusive; we use the last - * pass to handle after-last-key processing. Actual exit from the - * loop is at one of the "break" statements below. - */ - for (cur = so->keyData, i = 0;; cur++, i++) - { - if (i >= so->numberOfKeys || cur->sk_attno != curattr) - { - /* - * Done looking at keys for curattr. If we didn't find a - * usable boundary key, see if we can deduce a NOT NULL key. - */ - if (chosen == NULL && impliesNN != NULL && - ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? - ScanDirectionIsForward(dir) : - ScanDirectionIsBackward(dir))) - { - /* Yes, so build the key in notnullkeys[keysCount] */ - chosen = ¬nullkeys[keysCount]; - ScanKeyEntryInitialize(chosen, - (SK_SEARCHNOTNULL | SK_ISNULL | - (impliesNN->sk_flags & - (SK_BT_DESC | SK_BT_NULLS_FIRST))), - curattr, - ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? - BTGreaterStrategyNumber : - BTLessStrategyNumber), - InvalidOid, - InvalidOid, - InvalidOid, - (Datum) 0); - } - - /* - * If we still didn't find a usable boundary key, quit; else - * save the boundary key pointer in startKeys. - */ - if (chosen == NULL) - break; - startKeys[keysCount++] = chosen; - - /* - * Adjust strat_total, and quit if we have stored a > or < - * key. - */ - strat = chosen->sk_strategy; - if (strat != BTEqualStrategyNumber) - { - strat_total = strat; - if (strat == BTGreaterStrategyNumber || - strat == BTLessStrategyNumber) - break; - } - - /* - * Done if that was the last attribute, or if next key is not - * in sequence (implying no boundary key is available for the - * next attribute). - */ - if (i >= so->numberOfKeys || - cur->sk_attno != curattr + 1) - break; - - /* - * Reset for next attr. - */ - curattr = cur->sk_attno; - chosen = NULL; - impliesNN = NULL; - } - - /* - * Can we use this key as a starting boundary for this attr? - * - * If not, does it imply a NOT NULL constraint? (Because - * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber, - * *any* inequality key works for that; we need not test.) - */ - switch (cur->sk_strategy) - { - case BTLessStrategyNumber: - case BTLessEqualStrategyNumber: - if (chosen == NULL) - { - if (ScanDirectionIsBackward(dir)) - chosen = cur; - else - impliesNN = cur; - } - break; - case BTEqualStrategyNumber: - /* override any non-equality choice */ - chosen = cur; - break; - case BTGreaterEqualStrategyNumber: - case BTGreaterStrategyNumber: - if (chosen == NULL) - { - if (ScanDirectionIsForward(dir)) - chosen = cur; - else - impliesNN = cur; - } - break; - } - } - } + keysCount = _bt_choose_scan_keys(so->keyData, so->numberOfKeys, prefixDir, startKeys, notnullkeys, &strat_total, 0); /* * If we found no usable boundary keys, we have to start from one end of @@ -1092,260 +915,112 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) { bool match; - match = _bt_endpoint(scan, dir); - - if (!match) + if (!_bt_skip_enabled(so)) { - /* No match, so mark (parallel) scan finished */ - _bt_parallel_done(scan); - } + match = _bt_endpoint(scan, prefixDir); - return match; - } + if (!match) + { + /* No match, so mark (parallel) scan finished */ + _bt_parallel_done(scan); + } - /* - * We want to start the scan somewhere within the index. Set up an - * insertion scankey we can use to search for the boundary point we - * identified above. The insertion scankey is built using the keys - * identified by startKeys[]. (Remaining insertion scankey fields are - * initialized after initial-positioning strategy is finalized.) - */ - Assert(keysCount <= INDEX_MAX_KEYS); - for (i = 0; i < keysCount; i++) - { - ScanKey cur = startKeys[i]; + return match; + } + else + { + Relation rel = scan->indexRelation; + Buffer buf; + Page page; + BTPageOpaque opaque; + OffsetNumber start; + BTSkipCompareResult cmp = {0}; - Assert(cur->sk_attno == i + 1); + _bt_skip_create_scankeys(rel, so); - if (cur->sk_flags & SK_ROW_HEADER) - { /* - * Row comparison header: look to the first row member instead. - * - * The member scankeys are already in insertion format (ie, they - * have sk_func = 3-way-comparison function), but we have to watch - * out for nulls, which _bt_preprocess_keys didn't check. A null - * in the first row member makes the condition unmatchable, just - * like qual_ok = false. + * Scan down to the leftmost or rightmost leaf page and position + * the scan on the leftmost or rightmost item on that page. + * Start the skip scan from there to find the first matching item */ - ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument); + buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(prefixDir), scan->xs_snapshot); - Assert(subkey->sk_flags & SK_ROW_MEMBER); - if (subkey->sk_flags & SK_ISNULL) + if (!BufferIsValid(buf)) { - _bt_parallel_done(scan); + /* + * Empty index. Lock the whole relation, as nothing finer to lock + * exists. + */ + PredicateLockRelation(rel, scan->xs_snapshot); + BTScanPosInvalidate(so->currPos); return false; } - memcpy(inskey.scankeys + i, subkey, sizeof(ScanKeyData)); - /* - * If the row comparison is the last positioning key we accepted, - * try to add additional keys from the lower-order row members. - * (If we accepted independent conditions on additional index - * columns, we use those instead --- doesn't seem worth trying to - * determine which is more restrictive.) Note that this is OK - * even if the row comparison is of ">" or "<" type, because the - * condition applied to all but the last row member is effectively - * ">=" or "<=", and so the extra keys don't break the positioning - * scheme. But, by the same token, if we aren't able to use all - * the row members, then the part of the row comparison that we - * did use has to be treated as just a ">=" or "<=" condition, and - * so we'd better adjust strat_total accordingly. - */ - if (i == keysCount - 1) + PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot); + page = BufferGetPage(buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + Assert(P_ISLEAF(opaque)); + + if (ScanDirectionIsForward(prefixDir)) { - bool used_all_subkeys = false; + /* There could be dead pages to the left, so not this: */ + /* Assert(P_LEFTMOST(opaque)); */ - Assert(!(subkey->sk_flags & SK_ROW_END)); - for (;;) - { - subkey++; - Assert(subkey->sk_flags & SK_ROW_MEMBER); - if (subkey->sk_attno != keysCount + 1) - break; /* out-of-sequence, can't use it */ - if (subkey->sk_strategy != cur->sk_strategy) - break; /* wrong direction, can't use it */ - if (subkey->sk_flags & SK_ISNULL) - break; /* can't use null keys */ - Assert(keysCount < INDEX_MAX_KEYS); - memcpy(inskey.scankeys + keysCount, subkey, - sizeof(ScanKeyData)); - keysCount++; - if (subkey->sk_flags & SK_ROW_END) - { - used_all_subkeys = true; - break; - } - } - if (!used_all_subkeys) - { - switch (strat_total) - { - case BTLessStrategyNumber: - strat_total = BTLessEqualStrategyNumber; - break; - case BTGreaterStrategyNumber: - strat_total = BTGreaterEqualStrategyNumber; - break; - } - } - break; /* done with outer loop */ + start = P_FIRSTDATAKEY(opaque); } - } - else - { - /* - * Ordinary comparison key. Transform the search-style scan key - * to an insertion scan key by replacing the sk_func with the - * appropriate btree comparison function. - * - * If scankey operator is not a cross-type comparison, we can use - * the cached comparison function; otherwise gotta look it up in - * the catalogs. (That can't lead to infinite recursion, since no - * indexscan initiated by syscache lookup will use cross-data-type - * operators.) - * - * We support the convention that sk_subtype == InvalidOid means - * the opclass input type; this is a hack to simplify life for - * ScanKeyInit(). - */ - if (cur->sk_subtype == rel->rd_opcintype[i] || - cur->sk_subtype == InvalidOid) + else if (ScanDirectionIsBackward(prefixDir)) { - FmgrInfo *procinfo; - - procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC); - ScanKeyEntryInitializeWithInfo(inskey.scankeys + i, - cur->sk_flags, - cur->sk_attno, - InvalidStrategy, - cur->sk_subtype, - cur->sk_collation, - procinfo, - cur->sk_argument); + Assert(P_RIGHTMOST(opaque)); + + start = PageGetMaxOffsetNumber(page); } else { - RegProcedure cmp_proc; - - cmp_proc = get_opfamily_proc(rel->rd_opfamily[i], - rel->rd_opcintype[i], - cur->sk_subtype, - BTORDER_PROC); - if (!RegProcedureIsValid(cmp_proc)) - elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"", - BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype, - cur->sk_attno, RelationGetRelationName(rel)); - ScanKeyEntryInitialize(inskey.scankeys + i, - cur->sk_flags, - cur->sk_attno, - InvalidStrategy, - cur->sk_subtype, - cur->sk_collation, - cmp_proc, - cur->sk_argument); + elog(ERROR, "invalid scan direction: %d", (int) prefixDir); } - } - } - /*---------- - * Examine the selected initial-positioning strategy to determine exactly - * where we need to start the scan, and set flag variables to control the - * code below. - * - * If nextkey = false, _bt_search and _bt_binsrch will locate the first - * item >= scan key. If nextkey = true, they will locate the first - * item > scan key. - * - * If goback = true, we will then step back one item, while if - * goback = false, we will start the scan on the located item. - *---------- - */ - switch (strat_total) - { - case BTLessStrategyNumber: - - /* - * Find first item >= scankey, then back up one to arrive at last - * item < scankey. (Note: this positioning strategy is only used - * for a backward scan, so that is always the correct starting - * position.) - */ - nextkey = false; - goback = true; - break; - - case BTLessEqualStrategyNumber: - - /* - * Find first item > scankey, then back up one to arrive at last - * item <= scankey. (Note: this positioning strategy is only used - * for a backward scan, so that is always the correct starting - * position.) - */ - nextkey = true; - goback = true; - break; - - case BTEqualStrategyNumber: - - /* - * If a backward scan was specified, need to start with last equal - * item not first one. + /* remember which buffer we have pinned */ + so->currPos.buf = buf; + so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); + + itup = _bt_get_tuple_from_offset(so, start); + /* in some cases, we can (or have to) skip further inside the prefix. + * we can do this if we have extra quals becoming available, eg. + * WHERE b=2 on an index on (a,b). + * We must, if this is not regular mode (prefixDir!=postfixDir). + * Because this means we're at the end of the prefix, while we should be + * at the beginning. */ - if (ScanDirectionIsBackward(dir)) + if (_bt_has_extra_quals_after_skip(so->skipData, postfixDir, 0) || + !_bt_skip_is_regular_mode(prefixDir, postfixDir)) { - /* - * This is the same as the <= strategy. We will check at the - * end whether the found item is actually =. - */ - nextkey = true; - goback = true; + _bt_skip_extra_conditions(scan, &itup, &start, prefixDir, postfixDir, &cmp); } - else + /* now find the next matching tuple */ + match = _bt_skip_find_next(scan, itup, start, prefixDir, postfixDir); + if (!match) { - /* - * This is the same as the >= strategy. We will check at the - * end whether the found item is actually =. - */ - nextkey = false; - goback = false; + if (_bt_skip_is_always_valid(so)) + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + return false; } - break; - case BTGreaterEqualStrategyNumber: + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); - /* - * Find first item >= scankey. (This is only used for forward - * scans.) - */ - nextkey = false; - goback = false; - break; - - case BTGreaterStrategyNumber: - - /* - * Find first item > scankey. (This is only used for forward - * scans.) - */ - nextkey = true; - goback = false; - break; + currItem = &so->currPos.items[so->currPos.itemIndex]; + scan->xs_heaptid = currItem->heapTid; + if (scan->xs_want_itup) + scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); - default: - /* can't get here, but keep compiler quiet */ - elog(ERROR, "unrecognized strat_total: %d", (int) strat_total); - return false; + return true; + } } - /* Initialize remaining insertion scan key fields */ - _bt_metaversion(rel, &inskey.heapkeyspace, &inskey.allequalimage); - inskey.anynullkeys = false; /* unused */ - inskey.nextkey = nextkey; - inskey.pivotsearch = false; - inskey.scantid = NULL; - inskey.keysz = keysCount; + if (!_bt_create_insertion_scan_key(rel, prefixDir, startKeys, keysCount, &inskey, &strat_total, &goback)) + { + _bt_parallel_done(scan); + return false; + } /* * Use the manufactured insertion scan key to descend the tree and @@ -1377,7 +1052,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot); - _bt_initialize_more_data(so, dir); + _bt_initialize_more_data(so, prefixDir); /* position to the precise item on the page */ offnum = _bt_binsrch(rel, &inskey, buf); @@ -1407,23 +1082,81 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) Assert(!BTScanPosIsValid(so->currPos)); so->currPos.buf = buf; - /* - * Now load data from the first page of the scan. - */ - if (!_bt_readpage(scan, dir, offnum)) + if (_bt_skip_enabled(so)) { - /* - * There's no actually-matching data on this page. Try to advance to - * the next page. Return false if there's no matching data at all. + Page page; + BTPageOpaque opaque; + OffsetNumber minoff; + bool match; + BTSkipCompareResult cmp = {0}; + + /* first create the skip scan keys */ + _bt_skip_create_scankeys(rel, so); + + /* remember which page we have pinned */ + so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); + + page = BufferGetPage(so->currPos.buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + minoff = P_FIRSTDATAKEY(opaque); + /* _binsrch + goback parameter can leave the offnum before the first item on the page + * or after the last item on the page. if that is the case we need to either step + * back or forward one page */ - _bt_unlockbuf(scan->indexRelation, so->currPos.buf); - if (!_bt_steppage(scan, dir)) + if (offnum < minoff) + { + _bt_unlockbuf(rel, so->currPos.buf); + if (!_bt_step_back_page(scan, &itup, &offnum)) + return false; + page = BufferGetPage(so->currPos.buf); + } + else if (offnum > PageGetMaxOffsetNumber(page)) + { + BlockNumber next = opaque->btpo_next; + _bt_unlockbuf(rel, so->currPos.buf); + if (!_bt_step_forward_page(scan, next, &itup, &offnum)) + return false; + page = BufferGetPage(so->currPos.buf); + } + + itup = _bt_get_tuple_from_offset(so, offnum); + /* check if we can skip even more because we can use new conditions */ + if (_bt_has_extra_quals_after_skip(so->skipData, postfixDir, inskey.keysz) || + !_bt_skip_is_regular_mode(prefixDir, postfixDir)) + { + _bt_skip_extra_conditions(scan, &itup, &offnum, prefixDir, postfixDir, &cmp); + } + /* now find the tuple */ + match = _bt_skip_find_next(scan, itup, offnum, prefixDir, postfixDir); + if (!match) + { + if (_bt_skip_is_always_valid(so)) + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); return false; + } + + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); } else { - /* Drop the lock, and maybe the pin, on the current page */ - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + /* + * Now load data from the first page of the scan. + */ + if (!_bt_readpage(scan, prefixDir, &offnum, true)) + { + /* + * There's no actually-matching data on this page. Try to advance to + * the next page. Return false if there's no matching data at all. + */ + _bt_unlockbuf(scan->indexRelation, so->currPos.buf); + if (!_bt_steppage(scan, prefixDir)) + return false; + } + else + { + /* Drop the lock, and maybe the pin, on the current page */ + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + } } readcomplete: @@ -1451,29 +1184,113 @@ readcomplete: * so->currPos.buf to InvalidBuffer. */ bool -_bt_next(IndexScanDesc scan, ScanDirection dir) +_bt_next(IndexScanDesc scan, ScanDirection prefixDir, ScanDirection postfixDir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; BTScanPosItem *currItem; - /* - * Advance to next tuple on current page; or if there's no more, try to - * step to the next page with data. - */ - if (ScanDirectionIsForward(dir)) + if (!_bt_skip_enabled(so)) { - if (++so->currPos.itemIndex > so->currPos.lastItem) + /* + * Advance to next tuple on current page; or if there's no more, try to + * step to the next page with data. + */ + if (ScanDirectionIsForward(prefixDir)) { - if (!_bt_steppage(scan, dir)) - return false; + if (++so->currPos.itemIndex > so->currPos.lastItem) + { + if (!_bt_steppage(scan, prefixDir)) + return false; + } + } + else + { + if (--so->currPos.itemIndex < so->currPos.firstItem) + { + if (!_bt_steppage(scan, prefixDir)) + return false; + } } } else { - if (--so->currPos.itemIndex < so->currPos.firstItem) + bool match; + IndexTuple itup = NULL; + OffsetNumber offnum = InvalidOffsetNumber; + + if (ScanDirectionIsForward(postfixDir)) { - if (!_bt_steppage(scan, dir)) - return false; + if (++so->currPos.itemIndex > so->currPos.lastItem) + { + if (prefixDir != so->skipData->curPos.nextDirection) + { + /* this happens when doing a cursor scan and changing + * direction in the meantime. eg. first fetch forwards, + * then backwards. + * we *always* just go to the next page instead of skipping, + * because that's the only safe option. + */ + so->skipData->curPos.nextAction = SkipStateNext; + so->skipData->curPos.nextDirection = prefixDir; + } + + if (so->skipData->curPos.nextAction == SkipStateNext) + { + /* we should just go forwards one page, no skipping is necessary */ + if (!_bt_step_forward_page(scan, so->currPos.nextPage, &itup, &offnum)) + return false; + } + else if (so->skipData->curPos.nextAction == SkipStateStop) + { + /* we've reached the end of the index, or we cannot find any more keys */ + BTScanPosUnpinIfPinned(so->currPos); + BTScanPosInvalidate(so->currPos); + return false; + } + + /* now find the next tuple */ + match = _bt_skip_find_next(scan, itup, offnum, prefixDir, postfixDir); + if (!match) + { + if (_bt_skip_is_always_valid(so)) + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + return false; + } + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + } + } + else + { + if (--so->currPos.itemIndex < so->currPos.firstItem) + { + if (prefixDir != so->skipData->curPos.nextDirection) + { + so->skipData->curPos.nextAction = SkipStateNext; + so->skipData->curPos.nextDirection = prefixDir; + } + + if (so->skipData->curPos.nextAction == SkipStateNext) + { + if (!_bt_step_back_page(scan, &itup, &offnum)) + return false; + } + else if (so->skipData->curPos.nextAction == SkipStateStop) + { + BTScanPosUnpinIfPinned(so->currPos); + BTScanPosInvalidate(so->currPos); + return false; + } + + /* now find the next tuple */ + match = _bt_skip_find_next(scan, itup, offnum, prefixDir, postfixDir); + if (!match) + { + if (_bt_skip_is_always_valid(so)) + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + return false; + } + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + } } } @@ -1505,8 +1322,8 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) * * Returns true if any matching items found on the page, false if none. */ -static bool -_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) +bool +_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber *offnum, bool isRegularMode) { BTScanOpaque so = (BTScanOpaque) scan->opaque; Page page; @@ -1516,6 +1333,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) int itemIndex; bool continuescan; int indnatts; + int prefixskipindex; /* * We must have the buffer pinned and locked, but the usual macro can't be @@ -1574,11 +1392,11 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) /* load items[] in ascending order */ itemIndex = 0; - offnum = Max(offnum, minoff); + *offnum = Max(*offnum, minoff); - while (offnum <= maxoff) + while (*offnum <= maxoff) { - ItemId iid = PageGetItemId(page, offnum); + ItemId iid = PageGetItemId(page, *offnum); IndexTuple itup; /* @@ -1587,19 +1405,19 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) */ if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) { - offnum = OffsetNumberNext(offnum); + *offnum = OffsetNumberNext(*offnum); continue; } itup = (IndexTuple) PageGetItem(page, iid); - if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan)) + if (_bt_checkkeys_extended(scan, itup, indnatts, dir, isRegularMode, &continuescan, &prefixskipindex)) { /* tuple passes all scan key conditions */ if (!BTreeTupleIsPosting(itup)) { /* Remember it */ - _bt_saveitem(so, itemIndex, offnum, itup); + _bt_saveitem(so, itemIndex, *offnum, itup); itemIndex++; } else @@ -1611,26 +1429,30 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) * TID */ tupleOffset = - _bt_setuppostingitems(so, itemIndex, offnum, + _bt_setuppostingitems(so, itemIndex, *offnum, BTreeTupleGetPostingN(itup, 0), itup); itemIndex++; /* Remember additional TIDs */ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) { - _bt_savepostingitem(so, itemIndex, offnum, + _bt_savepostingitem(so, itemIndex, *offnum, BTreeTupleGetPostingN(itup, i), tupleOffset); itemIndex++; } } } + + *offnum = OffsetNumberNext(*offnum); + /* When !continuescan, there can't be any more matches, so stop */ if (!continuescan) break; - - offnum = OffsetNumberNext(offnum); + if (!isRegularMode && prefixskipindex != -1) + break; } + *offnum = OffsetNumberPrev(*offnum); /* * We don't need to visit page to the right when the high key @@ -1650,7 +1472,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) int truncatt; truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation); - _bt_checkkeys(scan, itup, truncatt, dir, &continuescan); + _bt_checkkeys(scan, itup, truncatt, dir, &continuescan, NULL); } if (!continuescan) @@ -1666,11 +1488,11 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) /* load items[] in descending order */ itemIndex = MaxTIDsPerBTreePage; - offnum = Min(offnum, maxoff); + *offnum = Min(*offnum, maxoff); - while (offnum >= minoff) + while (*offnum >= minoff) { - ItemId iid = PageGetItemId(page, offnum); + ItemId iid = PageGetItemId(page, *offnum); IndexTuple itup; bool tuple_alive; bool passes_quals; @@ -1687,10 +1509,10 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) */ if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) { - Assert(offnum >= P_FIRSTDATAKEY(opaque)); - if (offnum > P_FIRSTDATAKEY(opaque)) + Assert(*offnum >= P_FIRSTDATAKEY(opaque)); + if (*offnum > P_FIRSTDATAKEY(opaque)) { - offnum = OffsetNumberPrev(offnum); + *offnum = OffsetNumberPrev(*offnum); continue; } @@ -1701,8 +1523,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) itup = (IndexTuple) PageGetItem(page, iid); - passes_quals = _bt_checkkeys(scan, itup, indnatts, dir, - &continuescan); + passes_quals = _bt_checkkeys_extended(scan, itup, indnatts, dir, + isRegularMode, &continuescan, &prefixskipindex); if (passes_quals && tuple_alive) { /* tuple passes all scan key conditions */ @@ -1710,7 +1532,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) { /* Remember it */ itemIndex--; - _bt_saveitem(so, itemIndex, offnum, itup); + _bt_saveitem(so, itemIndex, *offnum, itup); } else { @@ -1728,28 +1550,32 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) */ itemIndex--; tupleOffset = - _bt_setuppostingitems(so, itemIndex, offnum, + _bt_setuppostingitems(so, itemIndex, *offnum, BTreeTupleGetPostingN(itup, 0), itup); /* Remember additional TIDs */ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) { itemIndex--; - _bt_savepostingitem(so, itemIndex, offnum, + _bt_savepostingitem(so, itemIndex, *offnum, BTreeTupleGetPostingN(itup, i), tupleOffset); } } } + + *offnum = OffsetNumberPrev(*offnum); + if (!continuescan) { /* there can't be any more matches, so stop */ so->currPos.moreLeft = false; break; } - - offnum = OffsetNumberPrev(offnum); + if (!isRegularMode && prefixskipindex != -1) + break; } + *offnum = OffsetNumberNext(*offnum); Assert(itemIndex >= 0); so->currPos.firstItem = itemIndex; @@ -1857,7 +1683,7 @@ _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, * read lock, on that page. If we do not hold the pin, we set so->currPos.buf * to InvalidBuffer. We return true to indicate success. */ -static bool +bool _bt_steppage(IndexScanDesc scan, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; @@ -1885,6 +1711,9 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) if (so->markTuples) memcpy(so->markTuples, so->currTuples, so->currPos.nextTupleOffset); + if (so->skipData) + memcpy(&so->skipData->markPos, &so->skipData->curPos, + sizeof(BTSkipPosData)); so->markPos.itemIndex = so->markItemIndex; so->markItemIndex = -1; } @@ -1964,7 +1793,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) * If there are no more matching records in the given direction, we drop all * locks and pins, set so->currPos.buf to InvalidBuffer, and return false. */ -static bool +bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; @@ -1972,6 +1801,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) Page page; BTPageOpaque opaque; bool status; + OffsetNumber offnum; rel = scan->indexRelation; @@ -2002,7 +1832,8 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) PredicateLockPage(rel, blkno, scan->xs_snapshot); /* see if there are any matches on this page */ /* note that this will clear moreRight if we can stop */ - if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque))) + offnum = P_FIRSTDATAKEY(opaque); + if (_bt_readpage(scan, dir, &offnum, true)) break; } else if (scan->parallel_scan != NULL) @@ -2104,7 +1935,8 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot); /* see if there are any matches on this page */ /* note that this will clear moreLeft if we can stop */ - if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page))) + offnum = PageGetMaxOffsetNumber(page); + if (_bt_readpage(scan, dir, &offnum, true)) break; } else if (scan->parallel_scan != NULL) @@ -2172,7 +2004,7 @@ _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) * to be half-dead; the caller should check that condition and step left * again if it's important. */ -static Buffer +Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot) { Page page; @@ -2436,7 +2268,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) /* * Now load data from the first page of the scan. */ - if (!_bt_readpage(scan, dir, start)) + if (!_bt_readpage(scan, dir, &start, true)) { /* * There's no actually-matching data on this page. Try to advance to @@ -2465,7 +2297,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) * _bt_initialize_more_data() -- initialize moreLeft/moreRight appropriately * for scan direction */ -static inline void +inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir) { /* initialize moreLeft/moreRight appropriately for scan direction */ @@ -2482,3 +2314,25 @@ _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir) so->numKilled = 0; /* just paranoia */ so->markItemIndex = -1; /* ditto */ } + +/* Forward the call to either _bt_checkkeys, which is a simple + * and fastest way of checking keys, or to _bt_checkkeys_skip, + * which is a slower way to check the keys, but it will return extra + * information about whether or not we should stop reading the current page + * and skip. The expensive checking is only necessary when !isRegularMode, eg. + * when prefixDir!=postfixDir, which only happens when scanning from cursors backwards + */ +static inline bool +_bt_checkkeys_extended(IndexScanDesc scan, IndexTuple tuple, int tupnatts, + ScanDirection dir, bool isRegularMode, + bool *continuescan, int *prefixskipindex) +{ + if (isRegularMode) + { + return _bt_checkkeys(scan, tuple, tupnatts, dir, continuescan, prefixskipindex); + } + else + { + return _bt_checkkeys_skip(scan, tuple, tupnatts, dir, continuescan, prefixskipindex); + } +} diff --git a/src/backend/access/nbtree/nbtskip.c b/src/backend/access/nbtree/nbtskip.c new file mode 100644 index 0000000000..e2dbaf2e69 --- /dev/null +++ b/src/backend/access/nbtree/nbtskip.c @@ -0,0 +1,1455 @@ +/*------------------------------------------------------------------------- + * + * nbtskip.c + * Search code related to skip scan for postgres btrees. + * + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtskip.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/relscan.h" +#include "catalog/catalog.h" +#include "miscadmin.h" +#include "utils/guc.h" +#include "storage/predicate.h" +#include "utils/lsyscache.h" +#include "utils/rel.h" + +static inline void _bt_update_scankey_with_tuple(BTScanInsert scankeys, + Relation indexRel, IndexTuple itup, int numattrs); +static inline bool _bt_scankey_within_page(IndexScanDesc scan, BTScanInsert key, Buffer buf); +static inline int32 _bt_compare_until(Relation rel, BTScanInsert key, IndexTuple itup, int prefix); +static inline void +_bt_determine_next_action(IndexScanDesc scan, BTSkipCompareResult *cmp, OffsetNumber firstOffnum, + OffsetNumber lastOffnum, ScanDirection postfixDir, BTSkipState *nextAction); +static inline void +_bt_determine_next_action_after_skip(BTScanOpaque so, BTSkipCompareResult *cmp, ScanDirection prefixDir, + ScanDirection postfixDir, int skipped, BTSkipState *nextAction); +static inline void +_bt_determine_next_action_after_skip_extra(BTScanOpaque so, BTSkipCompareResult *cmp, BTSkipState *nextAction); +static inline void _bt_copy_scankey(BTScanInsert to, BTScanInsert from, int numattrs); +static inline IndexTuple _bt_get_tuple_from_offset_with_copy(BTScanOpaque so, OffsetNumber curTupleOffnum); + +static void _bt_skip_update_scankey_after_read(IndexScanDesc scan, IndexTuple curTuple, + ScanDirection prefixDir, ScanDirection postfixDir); +static void _bt_skip_update_scankey_for_prefix_skip(IndexScanDesc scan, Relation indexRel, + int prefix, IndexTuple itup, ScanDirection prefixDir); +static bool _bt_try_in_page_skip(IndexScanDesc scan, ScanDirection prefixDir); +static void debug_print(IndexTuple itup, BTScanInsert scanKey, Relation rel, char *extra); + +/* probably to be removed but useful for debugging during patch implementation */ +static void debug_print(IndexTuple itup, BTScanInsert scanKey, Relation rel, char *extra) +{ + bool isnull[INDEX_MAX_KEYS]; + Datum values[INDEX_MAX_KEYS]; + char *lkey_desc = NULL; + + /* Avoid infinite recursion -- don't instrument catalog indexes */ + if (!IsCatalogRelation(rel)) + { + TupleDesc itupdesc = RelationGetDescr(rel); + int natts; + int indnkeyatts = rel->rd_index->indnkeyatts; + + Oid typOutput; + bool varlenatype; + char *val; + int i; + + char buf[8096] = {0}; + int idx = 0; + + if (itup != NULL) + { + natts = BTreeTupleGetNAtts(itup, rel); + itupdesc->natts = Min(indnkeyatts, natts); + memset(&isnull, 0xFF, sizeof(isnull)); + index_deform_tuple(itup, itupdesc, values, isnull); + + rel->rd_index->indnkeyatts = natts; + + /* + * Since the regression tests should pass when the instrumentation + * patch is applied, be prepared for BuildIndexValueDescription() to + * return NULL due to security considerations. + */ + lkey_desc = BuildIndexValueDescription(rel, values, isnull); + } + + for (i = 0; i < scanKey->keysz; i++) + { + ScanKey cur = &scanKey->scankeys[i]; + + if (i != 0) + { + buf[idx] = ','; + idx++; + } + + if (!(cur->sk_flags & SK_ISNULL)) + { + if (cur->sk_subtype != InvalidOid) + getTypeOutputInfo(cur->sk_subtype, + &typOutput, &varlenatype); + else + getTypeOutputInfo(rel->rd_opcintype[i], + &typOutput, &varlenatype); + val = OidOutputFunctionCall(typOutput, cur->sk_argument); + if (val) + { + unsigned long tocopy = strnlen(val, 15); + memcpy(buf + idx, val, tocopy); + idx += tocopy; + pfree(val); + } + else + { + memcpy(buf + idx, "n/a", 3); + idx += 3; + } + } + else + { + memcpy(buf + idx, "null", 4); + idx += 4; + } + } + buf[idx] = 0; + + elog(DEBUG1, "%s : %s tuple(%s) sk(%s)", + extra, RelationGetRelationName(rel), lkey_desc ? lkey_desc : "N/A", buf); + + /* Cleanup */ + itupdesc->natts = IndexRelationGetNumberOfAttributes(rel); + rel->rd_index->indnkeyatts = indnkeyatts; + if (lkey_desc) + pfree(lkey_desc); + } +} + +/* + * returns whether we're at the end of a scan. + * the scan position can be invalid even though we still + * should continue the scan. this happens for example when + * we're scanning with prefixDir!=postfixDir. when looking at the first + * prefix, we traverse the items within the prefix from max to min. + * if none of them match, we actually run off the start of the index, + * meaning none of the tuples within this prefix match. the scan pos becomes + * invalid, however, we do need to look further to the next prefix. + * therefore, this function still returns true in this particular case. + */ +static inline bool +_bt_skip_is_valid(BTScanOpaque so, ScanDirection prefixDir, ScanDirection postfixDir) +{ + return BTScanPosIsValid(so->currPos) || + (!_bt_skip_is_regular_mode(prefixDir, postfixDir) && + so->skipData->curPos.nextAction != SkipStateStop); +} + +/* try finding the next tuple to skip to within the local tuple storage. + * local tuple storage is filled during _bt_readpage with all matching + * tuples on that page. if we can find the next prefix here it saves + * us doing a scan from root. + * Note that this optimization only works with _bt_regular_mode == true + * If this is not the case, the local tuple workspace will always only + * contain tuples of one specific prefix (_bt_readpage will stop at + * the end of a prefx) + */ +static bool +_bt_try_in_page_skip(IndexScanDesc scan, ScanDirection prefixDir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanPosItem *currItem; + BTSkip skip = so->skipData; + IndexTuple itup = NULL; + bool goback; + int low, high, starthigh, startlow; + int32 result, + cmpval; + BTScanInsert key = &so->skipData->curPos.skipScanKey; + + currItem = &so->currPos.items[so->currPos.itemIndex]; + itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + + _bt_skip_update_scankey_for_prefix_skip(scan, scan->indexRelation, skip->prefix, itup, prefixDir); + + _bt_set_bsearch_flags(key->scankeys[key->keysz - 1].sk_strategy, prefixDir, &key->nextkey, &goback); + + /* Requesting nextkey semantics while using scantid seems nonsensical */ + Assert(!key->nextkey || key->scantid == NULL); + /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */ + + startlow = low = ScanDirectionIsForward(prefixDir) ? so->currPos.itemIndex + 1 : so->currPos.firstItem; + starthigh = high = ScanDirectionIsForward(prefixDir) ? so->currPos.lastItem : so->currPos.itemIndex - 1; + + /* + * If there are no keys on the page, return the first available slot. Note + * this covers two cases: the page is really empty (no keys), or it + * contains only a high key. The latter case is possible after vacuuming. + * This can never happen on an internal page, however, since they are + * never empty (an internal page must have children). + */ + if (unlikely(high < low)) + return false; + + /* + * Binary search to find the first key on the page >= scan key, or first + * key > scankey when nextkey is true. + * + * For nextkey=false (cmpval=1), the loop invariant is: all slots before + * 'low' are < scan key, all slots at or after 'high' are >= scan key. + * + * For nextkey=true (cmpval=0), the loop invariant is: all slots before + * 'low' are <= scan key, all slots at or after 'high' are > scan key. + * + * We can fall out when high == low. + */ + high++; /* establish the loop invariant for high */ + + cmpval = key->nextkey ? 0 : 1; /* select comparison value */ + + while (high > low) + { + int mid = low + ((high - low) / 2); + + /* We have low <= mid < high, so mid points at a real slot */ + + currItem = &so->currPos.items[mid]; + itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + result = _bt_compare_until(scan->indexRelation, key, itup, skip->prefix); + + if (result >= cmpval) + low = mid + 1; + else + high = mid; + } + + if (high > starthigh) + return false; + + if (goback) + { + low--; + if (low < startlow) + return false; + } + + so->currPos.itemIndex = low; + + if (DEBUG1 >= log_min_messages || DEBUG1 >= client_min_messages) + { + currItem = &so->currPos.items[low]; + itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + debug_print(itup, &so->skipData->curPos.skipScanKey, scan->indexRelation, "skip-in-page"); + } + + return true; +} + +/* + * _bt_skip() -- Skip items that have the same prefix as the most recently + * fetched index tuple. + * + * in: pinned, not locked + * out: pinned, not locked (unless end of scan, then unpinned) + */ +bool +_bt_skip(IndexScanDesc scan, ScanDirection prefixDir, ScanDirection postfixDir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanPosItem *currItem; + IndexTuple itup = NULL; + OffsetNumber curTupleOffnum = InvalidOffsetNumber; + BTSkipCompareResult cmp; + BTSkip skip = so->skipData; + OffsetNumber first; + + /* in page skip only works when prefixDir == postfixDir */ + if (!_bt_skip_is_regular_mode(prefixDir, postfixDir) || !_bt_try_in_page_skip(scan, prefixDir)) + { + currItem = &so->currPos.items[so->currPos.itemIndex]; + itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + + so->skipData->curPos.nextSkipIndex = so->skipData->prefix; + _bt_skip_once(scan, &itup, &curTupleOffnum, true, prefixDir, postfixDir); + _bt_skip_until_match(scan, &itup, &curTupleOffnum, prefixDir, postfixDir); + if (!_bt_skip_is_always_valid(so)) + return false; + + first = curTupleOffnum; + _bt_readpage(scan, postfixDir, &curTupleOffnum, _bt_skip_is_regular_mode(prefixDir, postfixDir)); + if (DEBUG2 >= log_min_messages || DEBUG2 >= client_min_messages) + { + print_itup(BufferGetBlockNumber(so->currPos.buf), _bt_get_tuple_from_offset(so, first), NULL, scan->indexRelation, + "first item on page compared after skip"); + print_itup(BufferGetBlockNumber(so->currPos.buf), _bt_get_tuple_from_offset(so, curTupleOffnum), NULL, scan->indexRelation, + "last item on page compared after skip"); + } + _bt_compare_current_item(scan, _bt_get_tuple_from_offset(so, curTupleOffnum), + IndexRelationGetNumberOfAttributes(scan->indexRelation), + postfixDir, _bt_skip_is_regular_mode(prefixDir, postfixDir), &cmp); + _bt_determine_next_action(scan, &cmp, first, curTupleOffnum, postfixDir, &skip->curPos.nextAction); + skip->curPos.nextDirection = prefixDir; + skip->curPos.nextSkipIndex = cmp.prefixSkipIndex; + _bt_skip_update_scankey_after_read(scan, _bt_get_tuple_from_offset(so, curTupleOffnum), prefixDir, postfixDir); + + _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + } + + /* prepare for the call to _bt_next, because _bt_next increments this to get to the tuple we want to be at */ + if (ScanDirectionIsForward(postfixDir)) + so->currPos.itemIndex--; + else + so->currPos.itemIndex++; + + return true; +} + +IndexTuple +_bt_get_tuple_from_offset(BTScanOpaque so, OffsetNumber curTupleOffnum) +{ + Page page = BufferGetPage(so->currPos.buf); + return (IndexTuple) PageGetItem(page, PageGetItemId(page, curTupleOffnum)); +} + +static IndexTuple +_bt_get_tuple_from_offset_with_copy(BTScanOpaque so, OffsetNumber curTupleOffnum) +{ + Page page = BufferGetPage(so->currPos.buf); + IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, curTupleOffnum)); + Size itupsz = IndexTupleSize(itup); + memcpy(so->skipData->curPos.skipTuple, itup, itupsz); + + return (IndexTuple) so->skipData->curPos.skipTuple; +} + +static void +_bt_determine_next_action(IndexScanDesc scan, BTSkipCompareResult *cmp, OffsetNumber firstOffnum, OffsetNumber lastOffnum, ScanDirection postfixDir, BTSkipState *nextAction) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + if (cmp->fullKeySkip) + *nextAction = SkipStateStop; + else if (ScanDirectionIsForward(postfixDir)) + { + OffsetNumber firstItem = firstOffnum, lastItem = lastOffnum; + if (cmp->prefixSkip) + { + *nextAction = SkipStateSkip; + } + else + { + IndexTuple toCmp; + if (so->currPos.lastItem >= so->currPos.firstItem) + toCmp = _bt_get_tuple_from_offset_with_copy(so, so->currPos.items[so->currPos.lastItem].indexOffset); + else + toCmp = _bt_get_tuple_from_offset_with_copy(so, firstItem); + _bt_update_scankey_with_tuple(&so->skipData->currentTupleKey, + scan->indexRelation, toCmp, RelationGetNumberOfAttributes(scan->indexRelation)); + if (_bt_has_extra_quals_after_skip(so->skipData, postfixDir, so->skipData->prefix) && !cmp->equal && + (cmp->prefixCmpResult != 0 || + _bt_compare_until(scan->indexRelation, &so->skipData->currentTupleKey, + _bt_get_tuple_from_offset(so, lastItem), so->skipData->prefix) != 0)) + *nextAction = SkipStateSkipExtra; + else + *nextAction = SkipStateNext; + } + } + else + { + OffsetNumber firstItem = lastOffnum, lastItem = firstOffnum; + if (cmp->prefixSkip) + { + *nextAction = SkipStateSkip; + } + else + { + IndexTuple toCmp; + if (so->currPos.lastItem >= so->currPos.firstItem) + toCmp = _bt_get_tuple_from_offset_with_copy(so, so->currPos.items[so->currPos.firstItem].indexOffset); + else + toCmp = _bt_get_tuple_from_offset_with_copy(so, lastItem); + _bt_update_scankey_with_tuple(&so->skipData->currentTupleKey, + scan->indexRelation, toCmp, RelationGetNumberOfAttributes(scan->indexRelation)); + if (_bt_has_extra_quals_after_skip(so->skipData, postfixDir, so->skipData->prefix) && !cmp->equal && + (cmp->prefixCmpResult != 0 || + _bt_compare_until(scan->indexRelation, &so->skipData->currentTupleKey, + _bt_get_tuple_from_offset(so, firstItem), so->skipData->prefix) != 0)) + *nextAction = SkipStateSkipExtra; + else + *nextAction = SkipStateNext; + } + } +} + +static inline bool +_bt_should_prefix_skip(BTSkipCompareResult *cmp) +{ + return cmp->prefixSkip || cmp->prefixCmpResult != 0; +} + +static inline void +_bt_determine_next_action_after_skip(BTScanOpaque so, BTSkipCompareResult *cmp, ScanDirection prefixDir, + ScanDirection postfixDir, int skipped, BTSkipState *nextAction) +{ + if (!_bt_skip_is_always_valid(so) || cmp->fullKeySkip) + *nextAction = SkipStateStop; + else if (cmp->equal && _bt_skip_is_regular_mode(prefixDir, postfixDir)) + *nextAction = SkipStateNext; + else if (_bt_should_prefix_skip(cmp) && _bt_skip_is_regular_mode(prefixDir, postfixDir) && + ((ScanDirectionIsForward(prefixDir) && cmp->skCmpResult == -1) || + (ScanDirectionIsBackward(prefixDir) && cmp->skCmpResult == 1))) + *nextAction = SkipStateSkip; + else if (!_bt_skip_is_regular_mode(prefixDir, postfixDir) || + _bt_has_extra_quals_after_skip(so->skipData, postfixDir, skipped) || + cmp->prefixCmpResult != 0) + *nextAction = SkipStateSkipExtra; + else + *nextAction = SkipStateNext; +} + +static inline void +_bt_determine_next_action_after_skip_extra(BTScanOpaque so, BTSkipCompareResult *cmp, BTSkipState *nextAction) +{ + if (!_bt_skip_is_always_valid(so) || cmp->fullKeySkip) + *nextAction = SkipStateStop; + else if (cmp->equal) + *nextAction = SkipStateNext; + else if (_bt_should_prefix_skip(cmp)) + *nextAction = SkipStateSkip; + else + *nextAction = SkipStateNext; +} + +/* just a debug function that prints a scankey. will be removed for final patch */ +static inline void +_print_skey(IndexScanDesc scan, BTScanInsert scanKey) +{ + Oid typOutput; + bool varlenatype; + char *val; + int i; + Relation rel = scan->indexRelation; + + for (i = 0; i < scanKey->keysz; i++) + { + ScanKey cur = &scanKey->scankeys[i]; + if (!IsCatalogRelation(rel)) + { + if (!(cur->sk_flags & SK_ISNULL)) + { + if (cur->sk_subtype != InvalidOid) + getTypeOutputInfo(cur->sk_subtype, + &typOutput, &varlenatype); + else + getTypeOutputInfo(rel->rd_opcintype[i], + &typOutput, &varlenatype); + val = OidOutputFunctionCall(typOutput, cur->sk_argument); + if (val) + { + elog(DEBUG1, "%s sk attr %d val: %s (%s, %s)", + RelationGetRelationName(rel), i, val, + (cur->sk_flags & SK_BT_NULLS_FIRST) != 0 ? "NULLS FIRST" : "NULLS LAST", + (cur->sk_flags & SK_BT_DESC) != 0 ? "DESC" : "ASC"); + pfree(val); + } + } + else + { + elog(DEBUG1, "%s sk attr %d val: NULL (%s, %s)", + RelationGetRelationName(rel), i, + (cur->sk_flags & SK_BT_NULLS_FIRST) != 0 ? "NULLS FIRST" : "NULLS LAST", + (cur->sk_flags & SK_BT_DESC) != 0 ? "DESC" : "ASC"); + } + } + } +} + +bool +_bt_checkkeys_skip(IndexScanDesc scan, IndexTuple tuple, int tupnatts, + ScanDirection dir, bool *continuescan, int *prefixskipindex) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTSkip skip = so->skipData; + + bool match = _bt_checkkeys(scan, tuple, tupnatts, dir, continuescan, prefixskipindex); + int prefixCmpResult = _bt_compare_until(scan->indexRelation, &skip->curPos.skipScanKey, tuple, skip->prefix); + if (*prefixskipindex == -1 && prefixCmpResult != 0) + { + *prefixskipindex = skip->prefix; + return false; + } + else + { + bool newcont; + _bt_checkkeys_threeway(scan, tuple, tupnatts, dir, &newcont, prefixskipindex); + if (*prefixskipindex == -1 && prefixCmpResult != 0) + { + *prefixskipindex = skip->prefix; + return false; + } + } + return match; +} + +/* + * Compare a scankey with a given tuple but only the first prefix columns + * This function returns 0 if the first 'prefix' columns are equal + * -1 if key < itup for the first prefix columns + * 1 if key > itup for the first prefix columns + */ +int32 +_bt_compare_until(Relation rel, + BTScanInsert key, + IndexTuple itup, + int prefix) +{ + TupleDesc itupdesc = RelationGetDescr(rel); + ScanKey scankey; + int ncmpkey; + + Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel)); + + ncmpkey = Min(prefix, key->keysz); + scankey = key->scankeys; + for (int i = 1; i <= ncmpkey; i++) + { + Datum datum; + bool isNull; + int32 result; + + datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull); + + /* see comments about NULLs handling in btbuild */ + if (scankey->sk_flags & SK_ISNULL) /* key is NULL */ + { + if (isNull) + result = 0; /* NULL "=" NULL */ + else if (scankey->sk_flags & SK_BT_NULLS_FIRST) + result = -1; /* NULL "<" NOT_NULL */ + else + result = 1; /* NULL ">" NOT_NULL */ + } + else if (isNull) /* key is NOT_NULL and item is NULL */ + { + if (scankey->sk_flags & SK_BT_NULLS_FIRST) + result = 1; /* NOT_NULL ">" NULL */ + else + result = -1; /* NOT_NULL "<" NULL */ + } + else + { + /* + * The sk_func needs to be passed the index value as left arg and + * the sk_argument as right arg (they might be of different + * types). Since it is convenient for callers to think of + * _bt_compare as comparing the scankey to the index item, we have + * to flip the sign of the comparison result. (Unless it's a DESC + * column, in which case we *don't* flip the sign.) + */ + result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func, + scankey->sk_collation, + datum, + scankey->sk_argument)); + + if (!(scankey->sk_flags & SK_BT_DESC)) + INVERT_COMPARE_RESULT(result); + } + + /* if the keys are unequal, return the difference */ + if (result != 0) + return result; + + scankey++; + } + return 0; +} + + +/* + * Create initial scankeys for skipping and stores them in the skipData + * structure + */ +void +_bt_skip_create_scankeys(Relation rel, BTScanOpaque so) +{ + int keysCount; + BTSkip skip = so->skipData; + StrategyNumber stratTotal; + ScanKey keyPointers[INDEX_MAX_KEYS]; + bool goback; + /* we need to create both forward and backward keys because the scan direction + * may change at any moment in scans with a cursor. + * we could technically delay creation of the second until first use as an optimization + * but that is not implemented yet. + */ + keysCount = _bt_choose_scan_keys(so->keyData, so->numberOfKeys, ForwardScanDirection, + keyPointers, skip->fwdNotNullKeys, &stratTotal, skip->prefix); + _bt_create_insertion_scan_key(rel, ForwardScanDirection, keyPointers, keysCount, + &skip->fwdScanKey, &stratTotal, &goback); + + keysCount = _bt_choose_scan_keys(so->keyData, so->numberOfKeys, BackwardScanDirection, + keyPointers, skip->bwdNotNullKeys, &stratTotal, skip->prefix); + _bt_create_insertion_scan_key(rel, BackwardScanDirection, keyPointers, keysCount, + &skip->bwdScanKey, &stratTotal, &goback); + + _bt_metaversion(rel, &skip->curPos.skipScanKey.heapkeyspace, + &skip->curPos.skipScanKey.allequalimage); + skip->curPos.skipScanKey.anynullkeys = false; /* unused */ + skip->curPos.skipScanKey.nextkey = false; + skip->curPos.skipScanKey.pivotsearch = false; + skip->curPos.skipScanKey.scantid = NULL; + skip->curPos.skipScanKey.keysz = 0; + + /* setup scankey for the current tuple as well. it's not necessarily that + * we will use the data from the current tuple already, + * but we need the rest of the data structure to be set up correctly + * for when we use it to create skip->curPos.skipScanKey keys later + */ + _bt_mkscankey(rel, NULL, &skip->currentTupleKey); +} + +/* + * _bt_scankey_within_page() -- check if the provided scankey could be found + * within a page, specified by the buffer. + */ +static inline bool +_bt_scankey_within_page(IndexScanDesc scan, BTScanInsert key, + Buffer buf) +{ + /* @todo: optimization is still possible here to + * only check either the low or the high, depending on + * which direction *we came from* AND which direction + * *we are planning to scan* + */ + OffsetNumber low, high; + Page page = BufferGetPage(buf); + BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page); + int ans_lo, ans_hi; + + low = P_FIRSTDATAKEY(opaque); + high = PageGetMaxOffsetNumber(page); + + if (unlikely(high < low)) + return false; + + ans_lo = _bt_compare(scan->indexRelation, + key, page, low); + ans_hi = _bt_compare(scan->indexRelation, + key, page, high); + if (key->nextkey) + { + /* sk < last && sk >= first */ + return ans_lo >= 0 && ans_hi == -1; + } + else + { + /* sk <= last && sk > first */ + return ans_lo == 1 && ans_hi <= 0; + } +} + +/* in: pinned and locked, out: pinned and locked (unless end of scan) */ +static void +_bt_skip_find(IndexScanDesc scan, IndexTuple *curTuple, OffsetNumber *curTupleOffnum, + BTScanInsert scanKey, ScanDirection dir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + OffsetNumber offnum; + BTStack stack; + Buffer buf; + bool goback; + Page page; + BTPageOpaque opaque; + OffsetNumber minoff; + Relation rel = scan->indexRelation; + bool fromroot = true; + + _bt_set_bsearch_flags(scanKey->scankeys[scanKey->keysz - 1].sk_strategy, dir, &scanKey->nextkey, &goback); + + if ((DEBUG2 >= log_min_messages || DEBUG2 >= client_min_messages) && !IsCatalogRelation(rel)) + { + if (*curTuple != NULL) + print_itup(BufferGetBlockNumber(so->currPos.buf), *curTuple, NULL, rel, + "before btree search"); + + elog(DEBUG1, "%s searching tree with %d keys, nextkey=%d, goback=%d", + RelationGetRelationName(rel), scanKey->keysz, scanKey->nextkey, + goback); + + _print_skey(scan, scanKey); + } + + if (*curTupleOffnum == InvalidOffsetNumber) + { + BTScanPosUnpinIfPinned(so->currPos); + } + else + { + if (_bt_scankey_within_page(scan, scanKey, so->currPos.buf)) + { + elog(DEBUG1, "sk found within current page"); + + offnum = _bt_binsrch(scan->indexRelation, scanKey, so->currPos.buf); + fromroot = false; + } + else + { + _bt_unlockbuf(rel, so->currPos.buf); + ReleaseBuffer(so->currPos.buf); + so->currPos.buf = InvalidBuffer; + } + } + + /* + * We haven't found scan key within the current page, so let's scan from + * the root. Use _bt_search and _bt_binsrch to get the buffer and offset + * number + */ + if (fromroot) + { + stack = _bt_search(scan->indexRelation, scanKey, + &buf, BT_READ, scan->xs_snapshot); + _bt_freestack(stack); + so->currPos.buf = buf; + + offnum = _bt_binsrch(scan->indexRelation, scanKey, buf); + + /* Lock the page for SERIALIZABLE transactions */ + PredicateLockPage(scan->indexRelation, BufferGetBlockNumber(so->currPos.buf), + scan->xs_snapshot); + } + + page = BufferGetPage(so->currPos.buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + + if (goback) + { + offnum = OffsetNumberPrev(offnum); + minoff = P_FIRSTDATAKEY(opaque); + if (offnum < minoff) + { + _bt_unlockbuf(rel, so->currPos.buf); + if (!_bt_step_back_page(scan, curTuple, curTupleOffnum)) + return; + page = BufferGetPage(so->currPos.buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + offnum = PageGetMaxOffsetNumber(page); + } + } + else if (offnum > PageGetMaxOffsetNumber(page)) + { + BlockNumber next = opaque->btpo_next; + _bt_unlockbuf(rel, so->currPos.buf); + if (!_bt_step_forward_page(scan, next, curTuple, curTupleOffnum)) + return; + page = BufferGetPage(so->currPos.buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + offnum = P_FIRSTDATAKEY(opaque); + } + + /* We know in which direction to look */ + _bt_initialize_more_data(so, dir); + + *curTupleOffnum = offnum; + *curTuple = _bt_get_tuple_from_offset(so, offnum); + so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); + + if (DEBUG2 >= log_min_messages || DEBUG2 >= client_min_messages) + print_itup(BufferGetBlockNumber(so->currPos.buf), *curTuple, NULL, rel, + "after btree search"); +} + +static inline bool +_bt_step_one_page(IndexScanDesc scan, ScanDirection dir, IndexTuple *curTuple, + OffsetNumber *curTupleOffnum) +{ + if (ScanDirectionIsForward(dir)) + { + BTScanOpaque so = (BTScanOpaque) scan->opaque; + return _bt_step_forward_page(scan, so->currPos.nextPage, curTuple, curTupleOffnum); + } + else + { + return _bt_step_back_page(scan, curTuple, curTupleOffnum); + } +} + +/* in: possibly pinned, but unlocked, out: pinned and locked */ +bool +_bt_step_forward_page(IndexScanDesc scan, BlockNumber next, IndexTuple *curTuple, + OffsetNumber *curTupleOffnum) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Relation rel = scan->indexRelation; + BlockNumber blkno = next; + Page page; + BTPageOpaque opaque; + + Assert(BTScanPosIsValid(so->currPos)); + + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _bt_killitems(scan); + + /* + * Before we modify currPos, make a copy of the page data if there was a + * mark position that needs it. + */ + if (so->markItemIndex >= 0) + { + /* bump pin on current buffer for assignment to mark buffer */ + if (BTScanPosIsPinned(so->currPos)) + IncrBufferRefCount(so->currPos.buf); + memcpy(&so->markPos, &so->currPos, + offsetof(BTScanPosData, items[1]) + + so->currPos.lastItem * sizeof(BTScanPosItem)); + if (so->markTuples) + memcpy(so->markTuples, so->currTuples, + so->currPos.nextTupleOffset); + so->markPos.itemIndex = so->markItemIndex; + if (so->skipData) + memcpy(&so->skipData->markPos, &so->skipData->curPos, + sizeof(BTSkipPosData)); + so->markItemIndex = -1; + } + + /* Remember we left a page with data */ + so->currPos.moreLeft = true; + + /* release the previous buffer, if pinned */ + BTScanPosUnpinIfPinned(so->currPos); + + { + for (;;) + { + /* + * if we're at end of scan, give up and mark parallel scan as + * done, so that all the workers can finish their scan + */ + if (blkno == P_NONE) + { + _bt_parallel_done(scan); + BTScanPosInvalidate(so->currPos); + return false; + } + + /* check for interrupts while we're not holding any buffer lock */ + CHECK_FOR_INTERRUPTS(); + /* step right one page */ + so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(so->currPos.buf); + TestForOldSnapshot(scan->xs_snapshot, rel, page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + /* check for deleted page */ + if (!P_IGNORE(opaque)) + { + PredicateLockPage(rel, blkno, scan->xs_snapshot); + *curTupleOffnum = P_FIRSTDATAKEY(opaque); + *curTuple = _bt_get_tuple_from_offset(so, *curTupleOffnum); + break; + } + + blkno = opaque->btpo_next; + _bt_relbuf(rel, so->currPos.buf); + } + } + + return true; +} + +/* in: possibly pinned, but unlocked, out: pinned and locked */ +bool +_bt_step_back_page(IndexScanDesc scan, IndexTuple *curTuple, OffsetNumber *curTupleOffnum) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + Assert(BTScanPosIsValid(so->currPos)); + + /* Before leaving current page, deal with any killed items */ + if (so->numKilled > 0) + _bt_killitems(scan); + + /* + * Before we modify currPos, make a copy of the page data if there was a + * mark position that needs it. + */ + if (so->markItemIndex >= 0) + { + /* bump pin on current buffer for assignment to mark buffer */ + if (BTScanPosIsPinned(so->currPos)) + IncrBufferRefCount(so->currPos.buf); + memcpy(&so->markPos, &so->currPos, + offsetof(BTScanPosData, items[1]) + + so->currPos.lastItem * sizeof(BTScanPosItem)); + if (so->markTuples) + memcpy(so->markTuples, so->currTuples, + so->currPos.nextTupleOffset); + if (so->skipData) + memcpy(&so->skipData->markPos, &so->skipData->curPos, + sizeof(BTSkipPosData)); + so->markPos.itemIndex = so->markItemIndex; + so->markItemIndex = -1; + } + + /* Remember we left a page with data */ + so->currPos.moreRight = true; + + /* Not parallel, so just use our own notion of the current page */ + + { + Relation rel; + Page page; + BTPageOpaque opaque; + + rel = scan->indexRelation; + + if (BTScanPosIsPinned(so->currPos)) + _bt_lockbuf(rel, so->currPos.buf, BT_READ); + else + so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ); + + for (;;) + { + /* Step to next physical page */ + so->currPos.buf = _bt_walk_left(rel, so->currPos.buf, + scan->xs_snapshot); + + /* if we're physically at end of index, return failure */ + if (so->currPos.buf == InvalidBuffer) + { + BTScanPosInvalidate(so->currPos); + return false; + } + + /* + * Okay, we managed to move left to a non-deleted page. Done if + * it's not half-dead and contains matching tuples. Else loop back + * and do it all again. + */ + page = BufferGetPage(so->currPos.buf); + TestForOldSnapshot(scan->xs_snapshot, rel, page); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (!P_IGNORE(opaque)) + { + PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot); + *curTupleOffnum = PageGetMaxOffsetNumber(page); + *curTuple = _bt_get_tuple_from_offset(so, *curTupleOffnum); + break; + } + } + } + + return true; +} + +/* holds lock as long as curTupleOffnum != InvalidOffsetNumber */ +bool +_bt_skip_find_next(IndexScanDesc scan, IndexTuple curTuple, OffsetNumber curTupleOffnum, + ScanDirection prefixDir, ScanDirection postfixDir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTSkip skip = so->skipData; + BTSkipCompareResult cmp; + + while (_bt_skip_is_valid(so, prefixDir, postfixDir)) + { + bool found; + _bt_skip_until_match(scan, &curTuple, &curTupleOffnum, prefixDir, postfixDir); + + while (_bt_skip_is_always_valid(so)) + { + OffsetNumber first = curTupleOffnum; + found = _bt_readpage(scan, postfixDir, &curTupleOffnum, + _bt_skip_is_regular_mode(prefixDir, postfixDir)); + if (DEBUG2 >= log_min_messages || DEBUG2 >= client_min_messages) + { + print_itup(BufferGetBlockNumber(so->currPos.buf), + _bt_get_tuple_from_offset(so, first), NULL, scan->indexRelation, + "first item on page compared"); + print_itup(BufferGetBlockNumber(so->currPos.buf), + _bt_get_tuple_from_offset(so, curTupleOffnum), NULL, scan->indexRelation, + "last item on page compared"); + } + _bt_compare_current_item(scan, _bt_get_tuple_from_offset(so, curTupleOffnum), + IndexRelationGetNumberOfAttributes(scan->indexRelation), + postfixDir, _bt_skip_is_regular_mode(prefixDir, postfixDir), &cmp); + _bt_determine_next_action(scan, &cmp, first, curTupleOffnum, + postfixDir, &skip->curPos.nextAction); + skip->curPos.nextDirection = prefixDir; + skip->curPos.nextSkipIndex = cmp.prefixSkipIndex; + + if (found) + { + _bt_skip_update_scankey_after_read(scan, _bt_get_tuple_from_offset(so, curTupleOffnum), + prefixDir, postfixDir); + return true; + } + else if (skip->curPos.nextAction == SkipStateNext) + { + if (curTupleOffnum != InvalidOffsetNumber) + _bt_unlockbuf(scan->indexRelation, so->currPos.buf); + if (!_bt_step_one_page(scan, postfixDir, &curTuple, &curTupleOffnum)) + return false; + } + else if (skip->curPos.nextAction == SkipStateSkip || skip->curPos.nextAction == SkipStateSkipExtra) + { + curTuple = _bt_get_tuple_from_offset(so, curTupleOffnum); + _bt_skip_update_scankey_after_read(scan, curTuple, prefixDir, postfixDir); + _bt_unlockbuf(scan->indexRelation, so->currPos.buf); + curTupleOffnum = InvalidOffsetNumber; + curTuple = NULL; + break; + } + else if (skip->curPos.nextAction == SkipStateStop) + { + _bt_unlockbuf(scan->indexRelation, so->currPos.buf); + BTScanPosUnpinIfPinned(so->currPos); + BTScanPosInvalidate(so->currPos); + return false; + } + else + { + Assert(false); + } + } + } + return false; +} + +void +_bt_skip_until_match(IndexScanDesc scan, IndexTuple *curTuple, OffsetNumber *curTupleOffnum, + ScanDirection prefixDir, ScanDirection postfixDir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTSkip skip = so->skipData; + while (_bt_skip_is_valid(so, prefixDir, postfixDir) && + (skip->curPos.nextAction == SkipStateSkip || skip->curPos.nextAction == SkipStateSkipExtra)) + { + _bt_skip_once(scan, curTuple, curTupleOffnum, + skip->curPos.nextAction == SkipStateSkip, prefixDir, postfixDir); + } +} + +void +_bt_compare_current_item(IndexScanDesc scan, IndexTuple tuple, int tupnatts, ScanDirection dir, + bool isRegularMode, BTSkipCompareResult* cmp) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTSkip skip = so->skipData; + + if (_bt_skip_is_always_valid(so)) + { + bool continuescan = true; + + cmp->equal = _bt_checkkeys(scan, tuple, tupnatts, dir, &continuescan, &cmp->prefixSkipIndex); + cmp->fullKeySkip = !continuescan; + /* prefix can be smaller than scankey due to extra quals being added + * therefore we need to compare both. @todo this can be optimized into one function call */ + cmp->prefixCmpResult = _bt_compare_until(scan->indexRelation, &skip->curPos.skipScanKey, tuple, skip->prefix); + cmp->skCmpResult = _bt_compare_until(scan->indexRelation, + &skip->curPos.skipScanKey, tuple, skip->curPos.skipScanKey.keysz); + if (cmp->prefixSkipIndex == -1) + { + if (isRegularMode) + { + cmp->prefixSkip = false; + cmp->prefixSkipIndex = skip->prefix; + } + else + { + cmp->prefixSkip = ScanDirectionIsForward(dir) ? cmp->prefixCmpResult < 0 : cmp->prefixCmpResult > 0; + cmp->prefixSkipIndex = skip->prefix; + } + } + else + { + int newskip = -1; + _bt_checkkeys_threeway(scan, tuple, tupnatts, dir, &continuescan, &newskip); + if (newskip != -1) + { + cmp->prefixSkip = true; + cmp->prefixSkipIndex = newskip; + } + else + { + if (isRegularMode) + { + cmp->prefixSkip = false; + cmp->prefixSkipIndex = skip->prefix; + } + else + { + cmp->prefixSkip = ScanDirectionIsForward(dir) ? cmp->prefixCmpResult < 0 : cmp->prefixCmpResult > 0; + cmp->prefixSkipIndex = skip->prefix; + } + } + } + + if (DEBUG2 >= log_min_messages || DEBUG2 >= client_min_messages) + { + print_itup(BufferGetBlockNumber(so->currPos.buf), tuple, NULL, scan->indexRelation, + "compare item"); + _print_skey(scan, &skip->curPos.skipScanKey); + elog(DEBUG1, "result: eq: %d fkskip: %d pfxskip: %d prefixcmpres: %d prefixskipidx: %d", cmp->equal, cmp->fullKeySkip, + _bt_should_prefix_skip(cmp), cmp->prefixCmpResult, cmp->prefixSkipIndex); + } + } + else + { + /* we cannot stop the scan if !isRegularMode - then we do need to skip to the next prefix */ + cmp->fullKeySkip = isRegularMode; + cmp->equal = false; + cmp->prefixCmpResult = -2; + cmp->prefixSkip = true; + cmp->prefixSkipIndex = skip->prefix; + cmp->skCmpResult = -2; + } +} + +void +_bt_skip_once(IndexScanDesc scan, IndexTuple *curTuple, OffsetNumber *curTupleOffnum, + bool forceSkip, ScanDirection prefixDir, ScanDirection postfixDir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTSkip skip = so->skipData; + BTSkipCompareResult cmp; + bool doskip = forceSkip; + int skipIndex = skip->curPos.nextSkipIndex; + skip->curPos.nextAction = SkipStateSkipExtra; + + while (doskip) + { + int toskip = skipIndex; + if (*curTuple != NULL) + { + if (skip->prefix <= skipIndex || !_bt_skip_is_regular_mode(prefixDir, postfixDir)) + { + toskip = skip->prefix; + } + + _bt_skip_update_scankey_for_prefix_skip(scan, scan->indexRelation, + toskip, *curTuple, prefixDir); + } + + if (DEBUG1 >= log_min_messages || DEBUG1 >= client_min_messages) + { + debug_print(*curTuple, &so->skipData->curPos.skipScanKey, scan->indexRelation, "skip"); + } + + _bt_skip_find(scan, curTuple, curTupleOffnum, &skip->curPos.skipScanKey, prefixDir); + + if (_bt_skip_is_always_valid(so)) + { + _bt_skip_update_scankey_for_extra_skip(scan, scan->indexRelation, + prefixDir, prefixDir, true, *curTuple); + _bt_compare_current_item(scan, *curTuple, + IndexRelationGetNumberOfAttributes(scan->indexRelation), + prefixDir, + _bt_skip_is_regular_mode(prefixDir, postfixDir), &cmp); + skipIndex = cmp.prefixSkipIndex; + _bt_determine_next_action_after_skip(so, &cmp, prefixDir, + postfixDir, toskip, &skip->curPos.nextAction); + } + else + { + skip->curPos.nextAction = SkipStateStop; + } + doskip = skip->curPos.nextAction == SkipStateSkip; + } + if (skip->curPos.nextAction != SkipStateStop && skip->curPos.nextAction != SkipStateNext) + _bt_skip_extra_conditions(scan, curTuple, curTupleOffnum, prefixDir, postfixDir, &cmp); +} + +void +_bt_skip_extra_conditions(IndexScanDesc scan, IndexTuple *curTuple, OffsetNumber *curTupleOffnum, + ScanDirection prefixDir, ScanDirection postfixDir, BTSkipCompareResult *cmp) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTSkip skip = so->skipData; + bool regularMode = _bt_skip_is_regular_mode(prefixDir, postfixDir); + if (_bt_skip_is_always_valid(so)) + { + do + { + if (*curTuple != NULL) + _bt_skip_update_scankey_for_extra_skip(scan, scan->indexRelation, + postfixDir, prefixDir, false, *curTuple); + if (DEBUG1 >= log_min_messages || DEBUG1 >= client_min_messages) + { + debug_print(*curTuple, &so->skipData->curPos.skipScanKey, scan->indexRelation, "skip-extra"); + } + _bt_skip_find(scan, curTuple, curTupleOffnum, &skip->curPos.skipScanKey, postfixDir); + _bt_compare_current_item(scan, *curTuple, + IndexRelationGetNumberOfAttributes(scan->indexRelation), + postfixDir, _bt_skip_is_regular_mode(prefixDir, postfixDir), cmp); + } while (regularMode && cmp->prefixCmpResult != 0 && !cmp->equal && !cmp->fullKeySkip); + skip->curPos.nextSkipIndex = cmp->prefixSkipIndex; + } + _bt_determine_next_action_after_skip_extra(so, cmp, &skip->curPos.nextAction); +} + +static void +_bt_skip_update_scankey_after_read(IndexScanDesc scan, IndexTuple curTuple, + ScanDirection prefixDir, ScanDirection postfixDir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTSkip skip = so->skipData; + if (skip->curPos.nextAction == SkipStateSkip) + { + int toskip = skip->curPos.nextSkipIndex; + if (skip->prefix <= skip->curPos.nextSkipIndex || + !_bt_skip_is_regular_mode(prefixDir, postfixDir)) + { + toskip = skip->prefix; + } + + if (_bt_skip_is_regular_mode(prefixDir, postfixDir)) + _bt_skip_update_scankey_for_prefix_skip(scan, scan->indexRelation, + toskip, curTuple, prefixDir); + else + _bt_skip_update_scankey_for_prefix_skip(scan, scan->indexRelation, + toskip, NULL, prefixDir); + } + else if (skip->curPos.nextAction == SkipStateSkipExtra) + { + _bt_skip_update_scankey_for_extra_skip(scan, scan->indexRelation, + postfixDir, prefixDir, false, curTuple); + } +} + +static inline int +_bt_compare_one(ScanKey scankey, Datum datum2, bool isNull2) +{ + int32 result; + Datum datum1 = scankey->sk_argument; + bool isNull1 = scankey->sk_flags & SK_ISNULL; + /* see comments about NULLs handling in btbuild */ + if (isNull1) /* key is NULL */ + { + if (isNull2) + result = 0; /* NULL "=" NULL */ + else if (scankey->sk_flags & SK_BT_NULLS_FIRST) + result = -1; /* NULL "<" NOT_NULL */ + else + result = 1; /* NULL ">" NOT_NULL */ + } + else if (isNull2) /* key is NOT_NULL and item is NULL */ + { + if (scankey->sk_flags & SK_BT_NULLS_FIRST) + result = 1; /* NOT_NULL ">" NULL */ + else + result = -1; /* NOT_NULL "<" NULL */ + } + else + { + /* + * The sk_func needs to be passed the index value as left arg and + * the sk_argument as right arg (they might be of different + * types). Since it is convenient for callers to think of + * _bt_compare as comparing the scankey to the index item, we have + * to flip the sign of the comparison result. (Unless it's a DESC + * column, in which case we *don't* flip the sign.) + */ + result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func, + scankey->sk_collation, + datum2, + datum1)); + + if (!(scankey->sk_flags & SK_BT_DESC)) + INVERT_COMPARE_RESULT(result); + } + return result; +} + +/* + * set up new values for the existing scankeys + * based on the current index tuple + */ +static inline void +_bt_update_scankey_with_tuple(BTScanInsert insertKey, Relation indexRel, IndexTuple itup, int numattrs) +{ + TupleDesc itupdesc; + int i; + ScanKey scankeys = insertKey->scankeys; + + insertKey->keysz = numattrs; + itupdesc = RelationGetDescr(indexRel); + for (i = 0; i < numattrs; i++) + { + Datum datum; + bool null; + int flags; + + datum = index_getattr(itup, i + 1, itupdesc, &null); + flags = (null ? SK_ISNULL : 0) | + (indexRel->rd_indoption[i] << SK_BT_INDOPTION_SHIFT); + scankeys[i].sk_flags = flags; + scankeys[i].sk_argument = datum; + } +} + +/* copy the elements important to a skip from one insertion sk to another */ +static inline void +_bt_copy_scankey(BTScanInsert to, BTScanInsert from, int numattrs) +{ + memcpy(to->scankeys, from->scankeys, sizeof(ScanKeyData) * (unsigned long)numattrs); + to->nextkey = from->nextkey; + to->keysz = numattrs; +} + +/* + * Updates the existing scankey for skipping to the next prefix + * alwaysUsePrefix determines how many attrs the scankey will have + * when true, it will always have skip->prefix number of attributes, + * otherwise, the value can be less, which will be determined by the comparison + * result with the current tuple. + * for example, a SELECT * FROM tbl WHERE b<2, index (a,b,c) and when skipping with prefix size=2 + * if we encounter the tuple (1,3,1) - this does not match the qual b<2. however, we also know that + * it is not useful to skip to any next qual with prefix=2 (eg. (1,4)), because that will definitely not + * match either. However, we do want to skip to eg. (2,0). Therefore, we skip over prefix=1 in this case. + * + * the provided itup may be null. this happens when we don't want to use the current tuple to update + * the scankey, but instead want to use the existing curPos.skipScanKey to fill currentTupleKey. this accounts + * for some edge cases. + */ +static void +_bt_skip_update_scankey_for_prefix_skip(IndexScanDesc scan, Relation indexRel, + int prefix, IndexTuple itup, ScanDirection prefixDir) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTSkip skip = so->skipData; + /* we use skip->prefix is alwaysUsePrefix is set or if skip->prefix is smaller than whatever the + * comparison result provided, such that we never skip more than skip->prefix + */ + int numattrs = prefix; + + if (itup != NULL) + { + Size itupsz = IndexTupleSize(itup); + memcpy(so->skipData->curPos.skipTuple, itup, itupsz); + + _bt_update_scankey_with_tuple(&skip->currentTupleKey, indexRel, (IndexTuple)so->skipData->curPos.skipTuple, numattrs); + _bt_copy_scankey(&skip->curPos.skipScanKey, &skip->currentTupleKey, numattrs); + } + else + { + skip->curPos.skipScanKey.keysz = numattrs; + _bt_copy_scankey(&skip->currentTupleKey, &skip->curPos.skipScanKey, numattrs); + } + /* update strategy for last attribute as we will use this to determine the rest of the + * rest of the flags (goback) when doing the actual tree search + */ + skip->currentTupleKey.scankeys[numattrs - 1].sk_strategy = + skip->curPos.skipScanKey.scankeys[numattrs - 1].sk_strategy = + ScanDirectionIsForward(prefixDir) ? BTGreaterStrategyNumber : BTLessStrategyNumber; +} + +/* update the scankey for skipping the 'extra' conditions, opportunities + * that arise when we have just skipped to a new prefix and can try to skip + * within the prefix to the right tuple by using extra quals when available + * + * @todo as an optimization it should be possible to optimize calls to this function + * and to _bt_skip_update_scankey_for_prefix_skip to some more specific functions that + * will need to do less copying of data. + */ +void +_bt_skip_update_scankey_for_extra_skip(IndexScanDesc scan, Relation indexRel, ScanDirection curDir, + ScanDirection prefixDir, bool prioritizeEqual, IndexTuple itup) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTSkip skip = so->skipData; + BTScanInsert toCopy; + int i, left, lastNonTuple = skip->prefix; + + /* first make sure that currentTupleKey is correct at all times */ + _bt_skip_update_scankey_for_prefix_skip(scan, indexRel, skip->prefix, itup, prefixDir); + /* then do the actual work to setup curPos.skipScanKey - distinguish between work that depends on overallDir + * (those attributes between attribute number 1 and 'prefix' inclusive) + * and work that depends on curDir + * (those attributes between attribute number 'prefix' + 1 and fwdScanKey.keysz inclusive) + */ + if (ScanDirectionIsForward(prefixDir)) + { + /* + * if overallDir is Forward, we need to choose between fwdScanKey or + * currentTupleKey. we need to choose the most restrictive one - + * in most cases this means choosing eg. a>5 over a=2 when scanning forward, + * unless prioritizeEqual is set. this is done for certain special cases + */ + for (i = 0; i < skip->prefix; i++) + { + ScanKey scankey = &skip->fwdScanKey.scankeys[i]; + ScanKey scankeyItem = &skip->currentTupleKey.scankeys[i]; + if (scankey->sk_attno != 0 && (_bt_compare_one(scankey, scankeyItem->sk_argument, scankeyItem->sk_flags & SK_ISNULL) > 0 + || (prioritizeEqual && scankey->sk_strategy == BTEqualStrategyNumber))) + { + memcpy(skip->curPos.skipScanKey.scankeys + i, scankey, sizeof(ScanKeyData)); + lastNonTuple = i; + } + else + { + if (lastNonTuple < i) + break; + memcpy(skip->curPos.skipScanKey.scankeys + i, scankeyItem, sizeof(ScanKeyData)); + } + /* for now choose equal here - it could actually be improved a bit @todo by choosing the strategy + * from the scankeys, but it doesn't matter a lot + */ + skip->curPos.skipScanKey.scankeys[i].sk_strategy = BTEqualStrategyNumber; + } + } + else + { + /* similar for backward but in opposite direction */ + for (i = 0; i < skip->prefix; i++) + { + ScanKey scankey = &skip->bwdScanKey.scankeys[i]; + ScanKey scankeyItem = &skip->currentTupleKey.scankeys[i]; + if (scankey->sk_attno != 0 && (_bt_compare_one(scankey, scankeyItem->sk_argument, scankeyItem->sk_flags & SK_ISNULL) < 0 + || (prioritizeEqual && scankey->sk_strategy == BTEqualStrategyNumber))) + { + memcpy(skip->curPos.skipScanKey.scankeys + i, scankey, sizeof(ScanKeyData)); + lastNonTuple = i; + } + else + { + if (lastNonTuple < i) + break; + memcpy(skip->curPos.skipScanKey.scankeys + i, scankeyItem, sizeof(ScanKeyData)); + } + skip->curPos.skipScanKey.scankeys[i].sk_strategy = BTEqualStrategyNumber; + } + } + + /* + * the remaining keys are the quals after the prefix + */ + if (ScanDirectionIsForward(curDir)) + toCopy = &skip->fwdScanKey; + else + toCopy = &skip->bwdScanKey; + + if (lastNonTuple >= skip->prefix - 1) + { + left = toCopy->keysz - skip->prefix; + if (left > 0) + { + memcpy(skip->curPos.skipScanKey.scankeys + skip->prefix, toCopy->scankeys + i, sizeof(ScanKeyData) * (unsigned long)left); + } + skip->curPos.skipScanKey.keysz = toCopy->keysz; + } + else + { + skip->curPos.skipScanKey.keysz = lastNonTuple + 1; + } +} diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 54c8eb1289..03005f89c9 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -560,7 +560,7 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) wstate.heap = btspool->heap; wstate.index = btspool->index; - wstate.inskey = _bt_mkscankey(wstate.index, NULL); + wstate.inskey = _bt_mkscankey(wstate.index, NULL, NULL); /* _bt_mkscankey() won't set allequalimage without metapage */ wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true); wstate.btws_use_wal = RelationNeedsWAL(wstate.index); diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index c72b4566de..40da00f72d 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -49,10 +49,10 @@ static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, ScanKey leftarg, ScanKey rightarg, bool *result); static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption); -static void _bt_mark_scankey_required(ScanKey skey); +static void _bt_mark_scankey_required(ScanKey skey, int forwardReqFlag, int backwardReqFlag); static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, TupleDesc tupdesc, - ScanDirection dir, bool *continuescan); + ScanDirection dir, bool *continuescan, int *prefixskipindex); static int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key); @@ -87,9 +87,8 @@ static int _bt_keep_natts(Relation rel, IndexTuple lastleft, * field themselves. */ BTScanInsert -_bt_mkscankey(Relation rel, IndexTuple itup) +_bt_mkscankey(Relation rel, IndexTuple itup, BTScanInsert key) { - BTScanInsert key; ScanKey skey; TupleDesc itupdesc; int indnkeyatts; @@ -109,8 +108,10 @@ _bt_mkscankey(Relation rel, IndexTuple itup) * Truncated attributes and non-key attributes are omitted from the final * scan key. */ - key = palloc(offsetof(BTScanInsertData, scankeys) + - sizeof(ScanKeyData) * indnkeyatts); + if (key == NULL) + key = palloc(offsetof(BTScanInsertData, scankeys) + + sizeof(ScanKeyData) * indnkeyatts); + if (itup) _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage); else @@ -155,7 +156,7 @@ _bt_mkscankey(Relation rel, IndexTuple itup) ScanKeyEntryInitializeWithInfo(&skey[i], flags, (AttrNumber) (i + 1), - InvalidStrategy, + BTEqualStrategyNumber, InvalidOid, rel->rd_indcollation[i], procinfo, @@ -745,7 +746,7 @@ _bt_preprocess_keys(IndexScanDesc scan) int numberOfKeys = scan->numberOfKeys; int16 *indoption = scan->indexRelation->rd_indoption; int new_numberOfKeys; - int numberOfEqualCols; + int numberOfEqualCols, numberOfEqualColsSincePrefix; ScanKey inkeys; ScanKey outkeys; ScanKey cur; @@ -754,6 +755,7 @@ _bt_preprocess_keys(IndexScanDesc scan) int i, j; AttrNumber attno; + int prefix = 0; /* initialize result variables */ so->qual_ok = true; @@ -762,6 +764,11 @@ _bt_preprocess_keys(IndexScanDesc scan) if (numberOfKeys < 1) return; /* done if qual-less scan */ + if (_bt_skip_enabled(so)) + { + prefix = so->skipData->prefix; + } + /* * Read so->arrayKeyData if array keys are present, else scan->keyData */ @@ -786,7 +793,9 @@ _bt_preprocess_keys(IndexScanDesc scan) so->numberOfKeys = 1; /* We can mark the qual as required if it's for first index col */ if (cur->sk_attno == 1) - _bt_mark_scankey_required(outkeys); + _bt_mark_scankey_required(outkeys, SK_BT_REQFWD, SK_BT_REQBKWD); + if (cur->sk_attno <= prefix + 1) + _bt_mark_scankey_required(outkeys, SK_BT_REQSKIPFWD, SK_BT_REQSKIPBKWD); return; } @@ -795,6 +804,8 @@ _bt_preprocess_keys(IndexScanDesc scan) */ new_numberOfKeys = 0; numberOfEqualCols = 0; + numberOfEqualColsSincePrefix = 0; + /* * Initialize for processing of keys for attr 1. @@ -830,6 +841,8 @@ _bt_preprocess_keys(IndexScanDesc scan) if (i == numberOfKeys || cur->sk_attno != attno) { int priorNumberOfEqualCols = numberOfEqualCols; + int priorNumberOfEqualColsSincePrefix = numberOfEqualColsSincePrefix; + /* check input keys are correctly ordered */ if (i < numberOfKeys && cur->sk_attno < attno) @@ -880,6 +893,8 @@ _bt_preprocess_keys(IndexScanDesc scan) } /* track number of attrs for which we have "=" keys */ numberOfEqualCols++; + if (attno > prefix) + numberOfEqualColsSincePrefix++; } /* try to keep only one of <, <= */ @@ -929,7 +944,9 @@ _bt_preprocess_keys(IndexScanDesc scan) memcpy(outkey, xform[j], sizeof(ScanKeyData)); if (priorNumberOfEqualCols == attno - 1) - _bt_mark_scankey_required(outkey); + _bt_mark_scankey_required(outkey, SK_BT_REQFWD, SK_BT_REQBKWD); + if (attno <= prefix || priorNumberOfEqualColsSincePrefix == attno - prefix - 1) + _bt_mark_scankey_required(outkey, SK_BT_REQSKIPFWD, SK_BT_REQSKIPBKWD); } } @@ -954,7 +971,9 @@ _bt_preprocess_keys(IndexScanDesc scan) memcpy(outkey, cur, sizeof(ScanKeyData)); if (numberOfEqualCols == attno - 1) - _bt_mark_scankey_required(outkey); + _bt_mark_scankey_required(outkey, SK_BT_REQFWD, SK_BT_REQBKWD); + if (attno <= prefix || numberOfEqualColsSincePrefix == attno - prefix - 1) + _bt_mark_scankey_required(outkey, SK_BT_REQSKIPFWD, SK_BT_REQSKIPBKWD); /* * We don't support RowCompare using equality; such a qual would @@ -997,7 +1016,9 @@ _bt_preprocess_keys(IndexScanDesc scan) memcpy(outkey, cur, sizeof(ScanKeyData)); if (numberOfEqualCols == attno - 1) - _bt_mark_scankey_required(outkey); + _bt_mark_scankey_required(outkey, SK_BT_REQFWD, SK_BT_REQBKWD); + if (attno <= prefix || numberOfEqualColsSincePrefix == attno - prefix - 1) + _bt_mark_scankey_required(outkey, SK_BT_REQSKIPFWD, SK_BT_REQSKIPBKWD); } } } @@ -1295,7 +1316,7 @@ _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption) * anyway on a rescan. Something to keep an eye on though. */ static void -_bt_mark_scankey_required(ScanKey skey) +_bt_mark_scankey_required(ScanKey skey, int forwardReqFlag, int backwardReqFlag) { int addflags; @@ -1303,14 +1324,14 @@ _bt_mark_scankey_required(ScanKey skey) { case BTLessStrategyNumber: case BTLessEqualStrategyNumber: - addflags = SK_BT_REQFWD; + addflags = forwardReqFlag; break; case BTEqualStrategyNumber: - addflags = SK_BT_REQFWD | SK_BT_REQBKWD; + addflags = forwardReqFlag | backwardReqFlag; break; case BTGreaterEqualStrategyNumber: case BTGreaterStrategyNumber: - addflags = SK_BT_REQBKWD; + addflags = backwardReqFlag; break; default: elog(ERROR, "unrecognized StrategyNumber: %d", @@ -1353,17 +1374,22 @@ _bt_mark_scankey_required(ScanKey skey) */ bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, - ScanDirection dir, bool *continuescan) + ScanDirection dir, bool *continuescan, int *prefixSkipIndex) { TupleDesc tupdesc; BTScanOpaque so; int keysz; int ikey; ScanKey key; + int pfx; + + if (prefixSkipIndex == NULL) + prefixSkipIndex = &pfx; Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts); *continuescan = true; /* default assumption */ + *prefixSkipIndex = -1; tupdesc = RelationGetDescr(scan->indexRelation); so = (BTScanOpaque) scan->opaque; @@ -1392,7 +1418,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, if (key->sk_flags & SK_ROW_HEADER) { if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir, - continuescan)) + continuescan, prefixSkipIndex)) continue; return false; } @@ -1429,6 +1455,13 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, ScanDirectionIsBackward(dir)) *continuescan = false; + if ((key->sk_flags & SK_BT_REQSKIPFWD) && + ScanDirectionIsForward(dir)) + *prefixSkipIndex = key->sk_attno - 1; + else if ((key->sk_flags & SK_BT_REQSKIPBKWD) && + ScanDirectionIsBackward(dir)) + *prefixSkipIndex = key->sk_attno - 1; + /* * In any case, this indextuple doesn't match the qual. */ @@ -1452,6 +1485,10 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && ScanDirectionIsBackward(dir)) *continuescan = false; + + if ((key->sk_flags & (SK_BT_REQSKIPFWD | SK_BT_REQSKIPBKWD)) && + ScanDirectionIsBackward(dir)) + *prefixSkipIndex = key->sk_attno - 1; } else { @@ -1468,6 +1505,9 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && ScanDirectionIsForward(dir)) *continuescan = false; + if ((key->sk_flags & (SK_BT_REQSKIPFWD | SK_BT_REQSKIPBKWD)) && + ScanDirectionIsBackward(dir)) + *prefixSkipIndex = key->sk_attno - 1; } /* @@ -1498,6 +1538,13 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, ScanDirectionIsBackward(dir)) *continuescan = false; + if ((key->sk_flags & SK_BT_REQSKIPFWD) && + ScanDirectionIsForward(dir)) + *prefixSkipIndex = key->sk_attno - 1; + else if ((key->sk_flags & SK_BT_REQSKIPBKWD) && + ScanDirectionIsBackward(dir)) + *prefixSkipIndex = key->sk_attno - 1; + /* * In any case, this indextuple doesn't match the qual. */ @@ -1509,6 +1556,228 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, return true; } +bool +_bt_checkkeys_threeway(IndexScanDesc scan, IndexTuple tuple, int tupnatts, + ScanDirection dir, bool *continuescan, int *prefixSkipIndex) +{ + TupleDesc tupdesc; + BTScanOpaque so; + int keysz; + int ikey; + ScanKey key; + int pfx; + BTScanInsert keys; + bool overallmatch = true; + + if (prefixSkipIndex == NULL) + prefixSkipIndex = &pfx; + + Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts); + + *continuescan = true; /* default assumption */ + *prefixSkipIndex = -1; + + tupdesc = RelationGetDescr(scan->indexRelation); + so = (BTScanOpaque) scan->opaque; + if (ScanDirectionIsForward(dir)) + keys = &so->skipData->bwdScanKey; + else + keys = &so->skipData->fwdScanKey; + + keysz = keys->keysz; + + for (key = keys->scankeys, ikey = 0; ikey < keysz; key++, ikey++) + { + Datum datum; + bool isNull; + int cmpresult; + + if (key->sk_attno == 0) + continue; + + if (key->sk_attno > tupnatts) + { + /* + * This attribute is truncated (must be high key). The value for + * this attribute in the first non-pivot tuple on the page to the + * right could be any possible value. Assume that truncated + * attribute passes the qual. + */ + Assert(ScanDirectionIsForward(dir)); + continue; + } + + /* row-comparison keys need special processing */ + Assert((key->sk_flags & SK_ROW_HEADER) == 0); + + datum = index_getattr(tuple, + key->sk_attno, + tupdesc, + &isNull); + + if (key->sk_flags & SK_ISNULL) + { + /* Handle IS NULL/NOT NULL tests */ + if (key->sk_flags & SK_SEARCHNULL) + { + if (isNull) + continue; /* tuple satisfies this qual */ + } + else + { + Assert(key->sk_flags & SK_SEARCHNOTNULL); + if (!isNull) + continue; /* tuple satisfies this qual */ + } + + /* + * Tuple fails this qual. If it's a required qual for the current + * scan direction, then we can conclude no further tuples will + * pass, either. + */ + if ((key->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) + { + *continuescan = false; + return false; + } + else if ((key->sk_flags & SK_BT_REQBKWD) && + ScanDirectionIsBackward(dir)) + { + *continuescan = false; + return false; + } + + if ((key->sk_flags & SK_BT_REQSKIPFWD) && + ScanDirectionIsForward(dir)) + { + *prefixSkipIndex = key->sk_attno - 1; + return false; + } + else if ((key->sk_flags & SK_BT_REQSKIPBKWD) && + ScanDirectionIsBackward(dir)) + { + *prefixSkipIndex = key->sk_attno - 1; + return false; + } + + overallmatch = false; + } + + if (isNull) + { + if (key->sk_flags & SK_BT_NULLS_FIRST) + { + /* + * Since NULLs are sorted before non-NULLs, we know we have + * reached the lower limit of the range of values for this + * index attr. On a backward scan, we can stop if this qual + * is one of the "must match" subset. We can stop regardless + * of whether the qual is > or <, so long as it's required, + * because it's not possible for any future tuples to pass. On + * a forward scan, however, we must keep going, because we may + * have initially positioned to the start of the index. + */ + if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + ScanDirectionIsBackward(dir)) + { + *continuescan = false; + return false; + } + + if ((key->sk_flags & (SK_BT_REQSKIPFWD | SK_BT_REQSKIPBKWD)) && + ScanDirectionIsBackward(dir)) + { + *prefixSkipIndex = key->sk_attno - 1; + return false; + } + } + else + { + /* + * Since NULLs are sorted after non-NULLs, we know we have + * reached the upper limit of the range of values for this + * index attr. On a forward scan, we can stop if this qual is + * one of the "must match" subset. We can stop regardless of + * whether the qual is > or <, so long as it's required, + * because it's not possible for any future tuples to pass. On + * a backward scan, however, we must keep going, because we + * may have initially positioned to the end of the index. + */ + if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + ScanDirectionIsForward(dir)) + { + *continuescan = false; + return false; + } + if ((key->sk_flags & (SK_BT_REQSKIPFWD | SK_BT_REQSKIPBKWD)) && + ScanDirectionIsBackward(dir)) + { + *prefixSkipIndex = key->sk_attno - 1; + return false; + } + } + + overallmatch = false; + } + + /* Perform the test --- three-way comparison not bool operator */ + cmpresult = DatumGetInt32(FunctionCall2Coll(&key->sk_func, + key->sk_collation, + datum, + key->sk_argument)); + if (key->sk_flags & SK_BT_DESC) + INVERT_COMPARE_RESULT(cmpresult); + + if (cmpresult != 0) + { + /* + * Tuple fails this qual. If it's a required qual for the current + * scan direction, then we can conclude no further tuples will + * pass, either. + * + * Note: because we stop the scan as soon as any required equality + * qual fails, it is critical that equality quals be used for the + * initial positioning in _bt_first() when they are available. See + * comments in _bt_first(). + */ + if ((key->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir) && cmpresult > 0) + { + *continuescan = false; + return false; + } + else if ((key->sk_flags & SK_BT_REQBKWD) && + ScanDirectionIsBackward(dir) && cmpresult < 0) + { + *continuescan = false; + return false; + } + + if ((key->sk_flags & SK_BT_REQSKIPFWD) && + ScanDirectionIsForward(dir) && cmpresult > 0) + { + *prefixSkipIndex = key->sk_attno - 1; + return false; + } + else if ((key->sk_flags & SK_BT_REQSKIPBKWD) && + ScanDirectionIsBackward(dir) && cmpresult < 0) + { + *prefixSkipIndex = key->sk_attno - 1; + return false; + } + + /* + * In any case, this indextuple doesn't match the qual. + */ + overallmatch = false; + } + } + + /* If we get here, the tuple passes all index quals. */ + return overallmatch; +} + /* * Test whether an indextuple satisfies a row-comparison scan condition. * @@ -1520,7 +1789,7 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, */ static bool _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, - TupleDesc tupdesc, ScanDirection dir, bool *continuescan) + TupleDesc tupdesc, ScanDirection dir, bool *continuescan, int *prefixSkipIndex) { ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument); int32 cmpresult = 0; @@ -1576,6 +1845,10 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && ScanDirectionIsBackward(dir)) *continuescan = false; + + if ((subkey->sk_flags & (SK_BT_REQSKIPFWD | SK_BT_REQBKWD) && + ScanDirectionIsBackward(dir))) + *prefixSkipIndex = subkey->sk_attno - 1; } else { @@ -1592,6 +1865,10 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && ScanDirectionIsForward(dir)) *continuescan = false; + + if ((subkey->sk_flags & (SK_BT_REQSKIPFWD | SK_BT_REQBKWD) && + ScanDirectionIsForward(dir))) + *prefixSkipIndex = subkey->sk_attno - 1; } /* @@ -1616,6 +1893,13 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, else if ((subkey->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)) *continuescan = false; + + if ((subkey->sk_flags & SK_BT_REQSKIPFWD) && + ScanDirectionIsForward(dir)) + *prefixSkipIndex = subkey->sk_attno - 1; + else if ((subkey->sk_flags & SK_BT_REQSKIPBKWD) && + ScanDirectionIsBackward(dir)) + *prefixSkipIndex = subkey->sk_attno - 1; return false; } @@ -1678,6 +1962,13 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, else if ((subkey->sk_flags & SK_BT_REQBKWD) && ScanDirectionIsBackward(dir)) *continuescan = false; + + if ((subkey->sk_flags & SK_BT_REQSKIPFWD) && + ScanDirectionIsForward(dir)) + *prefixSkipIndex = subkey->sk_attno - 1; + else if ((subkey->sk_flags & SK_BT_REQSKIPBKWD) && + ScanDirectionIsBackward(dir)) + *prefixSkipIndex = subkey->sk_attno - 1; } return result; @@ -2733,3 +3024,524 @@ _bt_allequalimage(Relation rel, bool debugmessage) return allequalimage; } + +void _bt_set_bsearch_flags(StrategyNumber stratTotal, ScanDirection dir, bool* nextkey, bool* goback) +{ + /*---------- + * Examine the selected initial-positioning strategy to determine exactly + * where we need to start the scan, and set flag variables to control the + * code below. + * + * If nextkey = false, _bt_search and _bt_binsrch will locate the first + * item >= scan key. If nextkey = true, they will locate the first + * item > scan key. + * + * If goback = true, we will then step back one item, while if + * goback = false, we will start the scan on the located item. + *---------- + */ + switch (stratTotal) + { + case BTLessStrategyNumber: + + /* + * Find first item >= scankey, then back up one to arrive at last + * item < scankey. (Note: this positioning strategy is only used + * for a backward scan, so that is always the correct starting + * position.) + */ + *nextkey = false; + *goback = true; + break; + + case BTLessEqualStrategyNumber: + + /* + * Find first item > scankey, then back up one to arrive at last + * item <= scankey. (Note: this positioning strategy is only used + * for a backward scan, so that is always the correct starting + * position.) + */ + *nextkey = true; + *goback = true; + break; + + case BTEqualStrategyNumber: + + /* + * If a backward scan was specified, need to start with last equal + * item not first one. + */ + if (ScanDirectionIsBackward(dir)) + { + /* + * This is the same as the <= strategy. We will check at the + * end whether the found item is actually =. + */ + *nextkey = true; + *goback = true; + } + else + { + /* + * This is the same as the >= strategy. We will check at the + * end whether the found item is actually =. + */ + *nextkey = false; + *goback = false; + } + break; + + case BTGreaterEqualStrategyNumber: + + /* + * Find first item >= scankey. (This is only used for forward + * scans.) + */ + *nextkey = false; + *goback = false; + break; + + case BTGreaterStrategyNumber: + + /* + * Find first item > scankey. (This is only used for forward + * scans.) + */ + *nextkey = true; + *goback = false; + break; + + default: + /* can't get here, but keep compiler quiet */ + elog(ERROR, "unrecognized strat_total: %d", (int) stratTotal); + } +} + +bool _bt_create_insertion_scan_key(Relation rel, ScanDirection dir, ScanKey* startKeys, int keysCount, BTScanInsert inskey, StrategyNumber* stratTotal, bool* goback) +{ + int i; + bool nextkey; + + /* + * We want to start the scan somewhere within the index. Set up an + * insertion scankey we can use to search for the boundary point we + * identified above. The insertion scankey is built using the keys + * identified by startKeys[]. (Remaining insertion scankey fields are + * initialized after initial-positioning strategy is finalized.) + */ + Assert(keysCount <= INDEX_MAX_KEYS); + for (i = 0; i < keysCount; i++) + { + ScanKey cur = startKeys[i]; + + if (cur == NULL) + { + inskey->scankeys[i].sk_attno = 0; + continue; + } + + Assert(cur->sk_attno == i + 1); + + if (cur->sk_flags & SK_ROW_HEADER) + { + /* + * Row comparison header: look to the first row member instead. + * + * The member scankeys are already in insertion format (ie, they + * have sk_func = 3-way-comparison function), but we have to watch + * out for nulls, which _bt_preprocess_keys didn't check. A null + * in the first row member makes the condition unmatchable, just + * like qual_ok = false. + */ + ScanKey subkey = (ScanKey) DatumGetPointer(cur->sk_argument); + + Assert(subkey->sk_flags & SK_ROW_MEMBER); + if (subkey->sk_flags & SK_ISNULL) + { + return false; + } + memcpy(inskey->scankeys + i, subkey, sizeof(ScanKeyData)); + + /* + * If the row comparison is the last positioning key we accepted, + * try to add additional keys from the lower-order row members. + * (If we accepted independent conditions on additional index + * columns, we use those instead --- doesn't seem worth trying to + * determine which is more restrictive.) Note that this is OK + * even if the row comparison is of ">" or "<" type, because the + * condition applied to all but the last row member is effectively + * ">=" or "<=", and so the extra keys don't break the positioning + * scheme. But, by the same token, if we aren't able to use all + * the row members, then the part of the row comparison that we + * did use has to be treated as just a ">=" or "<=" condition, and + * so we'd better adjust strat_total accordingly. + */ + if (i == keysCount - 1) + { + bool used_all_subkeys = false; + + Assert(!(subkey->sk_flags & SK_ROW_END)); + for (;;) + { + subkey++; + Assert(subkey->sk_flags & SK_ROW_MEMBER); + if (subkey->sk_attno != keysCount + 1) + break; /* out-of-sequence, can't use it */ + if (subkey->sk_strategy != cur->sk_strategy) + break; /* wrong direction, can't use it */ + if (subkey->sk_flags & SK_ISNULL) + break; /* can't use null keys */ + Assert(keysCount < INDEX_MAX_KEYS); + memcpy(inskey->scankeys + keysCount, subkey, + sizeof(ScanKeyData)); + keysCount++; + if (subkey->sk_flags & SK_ROW_END) + { + used_all_subkeys = true; + break; + } + } + if (!used_all_subkeys) + { + switch (*stratTotal) + { + case BTLessStrategyNumber: + *stratTotal = BTLessEqualStrategyNumber; + break; + case BTGreaterStrategyNumber: + *stratTotal = BTGreaterEqualStrategyNumber; + break; + } + } + break; /* done with outer loop */ + } + } + else + { + /* + * Ordinary comparison key. Transform the search-style scan key + * to an insertion scan key by replacing the sk_func with the + * appropriate btree comparison function. + * + * If scankey operator is not a cross-type comparison, we can use + * the cached comparison function; otherwise gotta look it up in + * the catalogs. (That can't lead to infinite recursion, since no + * indexscan initiated by syscache lookup will use cross-data-type + * operators.) + * + * We support the convention that sk_subtype == InvalidOid means + * the opclass input type; this is a hack to simplify life for + * ScanKeyInit(). + */ + if (cur->sk_subtype == rel->rd_opcintype[i] || + cur->sk_subtype == InvalidOid) + { + FmgrInfo *procinfo; + + procinfo = index_getprocinfo(rel, cur->sk_attno, BTORDER_PROC); + ScanKeyEntryInitializeWithInfo(inskey->scankeys + i, + cur->sk_flags, + cur->sk_attno, + cur->sk_strategy, + cur->sk_subtype, + cur->sk_collation, + procinfo, + cur->sk_argument); + } + else + { + RegProcedure cmp_proc; + + cmp_proc = get_opfamily_proc(rel->rd_opfamily[i], + rel->rd_opcintype[i], + cur->sk_subtype, + BTORDER_PROC); + if (!RegProcedureIsValid(cmp_proc)) + elog(ERROR, "missing support function %d(%u,%u) for attribute %d of index \"%s\"", + BTORDER_PROC, rel->rd_opcintype[i], cur->sk_subtype, + cur->sk_attno, RelationGetRelationName(rel)); + ScanKeyEntryInitialize(inskey->scankeys + i, + cur->sk_flags, + cur->sk_attno, + cur->sk_strategy, + cur->sk_subtype, + cur->sk_collation, + cmp_proc, + cur->sk_argument); + } + } + } + + _bt_set_bsearch_flags(*stratTotal, dir, &nextkey, goback); + + /* Initialize remaining insertion scan key fields */ + _bt_metaversion(rel, &inskey->heapkeyspace, &inskey->allequalimage); + inskey->anynullkeys = false; /* unused */ + inskey->nextkey = nextkey; + inskey->pivotsearch = false; + inskey->scantid = NULL; + inskey->keysz = keysCount; + + return true; +} + +/*---------- + * Examine the scan keys to discover where we need to start the scan. + * + * We want to identify the keys that can be used as starting boundaries; + * these are =, >, or >= keys for a forward scan or =, <, <= keys for + * a backwards scan. We can use keys for multiple attributes so long as + * the prior attributes had only =, >= (resp. =, <=) keys. Once we accept + * a > or < boundary or find an attribute with no boundary (which can be + * thought of as the same as "> -infinity"), we can't use keys for any + * attributes to its right, because it would break our simplistic notion + * of what initial positioning strategy to use. + * + * When the scan keys include cross-type operators, _bt_preprocess_keys + * may not be able to eliminate redundant keys; in such cases we will + * arbitrarily pick a usable one for each attribute. This is correct + * but possibly not optimal behavior. (For example, with keys like + * "x >= 4 AND x >= 5" we would elect to scan starting at x=4 when + * x=5 would be more efficient.) Since the situation only arises given + * a poorly-worded query plus an incomplete opfamily, live with it. + * + * When both equality and inequality keys appear for a single attribute + * (again, only possible when cross-type operators appear), we *must* + * select one of the equality keys for the starting point, because + * _bt_checkkeys() will stop the scan as soon as an equality qual fails. + * For example, if we have keys like "x >= 4 AND x = 10" and we elect to + * start at x=4, we will fail and stop before reaching x=10. If multiple + * equality quals survive preprocessing, however, it doesn't matter which + * one we use --- by definition, they are either redundant or + * contradictory. + * + * Any regular (not SK_SEARCHNULL) key implies a NOT NULL qualifier. + * If the index stores nulls at the end of the index we'll be starting + * from, and we have no boundary key for the column (which means the key + * we deduced NOT NULL from is an inequality key that constrains the other + * end of the index), then we cons up an explicit SK_SEARCHNOTNULL key to + * use as a boundary key. If we didn't do this, we might find ourselves + * traversing a lot of null entries at the start of the scan. + * + * In this loop, row-comparison keys are treated the same as keys on their + * first (leftmost) columns. We'll add on lower-order columns of the row + * comparison below, if possible. + * + * The selected scan keys (at most one per index column) are remembered by + * storing their addresses into the local startKeys[] array. + *---------- + */ +int _bt_choose_scan_keys(ScanKey scanKeys, int numberOfKeys, ScanDirection dir, ScanKey* startKeys, ScanKeyData* notnullkeys, StrategyNumber* stratTotal, int prefix) +{ + StrategyNumber strat; + int keysCount = 0; + int i; + + *stratTotal = BTEqualStrategyNumber; + if (numberOfKeys > 0 || prefix > 0) + { + AttrNumber curattr; + ScanKey chosen; + ScanKey impliesNN; + ScanKey cur; + + /* + * chosen is the so-far-chosen key for the current attribute, if any. + * We don't cast the decision in stone until we reach keys for the + * next attribute. + */ + curattr = 1; + chosen = NULL; + /* Also remember any scankey that implies a NOT NULL constraint */ + impliesNN = NULL; + + /* + * Loop iterates from 0 to numberOfKeys inclusive; we use the last + * pass to handle after-last-key processing. Actual exit from the + * loop is at one of the "break" statements below. + */ + for (cur = scanKeys, i = 0;; cur++, i++) + { + if (i >= numberOfKeys || cur->sk_attno != curattr) + { + /* + * Done looking at keys for curattr. If we didn't find a + * usable boundary key, see if we can deduce a NOT NULL key. + */ + if (chosen == NULL && impliesNN != NULL && + ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? + ScanDirectionIsForward(dir) : + ScanDirectionIsBackward(dir))) + { + /* Yes, so build the key in notnullkeys[keysCount] */ + chosen = ¬nullkeys[keysCount]; + ScanKeyEntryInitialize(chosen, + (SK_SEARCHNOTNULL | SK_ISNULL | + (impliesNN->sk_flags & + (SK_BT_DESC | SK_BT_NULLS_FIRST))), + curattr, + ((impliesNN->sk_flags & SK_BT_NULLS_FIRST) ? + BTGreaterStrategyNumber : + BTLessStrategyNumber), + InvalidOid, + InvalidOid, + InvalidOid, + (Datum) 0); + } + + /* + * If we still didn't find a usable boundary key, quit; else + * save the boundary key pointer in startKeys. + */ + if (chosen == NULL && curattr > prefix) + break; + startKeys[keysCount++] = chosen; + + /* + * Adjust strat_total, and quit if we have stored a > or < + * key. + */ + if (chosen != NULL && curattr > prefix) + { + strat = chosen->sk_strategy; + if (strat != BTEqualStrategyNumber) + { + *stratTotal = strat; + if (strat == BTGreaterStrategyNumber || + strat == BTLessStrategyNumber) + break; + } + } + + /* + * Done if that was the last attribute, or if next key is not + * in sequence (implying no boundary key is available for the + * next attribute). + */ + if (i >= numberOfKeys) + { + curattr++; + while(curattr <= prefix) + { + startKeys[keysCount++] = NULL; + curattr++; + } + break; + } + else if (cur->sk_attno != curattr + 1) + { + curattr++; + while(curattr < cur->sk_attno && curattr <= prefix) + { + startKeys[keysCount++] = NULL; + curattr++; + } + if (curattr > prefix && curattr != cur->sk_attno) + break; + } + else + { + curattr++; + } + + /* + * Reset for next attr. + */ + chosen = NULL; + impliesNN = NULL; + } + + /* + * Can we use this key as a starting boundary for this attr? + * + * If not, does it imply a NOT NULL constraint? (Because + * SK_SEARCHNULL keys are always assigned BTEqualStrategyNumber, + * *any* inequality key works for that; we need not test.) + */ + switch (cur->sk_strategy) + { + case BTLessStrategyNumber: + case BTLessEqualStrategyNumber: + if (chosen == NULL) + { + if (ScanDirectionIsBackward(dir)) + chosen = cur; + else + impliesNN = cur; + } + break; + case BTEqualStrategyNumber: + /* override any non-equality choice */ + chosen = cur; + break; + case BTGreaterEqualStrategyNumber: + case BTGreaterStrategyNumber: + if (chosen == NULL) + { + if (ScanDirectionIsForward(dir)) + chosen = cur; + else + impliesNN = cur; + } + break; + } + } + } + return keysCount; +} + +void print_itup(BlockNumber blk, IndexTuple left, IndexTuple right, Relation rel, char *extra) +{ + bool isnull[INDEX_MAX_KEYS]; + Datum values[INDEX_MAX_KEYS]; + char *lkey_desc = NULL; + char *rkey_desc; + + /* Avoid infinite recursion -- don't instrument catalog indexes */ + if (!IsCatalogRelation(rel)) + { + TupleDesc itupdesc = RelationGetDescr(rel); + int natts; + int indnkeyatts = rel->rd_index->indnkeyatts; + + natts = BTreeTupleGetNAtts(left, rel); + itupdesc->natts = Min(indnkeyatts, natts); + memset(&isnull, 0xFF, sizeof(isnull)); + index_deform_tuple(left, itupdesc, values, isnull); + rel->rd_index->indnkeyatts = natts; + + /* + * Since the regression tests should pass when the instrumentation + * patch is applied, be prepared for BuildIndexValueDescription() to + * return NULL due to security considerations. + */ + lkey_desc = BuildIndexValueDescription(rel, values, isnull); + if (lkey_desc && right) + { + /* + * Revolting hack: modify tuple descriptor to have number of key + * columns actually present in caller's pivot tuples + */ + natts = BTreeTupleGetNAtts(right, rel); + itupdesc->natts = Min(indnkeyatts, natts); + memset(&isnull, 0xFF, sizeof(isnull)); + index_deform_tuple(right, itupdesc, values, isnull); + rel->rd_index->indnkeyatts = natts; + rkey_desc = BuildIndexValueDescription(rel, values, isnull); + elog(DEBUG1, "%s blk %u sk > %s, sk <= %s %s", + RelationGetRelationName(rel), blk, lkey_desc, rkey_desc, + extra); + pfree(rkey_desc); + } + else + elog(DEBUG1, "%s blk %u sk check %s %s", + RelationGetRelationName(rel), blk, lkey_desc, extra); + + /* Cleanup */ + itupdesc->natts = IndexRelationGetNumberOfAttributes(rel); + rel->rd_index->indnkeyatts = indnkeyatts; + if (lkey_desc) + pfree(lkey_desc); + } +} diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index 03a9cd36e6..fde43f8131 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -71,6 +71,9 @@ spghandler(PG_FUNCTION_ARGS) amroutine->ambulkdelete = spgbulkdelete; amroutine->amvacuumcleanup = spgvacuumcleanup; amroutine->amcanreturn = spgcanreturn; + amroutine->amskip = NULL; + amroutine->ambeginskipscan = NULL; + amroutine->amgetskiptuple = NULL; amroutine->amcostestimate = spgcostestimate; amroutine->amoptions = spgoptions; amroutine->amproperty = spgproperty; diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 10644dfac4..ef8e89d259 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -152,6 +152,7 @@ static void ExplainXMLTag(const char *tagname, int flags, ExplainState *es); static void ExplainIndentText(ExplainState *es); static void ExplainJSONLineEnding(ExplainState *es); static void ExplainYAMLLineStarting(ExplainState *es); +static void ExplainIndexSkipScanKeys(int skipPrefixSize, ExplainState *es); static void escape_yaml(StringInfo buf, const char *str); @@ -1114,6 +1115,22 @@ ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used) return planstate_tree_walker(planstate, ExplainPreScanNode, rels_used); } +/* + * ExplainIndexSkipScanKeys - + * Append information about index skip scan to es->str. + * + * Can be used to print the skip prefix size. + */ +static void +ExplainIndexSkipScanKeys(int skipPrefixSize, ExplainState *es) +{ + if (skipPrefixSize > 0) + { + if (es->format != EXPLAIN_FORMAT_TEXT) + ExplainPropertyInteger("Distinct Prefix", NULL, skipPrefixSize, es); + } +} + /* * ExplainNode - * Appends a description of a plan tree to es->str @@ -1461,6 +1478,9 @@ ExplainNode(PlanState *planstate, List *ancestors, { IndexScan *indexscan = (IndexScan *) plan; + if (indexscan->indexdistinct) + ExplainIndexSkipScanKeys(indexscan->indexskipprefixsize, es); + ExplainIndexScanDetails(indexscan->indexid, indexscan->indexorderdir, es); @@ -1471,6 +1491,9 @@ ExplainNode(PlanState *planstate, List *ancestors, { IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan; + if (indexonlyscan->indexdistinct) + ExplainIndexSkipScanKeys(indexonlyscan->indexskipprefixsize, es); + ExplainIndexScanDetails(indexonlyscan->indexid, indexonlyscan->indexorderdir, es); @@ -1731,6 +1754,8 @@ ExplainNode(PlanState *planstate, List *ancestors, switch (nodeTag(plan)) { case T_IndexScan: + if (((IndexScan *) plan)->indexskipprefixsize > 0) + ExplainPropertyText("Skip scan", ((IndexScan *) plan)->indexdistinct ? "Distinct only" : "All", es); show_scan_qual(((IndexScan *) plan)->indexqualorig, "Index Cond", planstate, ancestors, es); if (((IndexScan *) plan)->indexqualorig) @@ -1744,6 +1769,8 @@ ExplainNode(PlanState *planstate, List *ancestors, planstate, es); break; case T_IndexOnlyScan: + if (((IndexOnlyScan *) plan)->indexskipprefixsize > 0) + ExplainPropertyText("Skip scan", ((IndexOnlyScan *) plan)->indexdistinct ? "Distinct only" : "All", es); show_scan_qual(((IndexOnlyScan *) plan)->indexqual, "Index Cond", planstate, ancestors, es); if (((IndexOnlyScan *) plan)->indexqual) @@ -1760,6 +1787,8 @@ ExplainNode(PlanState *planstate, List *ancestors, planstate->instrument->ntuples2, 0, es); break; case T_BitmapIndexScan: + if (((BitmapIndexScan *) plan)->indexskipprefixsize > 0) + ExplainPropertyText("Skip scan", "All", es); show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig, "Index Cond", planstate, ancestors, es); break; diff --git a/src/backend/executor/execScan.c b/src/backend/executor/execScan.c index 9f1d8b6d1e..3c1a79a809 100644 --- a/src/backend/executor/execScan.c +++ b/src/backend/executor/execScan.c @@ -133,6 +133,14 @@ ExecScanFetch(ScanState *node, return (*accessMtd) (node); } +TupleTableSlot * +ExecScan(ScanState *node, + ExecScanAccessMtd accessMtd, /* function returning a tuple */ + ExecScanRecheckMtd recheckMtd) +{ + return ExecScanExtended(node, accessMtd, recheckMtd, NULL); +} + /* ---------------------------------------------------------------- * ExecScan * @@ -155,9 +163,10 @@ ExecScanFetch(ScanState *node, * ---------------------------------------------------------------- */ TupleTableSlot * -ExecScan(ScanState *node, +ExecScanExtended(ScanState *node, ExecScanAccessMtd accessMtd, /* function returning a tuple */ - ExecScanRecheckMtd recheckMtd) + ExecScanRecheckMtd recheckMtd, + ExecScanSkipMtd skipMtd) { ExprContext *econtext; ExprState *qual; @@ -170,6 +179,20 @@ ExecScan(ScanState *node, projInfo = node->ps.ps_ProjInfo; econtext = node->ps.ps_ExprContext; + if (skipMtd != NULL && node->ss_FirstTupleEmitted) + { + bool cont = skipMtd(node); + if (!cont) + { + node->ss_FirstTupleEmitted = false; + return ExecClearTuple(node->ss_ScanTupleSlot); + } + } + else + { + node->ss_FirstTupleEmitted = true; + } + /* interrupt checks are in ExecScanFetch */ /* @@ -178,8 +201,13 @@ ExecScan(ScanState *node, */ if (!qual && !projInfo) { + TupleTableSlot *slot; + ResetExprContext(econtext); - return ExecScanFetch(node, accessMtd, recheckMtd); + slot = ExecScanFetch(node, accessMtd, recheckMtd); + if (TupIsNull(slot)) + node->ss_FirstTupleEmitted = false; + return slot; } /* @@ -206,6 +234,7 @@ ExecScan(ScanState *node, */ if (TupIsNull(slot)) { + node->ss_FirstTupleEmitted = false; if (projInfo) return ExecClearTuple(projInfo->pi_state.resultslot); else @@ -306,6 +335,8 @@ ExecScanReScan(ScanState *node) */ ExecClearTuple(node->ss_ScanTupleSlot); + node->ss_FirstTupleEmitted = false; + /* Rescan EvalPlanQual tuple if we're inside an EvalPlanQual recheck */ if (estate->es_epq_active != NULL) { diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c index 48c2036297..2b05970ec3 100644 --- a/src/backend/executor/nodeBitmapIndexscan.c +++ b/src/backend/executor/nodeBitmapIndexscan.c @@ -22,13 +22,14 @@ #include "postgres.h" #include "access/genam.h" +#include "access/relscan.h" #include "executor/execdebug.h" #include "executor/nodeBitmapIndexscan.h" #include "executor/nodeIndexscan.h" #include "miscadmin.h" +#include "utils/rel.h" #include "utils/memutils.h" - /* ---------------------------------------------------------------- * ExecBitmapIndexScan * @@ -223,6 +224,7 @@ ExecInitBitmapIndexScan(BitmapIndexScan *node, EState *estate, int eflags) indexstate->ss.ps.plan = (Plan *) node; indexstate->ss.ps.state = estate; indexstate->ss.ps.ExecProcNode = ExecBitmapIndexScan; + indexstate->ss.ss_FirstTupleEmitted = false; /* normally we don't make the result bitmap till runtime */ indexstate->biss_result = NULL; @@ -308,10 +310,20 @@ ExecInitBitmapIndexScan(BitmapIndexScan *node, EState *estate, int eflags) /* * Initialize scan descriptor. */ - indexstate->biss_ScanDesc = - index_beginscan_bitmap(indexstate->biss_RelationDesc, - estate->es_snapshot, - indexstate->biss_NumScanKeys); + if (node->indexskipprefixsize > 0) + { + indexstate->biss_ScanDesc = + index_beginscan_bitmap_skip(indexstate->biss_RelationDesc, + estate->es_snapshot, + indexstate->biss_NumScanKeys, + Min(IndexRelationGetNumberOfKeyAttributes(indexstate->biss_RelationDesc), + node->indexskipprefixsize)); + } + else + indexstate->biss_ScanDesc = + index_beginscan_bitmap(indexstate->biss_RelationDesc, + estate->es_snapshot, + indexstate->biss_NumScanKeys); /* * If no run-time keys to calculate, go ahead and pass the scankeys to the diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 0754e28a9a..7b13ec8a87 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -41,6 +41,7 @@ #include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/predicate.h" +#include "storage/itemptr.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -49,6 +50,37 @@ static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node); static void StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup, TupleDesc itupdesc); +static bool +IndexOnlySkip(IndexOnlyScanState *node) +{ + EState *estate; + ScanDirection direction; + IndexScanDesc scandesc; + IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) node->ss.ps.plan; + + if (!node->ioss_Distinct) + return true; + + /* + * extract necessary information from index scan node + */ + estate = node->ss.ps.state; + direction = estate->es_direction; + /* flip direction if this is an overall backward scan */ + if (ScanDirectionIsBackward(indexonlyscan->indexorderdir)) + { + if (ScanDirectionIsForward(direction)) + direction = BackwardScanDirection; + else if (ScanDirectionIsBackward(direction)) + direction = ForwardScanDirection; + } + scandesc = node->ioss_ScanDesc; + + if (!index_skip(scandesc, direction, indexonlyscan->indexorderdir)) + return false; + + return true; +} /* ---------------------------------------------------------------- * IndexOnlyNext @@ -65,6 +97,8 @@ IndexOnlyNext(IndexOnlyScanState *node) IndexScanDesc scandesc; TupleTableSlot *slot; ItemPointer tid; + ItemPointerData startTid; + IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) node->ss.ps.plan; /* * extract necessary information from index scan node @@ -72,7 +106,7 @@ IndexOnlyNext(IndexOnlyScanState *node) estate = node->ss.ps.state; direction = estate->es_direction; /* flip direction if this is an overall backward scan */ - if (ScanDirectionIsBackward(((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir)) + if (ScanDirectionIsBackward(indexonlyscan->indexorderdir)) { if (ScanDirectionIsForward(direction)) direction = BackwardScanDirection; @@ -90,11 +124,19 @@ IndexOnlyNext(IndexOnlyScanState *node) * serially executing an index only scan that was planned to be * parallel. */ - scandesc = index_beginscan(node->ss.ss_currentRelation, - node->ioss_RelationDesc, - estate->es_snapshot, - node->ioss_NumScanKeys, - node->ioss_NumOrderByKeys); + if (node->ioss_SkipPrefixSize > 0) + scandesc = index_beginscan_skip(node->ss.ss_currentRelation, + node->ioss_RelationDesc, + estate->es_snapshot, + node->ioss_NumScanKeys, + node->ioss_NumOrderByKeys, + Min(IndexRelationGetNumberOfKeyAttributes(node->ioss_RelationDesc), node->ioss_SkipPrefixSize)); + else + scandesc = index_beginscan(node->ss.ss_currentRelation, + node->ioss_RelationDesc, + estate->es_snapshot, + node->ioss_NumScanKeys, + node->ioss_NumOrderByKeys); node->ioss_ScanDesc = scandesc; @@ -114,11 +156,16 @@ IndexOnlyNext(IndexOnlyScanState *node) node->ioss_OrderByKeys, node->ioss_NumOrderByKeys); } + else + { + ItemPointerCopy(&scandesc->xs_heaptid, &startTid); + } /* * OK, now that we have what we need, fetch the next tuple. */ - while ((tid = index_getnext_tid(scandesc, direction)) != NULL) + while ((tid = node->ioss_SkipPrefixSize > 0 ? index_getnext_tid_skip(scandesc, direction, node->ioss_Distinct ? indexonlyscan->indexorderdir : direction) : + index_getnext_tid(scandesc, direction)) != NULL) { bool tuple_from_heap = false; @@ -314,9 +361,10 @@ ExecIndexOnlyScan(PlanState *pstate) if (node->ioss_NumRuntimeKeys != 0 && !node->ioss_RuntimeKeysReady) ExecReScan((PlanState *) node); - return ExecScan(&node->ss, + return ExecScanExtended(&node->ss, (ExecScanAccessMtd) IndexOnlyNext, - (ExecScanRecheckMtd) IndexOnlyRecheck); + (ExecScanRecheckMtd) IndexOnlyRecheck, + node->ioss_Distinct ? (ExecScanSkipMtd) IndexOnlySkip : NULL); } /* ---------------------------------------------------------------- @@ -504,6 +552,9 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) indexstate->ss.ps.plan = (Plan *) node; indexstate->ss.ps.state = estate; indexstate->ss.ps.ExecProcNode = ExecIndexOnlyScan; + indexstate->ss.ss_FirstTupleEmitted = false; + indexstate->ioss_SkipPrefixSize = node->indexskipprefixsize; + indexstate->ioss_Distinct = node->indexdistinct; /* * Miscellaneous initialization diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 2fffb1b437..c6b6e7a6fb 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -69,6 +69,37 @@ static void reorderqueue_push(IndexScanState *node, TupleTableSlot *slot, Datum *orderbyvals, bool *orderbynulls); static HeapTuple reorderqueue_pop(IndexScanState *node); +static bool +IndexSkip(IndexScanState *node) +{ + EState *estate; + ScanDirection direction; + IndexScanDesc scandesc; + IndexScan *indexscan = (IndexScan *) node->ss.ps.plan; + + if (!node->iss_Distinct) + return true; + + /* + * extract necessary information from index scan node + */ + estate = node->ss.ps.state; + direction = estate->es_direction; + /* flip direction if this is an overall backward scan */ + if (ScanDirectionIsBackward(indexscan->indexorderdir)) + { + if (ScanDirectionIsForward(direction)) + direction = BackwardScanDirection; + else if (ScanDirectionIsBackward(direction)) + direction = ForwardScanDirection; + } + scandesc = node->iss_ScanDesc; + + if (!index_skip(scandesc, direction, indexscan->indexorderdir)) + return false; + + return true; +} /* ---------------------------------------------------------------- * IndexNext @@ -85,6 +116,7 @@ IndexNext(IndexScanState *node) ScanDirection direction; IndexScanDesc scandesc; TupleTableSlot *slot; + IndexScan *indexscan = (IndexScan *) node->ss.ps.plan; /* * extract necessary information from index scan node @@ -92,7 +124,7 @@ IndexNext(IndexScanState *node) estate = node->ss.ps.state; direction = estate->es_direction; /* flip direction if this is an overall backward scan */ - if (ScanDirectionIsBackward(((IndexScan *) node->ss.ps.plan)->indexorderdir)) + if (ScanDirectionIsBackward(indexscan->indexorderdir)) { if (ScanDirectionIsForward(direction)) direction = BackwardScanDirection; @@ -109,14 +141,25 @@ IndexNext(IndexScanState *node) * We reach here if the index scan is not parallel, or if we're * serially executing an index scan that was planned to be parallel. */ - scandesc = index_beginscan(node->ss.ss_currentRelation, - node->iss_RelationDesc, - estate->es_snapshot, - node->iss_NumScanKeys, - node->iss_NumOrderByKeys); + if (node->iss_SkipPrefixSize > 0) + scandesc = index_beginscan_skip(node->ss.ss_currentRelation, + node->iss_RelationDesc, + estate->es_snapshot, + node->iss_NumScanKeys, + node->iss_NumOrderByKeys, + Min(IndexRelationGetNumberOfKeyAttributes(node->iss_RelationDesc), node->iss_SkipPrefixSize)); + else + scandesc = index_beginscan(node->ss.ss_currentRelation, + node->iss_RelationDesc, + estate->es_snapshot, + node->iss_NumScanKeys, + node->iss_NumOrderByKeys); node->iss_ScanDesc = scandesc; + /* Index skip scan assumes xs_want_itup, so set it to true if we skip over distinct */ + node->iss_ScanDesc->xs_want_itup = indexscan->indexdistinct; + /* * If no run-time keys to calculate or they are ready, go ahead and * pass the scankeys to the index AM. @@ -130,7 +173,9 @@ IndexNext(IndexScanState *node) /* * ok, now that we have what we need, fetch the next tuple. */ - while (index_getnext_slot(scandesc, direction, slot)) + while (node->iss_SkipPrefixSize > 0 ? + index_getnext_slot_skip(scandesc, direction, node->iss_Distinct ? indexscan->indexorderdir : direction, slot) : + index_getnext_slot(scandesc, direction, slot)) { CHECK_FOR_INTERRUPTS(); @@ -530,13 +575,15 @@ ExecIndexScan(PlanState *pstate) ExecReScan((PlanState *) node); if (node->iss_NumOrderByKeys > 0) - return ExecScan(&node->ss, + return ExecScanExtended(&node->ss, (ExecScanAccessMtd) IndexNextWithReorder, - (ExecScanRecheckMtd) IndexRecheck); + (ExecScanRecheckMtd) IndexRecheck, + node->iss_Distinct ? (ExecScanSkipMtd) IndexSkip : NULL); else - return ExecScan(&node->ss, + return ExecScanExtended(&node->ss, (ExecScanAccessMtd) IndexNext, - (ExecScanRecheckMtd) IndexRecheck); + (ExecScanRecheckMtd) IndexRecheck, + node->iss_Distinct ? (ExecScanSkipMtd) IndexSkip : NULL); } /* ---------------------------------------------------------------- @@ -910,6 +957,9 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) indexstate->ss.ps.plan = (Plan *) node; indexstate->ss.ps.state = estate; indexstate->ss.ps.ExecProcNode = ExecIndexScan; + indexstate->ss.ss_FirstTupleEmitted = false; + indexstate->iss_SkipPrefixSize = node->indexskipprefixsize; + indexstate->iss_Distinct = node->indexdistinct; /* * Miscellaneous initialization diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index 652ba7f8ee..268e799e82 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -497,6 +497,8 @@ _copyIndexScan(const IndexScan *from) COPY_NODE_FIELD(indexorderbyorig); COPY_NODE_FIELD(indexorderbyops); COPY_SCALAR_FIELD(indexorderdir); + COPY_SCALAR_FIELD(indexskipprefixsize); + COPY_SCALAR_FIELD(indexdistinct); return newnode; } @@ -522,6 +524,8 @@ _copyIndexOnlyScan(const IndexOnlyScan *from) COPY_NODE_FIELD(indexorderby); COPY_NODE_FIELD(indextlist); COPY_SCALAR_FIELD(indexorderdir); + COPY_SCALAR_FIELD(indexskipprefixsize); + COPY_SCALAR_FIELD(indexdistinct); return newnode; } @@ -546,6 +550,7 @@ _copyBitmapIndexScan(const BitmapIndexScan *from) COPY_SCALAR_FIELD(isshared); COPY_NODE_FIELD(indexqual); COPY_NODE_FIELD(indexqualorig); + COPY_SCALAR_FIELD(indexskipprefixsize); return newnode; } diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 5a25a50edc..be6b359aa3 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -569,6 +569,8 @@ _outIndexScan(StringInfo str, const IndexScan *node) WRITE_NODE_FIELD(indexorderbyorig); WRITE_NODE_FIELD(indexorderbyops); WRITE_ENUM_FIELD(indexorderdir, ScanDirection); + WRITE_INT_FIELD(indexskipprefixsize); + WRITE_INT_FIELD(indexdistinct); } static void @@ -583,6 +585,9 @@ _outIndexOnlyScan(StringInfo str, const IndexOnlyScan *node) WRITE_NODE_FIELD(indexorderby); WRITE_NODE_FIELD(indextlist); WRITE_ENUM_FIELD(indexorderdir, ScanDirection); + WRITE_INT_FIELD(indexskipprefixsize); + WRITE_INT_FIELD(indexdistinct); + } static void @@ -596,6 +601,7 @@ _outBitmapIndexScan(StringInfo str, const BitmapIndexScan *node) WRITE_BOOL_FIELD(isshared); WRITE_NODE_FIELD(indexqual); WRITE_NODE_FIELD(indexqualorig); + WRITE_INT_FIELD(indexskipprefixsize); } static void diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index 54d97ac3d0..b95f217475 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -1878,6 +1878,8 @@ _readIndexScan(void) READ_NODE_FIELD(indexorderbyorig); READ_NODE_FIELD(indexorderbyops); READ_ENUM_FIELD(indexorderdir, ScanDirection); + READ_INT_FIELD(indexskipprefixsize); + READ_INT_FIELD(indexdistinct); READ_DONE(); } @@ -1897,6 +1899,8 @@ _readIndexOnlyScan(void) READ_NODE_FIELD(indexorderby); READ_NODE_FIELD(indextlist); READ_ENUM_FIELD(indexorderdir, ScanDirection); + READ_INT_FIELD(indexskipprefixsize); + READ_INT_FIELD(indexdistinct); READ_DONE(); } @@ -1915,6 +1919,7 @@ _readBitmapIndexScan(void) READ_BOOL_FIELD(isshared); READ_NODE_FIELD(indexqual); READ_NODE_FIELD(indexqualorig); + READ_INT_FIELD(indexskipprefixsize); READ_DONE(); } diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 4dfc5d29bc..9cac47a9ca 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -1755,7 +1755,9 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, List *pathkeys = (List *) lfirst(lcp); List *startup_subpaths = NIL; List *total_subpaths = NIL; + List *uniq_total_subpaths = NIL; bool startup_neq_total = false; + bool uniq_neq_total = false; ListCell *lcr; bool match_partition_order; bool match_partition_order_desc; @@ -1784,7 +1786,8 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, { RelOptInfo *childrel = (RelOptInfo *) lfirst(lcr); Path *cheapest_startup, - *cheapest_total; + *cheapest_total, + *cheapest_uniq_total = NULL; /* Locate the right paths, if they are available. */ cheapest_startup = @@ -1800,6 +1803,19 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, TOTAL_COST, false); + cheapest_uniq_total = + get_cheapest_path_for_pathkeys(childrel->unique_pathlist, + pathkeys, + NULL, + TOTAL_COST, + false); + + if (cheapest_uniq_total != NULL && !uniq_neq_total) + { + uniq_neq_total = true; + uniq_total_subpaths = list_copy(total_subpaths); + } + /* * If we can't find any paths with the right order just use the * cheapest-total path; we'll have to sort it later. @@ -1812,6 +1828,9 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, Assert(cheapest_total->param_info == NULL); } + if (cheapest_uniq_total == NULL) + cheapest_uniq_total = cheapest_total; + /* * Notice whether we actually have different paths for the * "cheapest" and "total" cases; frequently there will be no point @@ -1838,6 +1857,12 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, startup_subpaths = lappend(startup_subpaths, cheapest_startup); total_subpaths = lappend(total_subpaths, cheapest_total); + + if (uniq_neq_total) + { + cheapest_uniq_total = get_singleton_append_subpath(cheapest_uniq_total); + uniq_total_subpaths = lappend(uniq_total_subpaths, cheapest_uniq_total); + } } else if (match_partition_order_desc) { @@ -1851,6 +1876,12 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, startup_subpaths = lcons(cheapest_startup, startup_subpaths); total_subpaths = lcons(cheapest_total, total_subpaths); + + if (uniq_neq_total) + { + cheapest_uniq_total = get_singleton_append_subpath(cheapest_uniq_total); + uniq_total_subpaths = lcons(cheapest_uniq_total, uniq_total_subpaths); + } } else { @@ -1862,6 +1893,11 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, &startup_subpaths, NULL); accumulate_append_subpath(cheapest_total, &total_subpaths, NULL); + if (uniq_neq_total) + { + accumulate_append_subpath(cheapest_uniq_total, + &uniq_total_subpaths, NULL); + } } } @@ -1888,6 +1924,16 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, 0, false, -1)); + if (uniq_neq_total) + add_unique_path(rel, (Path *) create_append_path(root, + rel, + uniq_total_subpaths, + NIL, + pathkeys, + NULL, + 0, + false, + -1)); } else { @@ -1903,6 +1949,12 @@ generate_orderedappend_paths(PlannerInfo *root, RelOptInfo *rel, total_subpaths, pathkeys, NULL)); + if (uniq_neq_total) + add_unique_path(rel, (Path *) create_merge_append_path(root, + rel, + uniq_total_subpaths, + pathkeys, + NULL)); } } } diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 1e4d404f02..ff75c02003 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -133,6 +133,7 @@ int max_parallel_workers_per_gather = 2; bool enable_seqscan = true; bool enable_indexscan = true; bool enable_indexonlyscan = true; +bool enable_indexskipscan = true; bool enable_bitmapscan = true; bool enable_tidscan = true; bool enable_sort = true; diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 0e4e00eaf0..ba2dd30a13 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -784,6 +784,16 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel, { IndexPath *ipath = (IndexPath *) lfirst(lc); + /* + * To prevent unique paths from index skip scans being potentially used + * when not needed scan keep them in a separate pathlist. + */ + if (ipath->indexdistinct) + { + add_unique_path(rel, (Path *) ipath); + continue; + } + if (index->amhasgettuple) add_path(rel, (Path *) ipath); @@ -872,6 +882,7 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, bool pathkeys_possibly_useful; bool index_is_ordered; bool index_only_scan; + bool can_skip; int indexcol; /* @@ -1021,6 +1032,9 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, index_only_scan = (scantype != ST_BITMAPSCAN && check_index_only(rel, index)); + /* Check if an index skip scan is possible. */ + can_skip = enable_indexskipscan & index->amcanskip; + /* * 4. Generate an indexscan path if there are relevant restriction clauses * in the current clauses, OR the index ordering is potentially useful for @@ -1044,6 +1058,33 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, false); result = lappend(result, ipath); + /* Consider index skip scan as well */ + if (root->query_uniquekeys != NULL && can_skip) + { + int numusefulkeys = list_length(useful_pathkeys); + int numsortkeys = list_length(root->query_pathkeys); + + if (numusefulkeys == numsortkeys) + { + int prefix; + if (list_length(root->distinct_pathkeys) > 0) + prefix = find_index_prefix_for_pathkey(root, + index, + ForwardScanDirection, + llast_node(PathKey, + root->distinct_pathkeys)); + else + /* all are distinct keys are constant and optimized away. + * skipping with 1 is sufficient as all are constant anyway + */ + prefix = 1; + + result = lappend(result, + create_skipscan_unique_path(root, index, + (Path *) ipath, prefix)); + } + } + /* * If appropriate, consider parallel index scan. We don't allow * parallel index scan for bitmap index scans. @@ -1099,6 +1140,33 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, false); result = lappend(result, ipath); + /* Consider index skip scan as well */ + if (root->query_uniquekeys != NULL && can_skip) + { + int numusefulkeys = list_length(useful_pathkeys); + int numsortkeys = list_length(root->query_pathkeys); + + if (numusefulkeys == numsortkeys) + { + int prefix; + if (list_length(root->distinct_pathkeys) > 0) + prefix = find_index_prefix_for_pathkey(root, + index, + BackwardScanDirection, + llast_node(PathKey, + root->distinct_pathkeys)); + else + /* all are distinct keys are constant and optimized away. + * skipping with 1 is sufficient as all are constant anyway + */ + prefix = 1; + + result = lappend(result, + create_skipscan_unique_path(root, index, + (Path *) ipath, prefix)); + } + } + /* If appropriate, consider parallel index scan */ if (index->amcanparallel && rel->consider_parallel && outer_relids == NULL && diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c index ad4fe19872..519bbbb788 100644 --- a/src/backend/optimizer/path/pathkeys.c +++ b/src/backend/optimizer/path/pathkeys.c @@ -522,6 +522,78 @@ get_cheapest_parallel_safe_total_inner(List *paths) * NEW PATHKEY FORMATION ****************************************************************************/ +/* + * Find the prefix size for a specific path key in an index. + * For example, an index with (a,b,c) finding path key b will + * return prefix 2. + * Returns 0 when not found. + */ +int +find_index_prefix_for_pathkey(PlannerInfo *root, + IndexOptInfo *index, + ScanDirection scandir, + PathKey *pathkey) +{ + ListCell *lc; + int i; + + i = 0; + foreach(lc, index->indextlist) + { + TargetEntry *indextle = (TargetEntry *) lfirst(lc); + Expr *indexkey; + bool reverse_sort; + bool nulls_first; + PathKey *cpathkey; + + /* + * INCLUDE columns are stored in index unordered, so they don't + * support ordered index scan. + */ + if (i >= index->nkeycolumns) + break; + + /* We assume we don't need to make a copy of the tlist item */ + indexkey = indextle->expr; + + if (ScanDirectionIsBackward(scandir)) + { + reverse_sort = !index->reverse_sort[i]; + nulls_first = !index->nulls_first[i]; + } + else + { + reverse_sort = index->reverse_sort[i]; + nulls_first = index->nulls_first[i]; + } + + /* + * OK, try to make a canonical pathkey for this sort key. Note we're + * underneath any outer joins, so nullable_relids should be NULL. + */ + cpathkey = make_pathkey_from_sortinfo(root, + indexkey, + NULL, + index->sortopfamily[i], + index->opcintype[i], + index->indexcollations[i], + reverse_sort, + nulls_first, + 0, + index->rel->relids, + false); + + if (cpathkey == pathkey) + { + return i + 1; + } + + i++; + } + + return 0; +} + /* * build_index_pathkeys * Build a pathkeys list that describes the ordering induced by an index diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 3dc0176a51..673e122714 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -185,15 +185,20 @@ static IndexScan *make_indexscan(List *qptlist, List *qpqual, Index scanrelid, Oid indexid, List *indexqual, List *indexqualorig, List *indexorderby, List *indexorderbyorig, List *indexorderbyops, - ScanDirection indexscandir); + ScanDirection indexscandir, + int skipprefix, + bool distinct); static IndexOnlyScan *make_indexonlyscan(List *qptlist, List *qpqual, Index scanrelid, Oid indexid, List *indexqual, List *indexorderby, List *indextlist, - ScanDirection indexscandir); + ScanDirection indexscandir, + int skipprefix, + bool distinct); static BitmapIndexScan *make_bitmap_indexscan(Index scanrelid, Oid indexid, List *indexqual, - List *indexqualorig); + List *indexqualorig, + int skipPrefixSize); static BitmapHeapScan *make_bitmap_heapscan(List *qptlist, List *qpqual, Plan *lefttree, @@ -3066,7 +3071,9 @@ create_indexscan_plan(PlannerInfo *root, fixed_indexquals, fixed_indexorderbys, best_path->indexinfo->indextlist, - best_path->indexscandir); + best_path->indexscandir, + best_path->indexskipprefix, + best_path->indexdistinct); else scan_plan = (Scan *) make_indexscan(tlist, qpqual, @@ -3077,7 +3084,9 @@ create_indexscan_plan(PlannerInfo *root, fixed_indexorderbys, indexorderbys, indexorderbyops, - best_path->indexscandir); + best_path->indexscandir, + best_path->indexskipprefix, + best_path->indexdistinct); copy_generic_path_info(&scan_plan->plan, &best_path->path); @@ -3367,7 +3376,8 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, plan = (Plan *) make_bitmap_indexscan(iscan->scan.scanrelid, iscan->indexid, iscan->indexqual, - iscan->indexqualorig); + iscan->indexqualorig, + iscan->indexskipprefixsize); /* and set its cost/width fields appropriately */ plan->startup_cost = 0.0; plan->total_cost = ipath->indextotalcost; @@ -5410,7 +5420,9 @@ make_indexscan(List *qptlist, List *indexorderby, List *indexorderbyorig, List *indexorderbyops, - ScanDirection indexscandir) + ScanDirection indexscandir, + int skipPrefixSize, + bool distinct) { IndexScan *node = makeNode(IndexScan); Plan *plan = &node->scan.plan; @@ -5427,6 +5439,8 @@ make_indexscan(List *qptlist, node->indexorderbyorig = indexorderbyorig; node->indexorderbyops = indexorderbyops; node->indexorderdir = indexscandir; + node->indexskipprefixsize = skipPrefixSize; + node->indexdistinct = distinct; return node; } @@ -5439,7 +5453,9 @@ make_indexonlyscan(List *qptlist, List *indexqual, List *indexorderby, List *indextlist, - ScanDirection indexscandir) + ScanDirection indexscandir, + int skipPrefixSize, + bool distinct) { IndexOnlyScan *node = makeNode(IndexOnlyScan); Plan *plan = &node->scan.plan; @@ -5454,6 +5470,8 @@ make_indexonlyscan(List *qptlist, node->indexorderby = indexorderby; node->indextlist = indextlist; node->indexorderdir = indexscandir; + node->indexskipprefixsize = skipPrefixSize; + node->indexdistinct = distinct; return node; } @@ -5462,7 +5480,8 @@ static BitmapIndexScan * make_bitmap_indexscan(Index scanrelid, Oid indexid, List *indexqual, - List *indexqualorig) + List *indexqualorig, + int skipPrefixSize) { BitmapIndexScan *node = makeNode(BitmapIndexScan); Plan *plan = &node->scan.plan; @@ -5475,6 +5494,7 @@ make_bitmap_indexscan(Index scanrelid, node->indexid = indexid; node->indexqual = indexqual; node->indexqualorig = indexqualorig; + node->indexskipprefixsize = skipPrefixSize; return node; } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index ea2408c13f..91b1ca2634 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -3109,12 +3109,18 @@ standard_qp_callback(PlannerInfo *root, void *extra) if (parse->distinctClause && grouping_is_sortable(parse->distinctClause)) + { root->distinct_pathkeys = make_pathkeys_for_sortclauses(root, parse->distinctClause, tlist); + root->query_uniquekeys = build_uniquekeys(root, parse->distinctClause); + } else + { root->distinct_pathkeys = NIL; + root->query_uniquekeys = NIL; + } root->sort_pathkeys = make_pathkeys_for_sortclauses(root, @@ -4441,7 +4447,7 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel, RelOptInfo *distinct_rel) { Query *parse = root->parse; - Path *cheapest_input_path = input_rel->cheapest_total_path; + Path *cheapest_input_path = input_rel->cheapest_distinct_unique_path; double numDistinctRows; bool allow_hash; Path *path; @@ -4514,8 +4520,14 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel, { Path *path = (Path *) lfirst(lc); - if (query_has_uniquekeys_for(root, needed_pathkeys, false)) + if (query_has_uniquekeys_for(root, path->uniquekeys, false)) add_path(distinct_rel, path); + else if (pathkeys_contained_in(needed_pathkeys, path->pathkeys)) + add_path(distinct_rel, (Path *) + create_upper_unique_path(root, distinct_rel, + path, + list_length(root->distinct_pathkeys), + numDistinctRows)); } /* For explicit-sort case, always use the more rigorous clause */ diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 74e100e5a9..16633fd672 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -245,6 +245,7 @@ set_cheapest(RelOptInfo *parent_rel) { Path *cheapest_startup_path; Path *cheapest_total_path; + Path *cheapest_distinct_unique_path; Path *best_param_path; List *parameterized_paths; ListCell *p; @@ -256,6 +257,7 @@ set_cheapest(RelOptInfo *parent_rel) cheapest_startup_path = cheapest_total_path = best_param_path = NULL; parameterized_paths = NIL; + cheapest_distinct_unique_path = NULL; foreach(p, parent_rel->pathlist) { @@ -354,6 +356,36 @@ set_cheapest(RelOptInfo *parent_rel) cheapest_total_path = best_param_path; Assert(cheapest_total_path != NULL); + cheapest_distinct_unique_path = cheapest_total_path; + + foreach(p, parent_rel->unique_pathlist) + { + Path *path = (Path *) lfirst(p); + int cmp; + + /* Unparameterized path, so consider it for cheapest slots */ + if (cheapest_distinct_unique_path == NULL) + { + cheapest_distinct_unique_path = path; + continue; + } + + /* + * If we find two paths of identical costs, try to keep the + * better-sorted one. The paths might have unrelated sort + * orderings, in which case we can only guess which might be + * better to keep, but if one is superior then we definitely + * should keep that one. + */ + cmp = compare_path_costs(cheapest_distinct_unique_path, path, TOTAL_COST); + if (cmp > 0 || + (cmp == 0 && + compare_pathkeys(cheapest_distinct_unique_path->pathkeys, + path->pathkeys) == PATHKEYS_BETTER2)) + cheapest_distinct_unique_path = path; + } + + parent_rel->cheapest_distinct_unique_path = cheapest_distinct_unique_path; parent_rel->cheapest_startup_path = cheapest_startup_path; parent_rel->cheapest_total_path = cheapest_total_path; parent_rel->cheapest_unique_path = NULL; /* computed only if needed */ @@ -1293,6 +1325,10 @@ create_append_path(PlannerInfo *root, pathnode->path.parallel_safe = rel->consider_parallel; pathnode->path.parallel_workers = parallel_workers; pathnode->path.pathkeys = pathkeys; + if (list_length(subpaths) == 1) + { + pathnode->path.uniquekeys = ((Path*)linitial(subpaths))->uniquekeys; + } /* * For parallel append, non-partial paths are sorted by descending total @@ -1437,6 +1473,10 @@ create_merge_append_path(PlannerInfo *root, pathnode->path.parallel_workers = 0; pathnode->path.pathkeys = pathkeys; pathnode->subpaths = subpaths; + if (list_length(subpaths) == 1) + { + pathnode->path.uniquekeys = ((Path*)linitial(subpaths))->uniquekeys; + } /* * Apply query-wide LIMIT if known and path is for sole base relation. @@ -3094,6 +3134,44 @@ create_upper_unique_path(PlannerInfo *root, return pathnode; } +/* + * create_skipscan_unique_path + * Creates a pathnode the same as an existing IndexPath except based on + * skipping duplicate values. This may or may not be cheaper than using + * create_upper_unique_path. + * + * The input path must be an IndexPath for an index that supports amskip. + */ +IndexPath * +create_skipscan_unique_path(PlannerInfo *root, IndexOptInfo *index, + Path *basepath, int prefix) +{ + IndexPath *pathnode = makeNode(IndexPath); + int numDistinctRows; + UniqueKey *ukey; + + Assert(IsA(basepath, IndexPath)); + + /* We don't want to modify basepath, so make a copy. */ + memcpy(pathnode, basepath, sizeof(IndexPath)); + + ukey = linitial_node(UniqueKey, root->query_uniquekeys); + + Assert(prefix > 0); + pathnode->indexskipprefix = prefix; + pathnode->indexdistinct = true; + pathnode->path.uniquekeys = root->query_uniquekeys; + + numDistinctRows = estimate_num_groups(root, ukey->exprs, + pathnode->path.rows, + NULL, NULL); + + pathnode->path.total_cost = pathnode->path.startup_cost * numDistinctRows; + pathnode->path.rows = numDistinctRows; + + return pathnode; +} + /* * create_agg_path * Creates a pathnode that represents performing aggregation/grouping diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index a50f897ffa..e67026ac3d 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -272,6 +272,9 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, info->amoptionalkey = amroutine->amoptionalkey; info->amsearcharray = amroutine->amsearcharray; info->amsearchnulls = amroutine->amsearchnulls; + info->amcanskip = (amroutine->amskip != NULL && + amroutine->amgetskiptuple != NULL && + amroutine->ambeginskipscan != NULL); info->amcanparallel = amroutine->amcanparallel; info->amhasgettuple = (amroutine->amgettuple != NULL); info->amhasgetbitmap = amroutine->amgetbitmap != NULL && diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index d2ce4a8450..d393dbe2aa 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1000,6 +1000,15 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_indexskipscan", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enables the planner's use of index-skip-scan plans."), + NULL + }, + &enable_indexskipscan, + true, + NULL, NULL, NULL + }, { {"enable_bitmapscan", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enables the planner's use of bitmap-scan plans."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 3fe9a53cb3..df48ee95c4 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -367,6 +367,7 @@ #enable_incremental_sort = on #enable_indexscan = on #enable_indexonlyscan = on +#enable_indexskipscan = on #enable_material = on #enable_memoize = on #enable_mergejoin = on diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 9e93908c65..eb17cbbc9b 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -1009,7 +1009,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc, state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ - indexScanKey = _bt_mkscankey(indexRel, NULL); + indexScanKey = _bt_mkscankey(indexRel, NULL, NULL); if (state->indexInfo->ii_Expressions != NULL) { @@ -1104,7 +1104,7 @@ tuplesort_begin_index_btree(Relation heapRel, state->indexRel = indexRel; state->enforceUnique = enforceUnique; - indexScanKey = _bt_mkscankey(indexRel, NULL); + indexScanKey = _bt_mkscankey(indexRel, NULL, NULL); /* Prepare SortSupport data for each column */ state->sortKeys = (SortSupport) palloc0(state->nKeys * diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index d357ebb559..5f436476ef 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -162,6 +162,12 @@ typedef IndexScanDesc (*ambeginscan_function) (Relation indexRelation, int nkeys, int norderbys); +/* prepare for index scan with skip */ +typedef IndexScanDesc (*ambeginscan_skip_function) (Relation indexRelation, + int nkeys, + int norderbys, + int prefix); + /* (re)start index scan */ typedef void (*amrescan_function) (IndexScanDesc scan, ScanKey keys, @@ -173,6 +179,16 @@ typedef void (*amrescan_function) (IndexScanDesc scan, typedef bool (*amgettuple_function) (IndexScanDesc scan, ScanDirection direction); +/* next valid tuple */ +typedef bool (*amgettuple_with_skip_function) (IndexScanDesc scan, + ScanDirection prefixDir, + ScanDirection postfixDir); + +/* skip past duplicates */ +typedef bool (*amskip_function) (IndexScanDesc scan, + ScanDirection prefixDir, + ScanDirection postfixDir); + /* fetch all valid tuples */ typedef int64 (*amgetbitmap_function) (IndexScanDesc scan, TIDBitmap *tbm); @@ -269,12 +285,15 @@ typedef struct IndexAmRoutine amvalidate_function amvalidate; amadjustmembers_function amadjustmembers; /* can be NULL */ ambeginscan_function ambeginscan; + ambeginscan_skip_function ambeginskipscan; amrescan_function amrescan; amgettuple_function amgettuple; /* can be NULL */ + amgettuple_with_skip_function amgetskiptuple; /* can be NULL */ amgetbitmap_function amgetbitmap; /* can be NULL */ amendscan_function amendscan; ammarkpos_function ammarkpos; /* can be NULL */ amrestrpos_function amrestrpos; /* can be NULL */ + amskip_function amskip; /* can be NULL */ /* interface functions to support parallel index scans */ amestimateparallelscan_function amestimateparallelscan; /* can be NULL */ diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 480a4762f5..38f51b4690 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -152,9 +152,17 @@ extern IndexScanDesc index_beginscan(Relation heapRelation, Relation indexRelation, Snapshot snapshot, int nkeys, int norderbys); +extern IndexScanDesc index_beginscan_skip(Relation heapRelation, + Relation indexRelation, + Snapshot snapshot, + int nkeys, int norderbys, int prefix); extern IndexScanDesc index_beginscan_bitmap(Relation indexRelation, Snapshot snapshot, int nkeys); +extern IndexScanDesc index_beginscan_bitmap_skip(Relation indexRelation, + Snapshot snapshot, + int nkeys, + int prefix); extern void index_rescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int norderbys); @@ -170,10 +178,16 @@ extern IndexScanDesc index_beginscan_parallel(Relation heaprel, ParallelIndexScanDesc pscan); extern ItemPointer index_getnext_tid(IndexScanDesc scan, ScanDirection direction); +extern ItemPointer index_getnext_tid_skip(IndexScanDesc scan, + ScanDirection prefixDir, + ScanDirection postfixDir); struct TupleTableSlot; extern bool index_fetch_heap(IndexScanDesc scan, struct TupleTableSlot *slot); extern bool index_getnext_slot(IndexScanDesc scan, ScanDirection direction, struct TupleTableSlot *slot); +extern bool index_getnext_slot_skip(IndexScanDesc scan, ScanDirection prefixDir, + ScanDirection postfixDir, + struct TupleTableSlot *slot); extern int64 index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap); extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info, @@ -183,6 +197,8 @@ extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info, extern IndexBulkDeleteResult *index_vacuum_cleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *istat); extern bool index_can_return(Relation indexRelation, int attno); +extern bool index_skip(IndexScanDesc scan, ScanDirection prefixDir, + ScanDirection postfixDir); extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum, uint16 procnum); extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum, diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 30a216e4c0..de19bbc878 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1027,6 +1027,54 @@ typedef struct BTArrayKeyInfo Datum *elem_values; /* array of num_elems Datums */ } BTArrayKeyInfo; +typedef struct BTSkipCompareResult +{ + bool equal; + int prefixCmpResult, skCmpResult; + bool prefixSkip, fullKeySkip; + int prefixSkipIndex; +} BTSkipCompareResult; + +typedef enum BTSkipState +{ + SkipStateStop, + SkipStateSkip, + SkipStateSkipExtra, + SkipStateNext +} BTSkipState; + +typedef struct BTSkipPosData +{ + BTSkipState nextAction; + ScanDirection nextDirection; + int nextSkipIndex; + BTScanInsertData skipScanKey; + char skipTuple[BLCKSZ]; /* tuple data where skipScanKey Datum's point to */ +} BTSkipPosData; + +typedef struct BTSkipData +{ + /* used to control skipping + * curPos.skipScanKey is a combination of currentTupleKey and fwdScanKey/bwdScanKey. + * currentTupleKey contains the scan keys for the current tuple + * fwdScanKey contains the scan keys for quals that would be chosen for a forward scan + * bwdScanKey contains the scan keys for quals that would be chosen for a backward scan + * we need both fwd and bwd, because the scan keys differ for going fwd and bwd + * if a qual would be a>2 and a<5, fwd would have a>2, while bwd would have a<5 + */ + BTScanInsertData currentTupleKey; + BTScanInsertData fwdScanKey; + ScanKeyData fwdNotNullKeys[INDEX_MAX_KEYS]; + BTScanInsertData bwdScanKey; + ScanKeyData bwdNotNullKeys[INDEX_MAX_KEYS]; + /* length of prefix to skip */ + int prefix; + + BTSkipPosData curPos, markPos; +} BTSkipData; + +typedef BTSkipData *BTSkip; + typedef struct BTScanOpaqueData { /* these fields are set by _bt_preprocess_keys(): */ @@ -1064,6 +1112,9 @@ typedef struct BTScanOpaqueData */ int markItemIndex; /* itemIndex, or -1 if not valid */ + /* Work space for _bt_skip */ + BTSkip skipData; /* used to control skipping */ + /* keep these last in struct for efficiency */ BTScanPosData currPos; /* current position data */ BTScanPosData markPos; /* marked position, if any */ @@ -1078,6 +1129,8 @@ typedef BTScanOpaqueData *BTScanOpaque; */ #define SK_BT_REQFWD 0x00010000 /* required to continue forward scan */ #define SK_BT_REQBKWD 0x00020000 /* required to continue backward scan */ +#define SK_BT_REQSKIPFWD 0x00040000 /* required to continue forward scan within current prefix */ +#define SK_BT_REQSKIPBKWD 0x00080000 /* required to continue backward scan within current prefix */ #define SK_BT_INDOPTION_SHIFT 24 /* must clear the above bits */ #define SK_BT_DESC (INDOPTION_DESC << SK_BT_INDOPTION_SHIFT) #define SK_BT_NULLS_FIRST (INDOPTION_NULLS_FIRST << SK_BT_INDOPTION_SHIFT) @@ -1124,9 +1177,12 @@ extern bool btinsert(Relation rel, Datum *values, bool *isnull, bool indexUnchanged, struct IndexInfo *indexInfo); extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys); +extern IndexScanDesc btbeginscan_skip(Relation rel, int nkeys, int norderbys, int skipPrefix); extern Size btestimateparallelscan(void); extern void btinitparallelscan(void *target); extern bool btgettuple(IndexScanDesc scan, ScanDirection dir); +extern bool btgettuple_skip(IndexScanDesc scan, ScanDirection prefixDir, ScanDirection postfixDir); +extern bool btskip(IndexScanDesc scan, ScanDirection prefixDir, ScanDirection postfixDir); extern int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); extern void btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys); @@ -1227,15 +1283,81 @@ extern Buffer _bt_moveright(Relation rel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access, Snapshot snapshot); extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate); extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum); -extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); -extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); +extern bool _bt_first(IndexScanDesc scan, ScanDirection prefixDir, ScanDirection postfixDir); +extern bool _bt_next(IndexScanDesc scan, ScanDirection prefixDir, ScanDirection postfixDir); extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, Snapshot snapshot); +extern Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot); +extern OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); +extern void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir); +extern bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, + OffsetNumber *offnum, bool isRegularMode); +extern bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); +extern bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir); +extern void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); + +/* +* prototypes for functions in nbtskip.c +*/ +static inline bool +_bt_skip_enabled(BTScanOpaque so) +{ + return so->skipData != NULL; +} + +static inline bool +_bt_skip_is_regular_mode(ScanDirection prefixDir, ScanDirection postfixDir) +{ + return prefixDir == postfixDir; +} + +/* returns whether or not we can use extra quals in the scankey after skipping to a prefix */ +static inline bool +_bt_has_extra_quals_after_skip(BTSkip skip, ScanDirection dir, int prefix) +{ + if (ScanDirectionIsForward(dir)) + { + return skip->fwdScanKey.keysz > prefix; + } + else + { + return skip->bwdScanKey.keysz > prefix; + } +} + +/* alias of BTScanPosIsValid */ +static inline bool +_bt_skip_is_always_valid(BTScanOpaque so) +{ + return BTScanPosIsValid(so->currPos); +} + +extern bool _bt_skip(IndexScanDesc scan, ScanDirection prefixDir, ScanDirection postfixDir); +extern void _bt_skip_create_scankeys(Relation rel, BTScanOpaque so); +extern void _bt_skip_update_scankey_for_extra_skip(IndexScanDesc scan, Relation indexRel, + ScanDirection curDir, ScanDirection prefixDir, bool prioritizeEqual, IndexTuple itup); +extern void _bt_skip_once(IndexScanDesc scan, IndexTuple *curTuple, OffsetNumber *curTupleOffnum, + bool forceSkip, ScanDirection prefixDir, ScanDirection postfixDir); +extern void _bt_skip_extra_conditions(IndexScanDesc scan, IndexTuple *curTuple, OffsetNumber *curTupleOffnum, + ScanDirection prefixDir, ScanDirection postfixDir, BTSkipCompareResult *cmp); +extern bool _bt_skip_find_next(IndexScanDesc scan, IndexTuple curTuple, OffsetNumber curTupleOffnum, + ScanDirection prefixDir, ScanDirection postfixDir); +extern void _bt_skip_until_match(IndexScanDesc scan, IndexTuple *curTuple, OffsetNumber *curTupleOffnum, + ScanDirection prefixDir, ScanDirection postfixDir); +extern bool _bt_has_results(BTScanOpaque so); +extern void _bt_compare_current_item(IndexScanDesc scan, IndexTuple tuple, int tupnatts, + ScanDirection dir, bool isRegularMode, BTSkipCompareResult* cmp); +extern bool _bt_step_back_page(IndexScanDesc scan, IndexTuple *curTuple, OffsetNumber *curTupleOffnum); +extern bool _bt_step_forward_page(IndexScanDesc scan, BlockNumber next, IndexTuple *curTuple, + OffsetNumber *curTupleOffnum); +extern bool _bt_checkkeys_skip(IndexScanDesc scan, IndexTuple tuple, int tupnatts, + ScanDirection dir, bool *continuescan, int *prefixskipindex); +extern IndexTuple +_bt_get_tuple_from_offset(BTScanOpaque so, OffsetNumber curTupleOffnum); /* * prototypes for functions in nbtutils.c */ -extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup); extern void _bt_freestack(BTStack stack); extern void _bt_preprocess_array_keys(IndexScanDesc scan); extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir); @@ -1244,7 +1366,7 @@ extern void _bt_mark_array_keys(IndexScanDesc scan); extern void _bt_restore_array_keys(IndexScanDesc scan); extern void _bt_preprocess_keys(IndexScanDesc scan); extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, - int tupnatts, ScanDirection dir, bool *continuescan); + int tupnatts, ScanDirection dir, bool *continuescan, int *indexSkipPrefix); extern void _bt_killitems(IndexScanDesc scan); extern BTCycleId _bt_vacuum_cycleid(Relation rel); extern BTCycleId _bt_start_vacuum(Relation rel); @@ -1266,6 +1388,19 @@ extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, extern void _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace, Page page, IndexTuple newtup); extern bool _bt_allequalimage(Relation rel, bool debugmessage); +extern bool _bt_checkkeys_threeway(IndexScanDesc scan, IndexTuple tuple, int tupnatts, + ScanDirection dir, bool *continuescan, int *prefixSkipIndex); +extern bool _bt_create_insertion_scan_key(Relation rel, ScanDirection dir, + ScanKey* startKeys, int keysCount, + BTScanInsert inskey, StrategyNumber* stratTotal, + bool* goback); +extern void _bt_set_bsearch_flags(StrategyNumber stratTotal, ScanDirection dir, + bool* nextkey, bool* goback); +extern int _bt_choose_scan_keys(ScanKey scanKeys, int numberOfKeys, ScanDirection dir, +ScanKey* startKeys, ScanKeyData* notnullkeys, + StrategyNumber* stratTotal, int prefix); +extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup, BTScanInsert key); +extern void print_itup(BlockNumber blk, IndexTuple left, IndexTuple right, Relation rel, char *extra); /* * prototypes for functions in nbtvalidate.c diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index cd57a704ad..1833fa946b 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -455,9 +455,13 @@ extern Datum ExecMakeFunctionResultSet(SetExprState *fcache, */ typedef TupleTableSlot *(*ExecScanAccessMtd) (ScanState *node); typedef bool (*ExecScanRecheckMtd) (ScanState *node, TupleTableSlot *slot); +typedef bool (*ExecScanSkipMtd) (ScanState *node); extern TupleTableSlot *ExecScan(ScanState *node, ExecScanAccessMtd accessMtd, ExecScanRecheckMtd recheckMtd); +extern TupleTableSlot *ExecScanExtended(ScanState *node, ExecScanAccessMtd accessMtd, + ExecScanRecheckMtd recheckMtd, + ExecScanSkipMtd skipMtd); extern void ExecAssignScanProjectionInfo(ScanState *node); extern void ExecAssignScanProjectionInfoWithVarno(ScanState *node, int varno); extern void ExecScanReScan(ScanState *node); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 2e8cbee69f..4a7c750d18 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1379,6 +1379,7 @@ typedef struct ScanState Relation ss_currentRelation; struct TableScanDescData *ss_currentScanDesc; TupleTableSlot *ss_ScanTupleSlot; + bool ss_FirstTupleEmitted; } ScanState; /* ---------------- @@ -1475,6 +1476,8 @@ typedef struct IndexScanState ExprContext *iss_RuntimeContext; Relation iss_RelationDesc; struct IndexScanDescData *iss_ScanDesc; + int iss_SkipPrefixSize; + bool iss_Distinct; /* These are needed for re-checking ORDER BY expr ordering */ pairingheap *iss_ReorderQueue; @@ -1504,6 +1507,8 @@ typedef struct IndexScanState * TableSlot slot for holding tuples fetched from the table * VMBuffer buffer in use for visibility map testing, if any * PscanLen size of parallel index-only scan descriptor + * SkipPrefixSize number of keys for skip-based DISTINCT + * FirstTupleEmitted has the first tuple been emitted * ---------------- */ typedef struct IndexOnlyScanState @@ -1522,6 +1527,8 @@ typedef struct IndexOnlyScanState struct IndexScanDescData *ioss_ScanDesc; TupleTableSlot *ioss_TableSlot; Buffer ioss_VMBuffer; + int ioss_SkipPrefixSize; + bool ioss_Distinct; Size ioss_PscanLen; } IndexOnlyScanState; diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 3ae6b91576..87042223c9 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -699,6 +699,7 @@ typedef struct RelOptInfo List *unique_pathlist; /* unique Paths */ struct Path *cheapest_startup_path; struct Path *cheapest_total_path; + struct Path *cheapest_distinct_unique_path; struct Path *cheapest_unique_path; List *cheapest_parameterized_paths; @@ -1267,6 +1268,9 @@ typedef struct Path * we need not recompute them when considering using the same index in a * bitmap index/heap scan (see BitmapHeapPath). The costs of the IndexPath * itself represent the costs of an IndexScan or IndexOnlyScan plan type. + * + * 'indexskipprefix' represents the number of columns to consider for skip + * scans. *---------- */ typedef struct IndexPath @@ -1279,6 +1283,8 @@ typedef struct IndexPath ScanDirection indexscandir; Cost indextotalcost; Selectivity indexselectivity; + int indexskipprefix; + bool indexdistinct; } IndexPath; /* diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 01a246d50e..79b70b9414 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -411,6 +411,8 @@ typedef struct IndexScan List *indexorderbyorig; /* the same in original form */ List *indexorderbyops; /* OIDs of sort ops for ORDER BY exprs */ ScanDirection indexorderdir; /* forward or backward or don't care */ + int indexskipprefixsize; /* the size of the prefix for skip scans */ + bool indexdistinct; /* whether only distinct keys are requested */ } IndexScan; /* ---------------- @@ -438,6 +440,8 @@ typedef struct IndexOnlyScan List *indexorderby; /* list of index ORDER BY exprs */ List *indextlist; /* TargetEntry list describing index's cols */ ScanDirection indexorderdir; /* forward or backward or don't care */ + int indexskipprefixsize; /* the size of the prefix for skip scans */ + bool indexdistinct; /* whether only distinct keys are requested */ } IndexOnlyScan; /* ---------------- @@ -464,6 +468,7 @@ typedef struct BitmapIndexScan bool isshared; /* Create shared bitmap if set */ List *indexqual; /* list of index quals (OpExprs) */ List *indexqualorig; /* the same in original form */ + int indexskipprefixsize; /* the size of the prefix for skip scans */ } BitmapIndexScan; /* ---------------- diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 2113bc82de..561ab023e2 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -50,6 +50,7 @@ extern PGDLLIMPORT int max_parallel_workers_per_gather; extern PGDLLIMPORT bool enable_seqscan; extern PGDLLIMPORT bool enable_indexscan; extern PGDLLIMPORT bool enable_indexonlyscan; +extern PGDLLIMPORT bool enable_indexskipscan; extern PGDLLIMPORT bool enable_bitmapscan; extern PGDLLIMPORT bool enable_tidscan; extern PGDLLIMPORT bool enable_sort; diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index facb2dfe74..0343b2e1f6 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -217,6 +217,10 @@ extern UpperUniquePath *create_upper_unique_path(PlannerInfo *root, Path *subpath, int numCols, double numGroups); +extern IndexPath *create_skipscan_unique_path(PlannerInfo *root, + IndexOptInfo *index, + Path *subpath, + int prefix); extern AggPath *create_agg_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index e71e65264a..32f0527019 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -212,6 +212,10 @@ extern Path *get_cheapest_fractional_path_for_pathkeys(List *paths, Relids required_outer, double fraction); extern Path *get_cheapest_parallel_safe_total_inner(List *paths); +extern int find_index_prefix_for_pathkey(PlannerInfo *root, + IndexOptInfo *index, + ScanDirection scandir, + PathKey *pathkey); extern List *build_index_pathkeys(PlannerInfo *root, IndexOptInfo *index, ScanDirection scandir); extern List *build_partition_pathkeys(PlannerInfo *root, RelOptInfo *partrel, diff --git a/src/interfaces/libpq/encnames.c b/src/interfaces/libpq/encnames.c new file mode 120000 index 0000000000..ca78618b55 --- /dev/null +++ b/src/interfaces/libpq/encnames.c @@ -0,0 +1 @@ +../../../src/backend/utils/mb/encnames.c \ No newline at end of file diff --git a/src/interfaces/libpq/wchar.c b/src/interfaces/libpq/wchar.c new file mode 120000 index 0000000000..a27508f72a --- /dev/null +++ b/src/interfaces/libpq/wchar.c @@ -0,0 +1 @@ +../../../src/backend/utils/mb/wchar.c \ No newline at end of file diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out index 58122c6f88..ec98dbf63b 100644 --- a/src/test/regress/expected/select_distinct.out +++ b/src/test/regress/expected/select_distinct.out @@ -375,3 +375,602 @@ SELECT null IS NOT DISTINCT FROM null as "yes"; t (1 row) +-- index only skip scan +CREATE TABLE distinct_a (a int, b int, c int); +INSERT INTO distinct_a ( + SELECT five, tenthous, 10 FROM + generate_series(1, 5) five, + generate_series(1, 10000) tenthous +); +CREATE INDEX ON distinct_a (a, b); +ANALYZE distinct_a; +SELECT DISTINCT a FROM distinct_a; + a +--- + 1 + 2 + 3 + 4 + 5 +(5 rows) + +SELECT DISTINCT a FROM distinct_a WHERE a = 1; + a +--- + 1 +(1 row) + +SELECT DISTINCT a FROM distinct_a ORDER BY a DESC; + a +--- + 5 + 4 + 3 + 2 + 1 +(5 rows) + +EXPLAIN (COSTS OFF) +SELECT DISTINCT a FROM distinct_a; + QUERY PLAN +-------------------------------------------------------- + Index Only Scan using distinct_a_a_b_idx on distinct_a + Skip scan: Distinct only +(2 rows) + +-- test index skip scan with a condition on a non unique field +SELECT DISTINCT ON (a) a, b FROM distinct_a WHERE b = 2; + a | b +---+--- + 1 | 2 + 2 | 2 + 3 | 2 + 4 | 2 + 5 | 2 +(5 rows) + +-- test index skip scan backwards +SELECT DISTINCT ON (a) a, b FROM distinct_a ORDER BY a DESC, b DESC; + a | b +---+------- + 5 | 10000 + 4 | 10000 + 3 | 10000 + 2 | 10000 + 1 | 10000 +(5 rows) + +-- check colums order +CREATE INDEX distinct_a_b_a on distinct_a (b, a); +SELECT DISTINCT a FROM distinct_a WHERE b = 2; + a +--- + 1 + 2 + 3 + 4 + 5 +(5 rows) + +SELECT DISTINCT on (a, b) a, b FROM distinct_a WHERE b = 2; + a | b +---+--- + 1 | 2 + 2 | 2 + 3 | 2 + 4 | 2 + 5 | 2 +(5 rows) + +EXPLAIN (COSTS OFF) +SELECT DISTINCT a FROM distinct_a WHERE b = 2; + QUERY PLAN +---------------------------------------------------- + Index Only Scan using distinct_a_b_a on distinct_a + Skip scan: Distinct only + Index Cond: (b = 2) +(3 rows) + +EXPLAIN (COSTS OFF) +SELECT DISTINCT on (a, b) a, b FROM distinct_a WHERE b = 2; + QUERY PLAN +---------------------------------------------------- + Index Only Scan using distinct_a_b_a on distinct_a + Skip scan: Distinct only + Index Cond: (b = 2) +(3 rows) + +DROP INDEX distinct_a_b_a; +-- test opposite scan/index directions inside a cursor +-- forward/backward +BEGIN; +DECLARE c SCROLL CURSOR FOR +SELECT DISTINCT ON (a) a,b FROM distinct_a ORDER BY a, b; +FETCH FROM c; + a | b +---+--- + 1 | 1 +(1 row) + +FETCH BACKWARD FROM c; + a | b +---+--- +(0 rows) + +FETCH 6 FROM c; + a | b +---+--- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +FETCH BACKWARD 6 FROM c; + a | b +---+--- + 5 | 1 + 4 | 1 + 3 | 1 + 2 | 1 + 1 | 1 +(5 rows) + +FETCH 6 FROM c; + a | b +---+--- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +FETCH BACKWARD 6 FROM c; + a | b +---+--- + 5 | 1 + 4 | 1 + 3 | 1 + 2 | 1 + 1 | 1 +(5 rows) + +END; +-- backward/forward +BEGIN; +DECLARE c SCROLL CURSOR FOR +SELECT DISTINCT ON (a) a,b FROM distinct_a ORDER BY a DESC, b DESC; +FETCH FROM c; + a | b +---+------- + 5 | 10000 +(1 row) + +FETCH BACKWARD FROM c; + a | b +---+--- +(0 rows) + +FETCH 6 FROM c; + a | b +---+------- + 5 | 10000 + 4 | 10000 + 3 | 10000 + 2 | 10000 + 1 | 10000 +(5 rows) + +FETCH BACKWARD 6 FROM c; + a | b +---+------- + 1 | 10000 + 2 | 10000 + 3 | 10000 + 4 | 10000 + 5 | 10000 +(5 rows) + +FETCH 6 FROM c; + a | b +---+------- + 5 | 10000 + 4 | 10000 + 3 | 10000 + 2 | 10000 + 1 | 10000 +(5 rows) + +FETCH BACKWARD 6 FROM c; + a | b +---+------- + 1 | 10000 + 2 | 10000 + 3 | 10000 + 4 | 10000 + 5 | 10000 +(5 rows) + +END; +-- test missing values and skipping from the end +CREATE TABLE distinct_abc(a int, b int, c int); +CREATE INDEX ON distinct_abc(a, b, c); +INSERT INTO distinct_abc + VALUES (1, 1, 1), + (1, 1, 2), + (1, 2, 2), + (1, 2, 3), + (2, 2, 1), + (2, 2, 3), + (3, 1, 1), + (3, 1, 2), + (3, 2, 2), + (3, 2, 3); +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2; + QUERY PLAN +-------------------------------------------------------------- + Index Only Scan using distinct_abc_a_b_c_idx on distinct_abc + Skip scan: Distinct only + Index Cond: (c = 2) +(3 rows) + +BEGIN; +DECLARE c SCROLL CURSOR FOR +SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2; +FETCH ALL FROM c; + a | b | c +---+---+--- + 1 | 1 | 2 + 3 | 1 | 2 +(2 rows) + +FETCH BACKWARD ALL FROM c; + a | b | c +---+---+--- + 3 | 1 | 2 + 1 | 1 | 2 +(2 rows) + +END; +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2 +ORDER BY a DESC, b DESC; + QUERY PLAN +----------------------------------------------------------------------- + Index Only Scan Backward using distinct_abc_a_b_c_idx on distinct_abc + Skip scan: Distinct only + Index Cond: (c = 2) +(3 rows) + +BEGIN; +DECLARE c SCROLL CURSOR FOR +SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2 +ORDER BY a DESC, b DESC; +FETCH ALL FROM c; + a | b | c +---+---+--- + 3 | 2 | 2 + 1 | 2 | 2 +(2 rows) + +FETCH BACKWARD ALL FROM c; + a | b | c +---+---+--- + 1 | 2 | 2 + 3 | 2 | 2 +(2 rows) + +END; +DROP TABLE distinct_abc; +-- index skip scan +SELECT DISTINCT ON (a) a, b, c +FROM distinct_a ORDER BY a; + a | b | c +---+---+---- + 1 | 1 | 10 + 2 | 1 | 10 + 3 | 1 | 10 + 4 | 1 | 10 + 5 | 1 | 10 +(5 rows) + +SELECT DISTINCT ON (a) a, b, c +FROM distinct_a WHERE a = 1 ORDER BY a; + a | b | c +---+---+---- + 1 | 1 | 10 +(1 row) + +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (a) a, b, c +FROM distinct_a ORDER BY a; + QUERY PLAN +--------------------------------------------------- + Index Scan using distinct_a_a_b_idx on distinct_a + Skip scan: Distinct only +(2 rows) + +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (a) a, b, c +FROM distinct_a WHERE a = 1 ORDER BY a; + QUERY PLAN +--------------------------------------------------- + Index Scan using distinct_a_a_b_idx on distinct_a + Skip scan: Distinct only + Index Cond: (a = 1) +(3 rows) + +-- check colums order +SELECT DISTINCT a FROM distinct_a WHERE b = 2 AND c = 10; + a +--- + 1 + 2 + 3 + 4 + 5 +(5 rows) + +EXPLAIN (COSTS OFF) +SELECT DISTINCT a FROM distinct_a WHERE b = 2 AND c = 10; + QUERY PLAN +--------------------------------------------------- + Index Scan using distinct_a_a_b_idx on distinct_a + Skip scan: Distinct only + Index Cond: (b = 2) + Filter: (c = 10) +(4 rows) + +-- check projection case +SELECT DISTINCT a, a FROM distinct_a WHERE b = 2; + a | a +---+--- + 1 | 1 + 2 | 2 + 3 | 3 + 4 | 4 + 5 | 5 +(5 rows) + +SELECT DISTINCT a, 1 FROM distinct_a WHERE b = 2; + a | ?column? +---+---------- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +-- test cursor forward/backward movements +BEGIN; +DECLARE c SCROLL CURSOR FOR SELECT DISTINCT a FROM distinct_a; +FETCH FROM c; + a +--- + 1 +(1 row) + +FETCH BACKWARD FROM c; + a +--- +(0 rows) + +FETCH 6 FROM c; + a +--- + 1 + 2 + 3 + 4 + 5 +(5 rows) + +FETCH BACKWARD 6 FROM c; + a +--- + 5 + 4 + 3 + 2 + 1 +(5 rows) + +FETCH 6 FROM c; + a +--- + 1 + 2 + 3 + 4 + 5 +(5 rows) + +FETCH BACKWARD 6 FROM c; + a +--- + 5 + 4 + 3 + 2 + 1 +(5 rows) + +END; +DROP TABLE distinct_a; +-- test tuples visibility +CREATE TABLE distinct_visibility (a int, b int); +INSERT INTO distinct_visibility (select a, b from generate_series(1,5) a, generate_series(1, 10000) b); +CREATE INDEX ON distinct_visibility (a, b); +ANALYZE distinct_visibility; +SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a, b; + a | b +---+--- + 1 | 1 + 2 | 1 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +DELETE FROM distinct_visibility WHERE a = 2 and b = 1; +SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a, b; + a | b +---+--- + 1 | 1 + 2 | 2 + 3 | 1 + 4 | 1 + 5 | 1 +(5 rows) + +SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a DESC, b DESC; + a | b +---+------- + 5 | 10000 + 4 | 10000 + 3 | 10000 + 2 | 10000 + 1 | 10000 +(5 rows) + +DELETE FROM distinct_visibility WHERE a = 2 and b = 10000; +SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a DESC, b DESC; + a | b +---+------- + 5 | 10000 + 4 | 10000 + 3 | 10000 + 2 | 9999 + 1 | 10000 +(5 rows) + +DROP TABLE distinct_visibility; +-- test page boundaries +CREATE TABLE distinct_boundaries AS + SELECT a, b::int2 b, (b % 2)::int2 c FROM + generate_series(1, 5) a, + generate_series(1,366) b; +CREATE INDEX ON distinct_boundaries (a, b, c); +ANALYZE distinct_boundaries; +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (a) a, b, c from distinct_boundaries +WHERE b >= 1 and c = 0 ORDER BY a, b; + QUERY PLAN +---------------------------------------------------------------------------- + Index Only Scan using distinct_boundaries_a_b_c_idx on distinct_boundaries + Skip scan: Distinct only + Index Cond: ((b >= 1) AND (c = 0)) +(3 rows) + +SELECT DISTINCT ON (a) a, b, c from distinct_boundaries +WHERE b >= 1 and c = 0 ORDER BY a, b; + a | b | c +---+---+--- + 1 | 2 | 0 + 2 | 2 | 0 + 3 | 2 | 0 + 4 | 2 | 0 + 5 | 2 | 0 +(5 rows) + +DROP TABLE distinct_boundaries; +-- test tuple killing +-- DESC ordering +CREATE TABLE distinct_killed AS + SELECT a, b, b % 2 AS c, 10 AS d + FROM generate_series(1, 5) a, + generate_series(1,1000) b; +CREATE INDEX ON distinct_killed (a, b, c, d); +DELETE FROM distinct_killed where a = 3; +BEGIN; + DECLARE c SCROLL CURSOR FOR + SELECT DISTINCT ON (a) a,b,c,d + FROM distinct_killed ORDER BY a DESC, b DESC; + FETCH FORWARD ALL FROM c; + a | b | c | d +---+------+---+---- + 5 | 1000 | 0 | 10 + 4 | 1000 | 0 | 10 + 2 | 1000 | 0 | 10 + 1 | 1000 | 0 | 10 +(4 rows) + + FETCH BACKWARD ALL FROM c; + a | b | c | d +---+------+---+---- + 1 | 1000 | 0 | 10 + 2 | 1000 | 0 | 10 + 4 | 1000 | 0 | 10 + 5 | 1000 | 0 | 10 +(4 rows) + +COMMIT; +DROP TABLE distinct_killed; +-- regular ordering +CREATE TABLE distinct_killed AS + SELECT a, b, b % 2 AS c, 10 AS d + FROM generate_series(1, 5) a, + generate_series(1,1000) b; +CREATE INDEX ON distinct_killed (a, b, c, d); +DELETE FROM distinct_killed where a = 3; +BEGIN; + DECLARE c SCROLL CURSOR FOR + SELECT DISTINCT ON (a) a,b,c,d + FROM distinct_killed ORDER BY a, b; + FETCH FORWARD ALL FROM c; + a | b | c | d +---+---+---+---- + 1 | 1 | 1 | 10 + 2 | 1 | 1 | 10 + 4 | 1 | 1 | 10 + 5 | 1 | 1 | 10 +(4 rows) + + FETCH BACKWARD ALL FROM c; + a | b | c | d +---+---+---+---- + 5 | 1 | 1 | 10 + 4 | 1 | 1 | 10 + 2 | 1 | 1 | 10 + 1 | 1 | 1 | 10 +(4 rows) + +COMMIT; +DROP TABLE distinct_killed; +-- partial delete +CREATE TABLE distinct_killed AS + SELECT a, b, b % 2 AS c, 10 AS d + FROM generate_series(1, 5) a, + generate_series(1,1000) b; +CREATE INDEX ON distinct_killed (a, b, c, d); +DELETE FROM distinct_killed WHERE a = 3 AND b <= 999; +BEGIN; + DECLARE c SCROLL CURSOR FOR + SELECT DISTINCT ON (a) a,b,c,d + FROM distinct_killed ORDER BY a DESC, b DESC; + FETCH FORWARD ALL FROM c; + a | b | c | d +---+------+---+---- + 5 | 1000 | 0 | 10 + 4 | 1000 | 0 | 10 + 3 | 1000 | 0 | 10 + 2 | 1000 | 0 | 10 + 1 | 1000 | 0 | 10 +(5 rows) + + FETCH BACKWARD ALL FROM c; + a | b | c | d +---+------+---+---- + 1 | 1000 | 0 | 10 + 2 | 1000 | 0 | 10 + 3 | 1000 | 0 | 10 + 4 | 1000 | 0 | 10 + 5 | 1000 | 0 | 10 +(5 rows) + +COMMIT; +DROP TABLE distinct_killed; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 6e54f3e15e..282cae21b3 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -103,6 +103,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_incremental_sort | on enable_indexonlyscan | on enable_indexscan | on + enable_indexskipscan | on enable_material | on enable_memoize | on enable_mergejoin | on @@ -115,7 +116,7 @@ select name, setting from pg_settings where name like 'enable%'; enable_seqscan | on enable_sort | on enable_tidscan | on -(20 rows) +(21 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql index 1bfe59c26f..708aa2a746 100644 --- a/src/test/regress/sql/select_distinct.sql +++ b/src/test/regress/sql/select_distinct.sql @@ -174,3 +174,251 @@ SELECT 1 IS NOT DISTINCT FROM 2 as "no"; SELECT 2 IS NOT DISTINCT FROM 2 as "yes"; SELECT 2 IS NOT DISTINCT FROM null as "no"; SELECT null IS NOT DISTINCT FROM null as "yes"; + +-- index only skip scan +CREATE TABLE distinct_a (a int, b int, c int); +INSERT INTO distinct_a ( + SELECT five, tenthous, 10 FROM + generate_series(1, 5) five, + generate_series(1, 10000) tenthous +); +CREATE INDEX ON distinct_a (a, b); +ANALYZE distinct_a; + +SELECT DISTINCT a FROM distinct_a; +SELECT DISTINCT a FROM distinct_a WHERE a = 1; +SELECT DISTINCT a FROM distinct_a ORDER BY a DESC; + +EXPLAIN (COSTS OFF) +SELECT DISTINCT a FROM distinct_a; + +-- test index skip scan with a condition on a non unique field +SELECT DISTINCT ON (a) a, b FROM distinct_a WHERE b = 2; + +-- test index skip scan backwards +SELECT DISTINCT ON (a) a, b FROM distinct_a ORDER BY a DESC, b DESC; + +-- check colums order +CREATE INDEX distinct_a_b_a on distinct_a (b, a); + +SELECT DISTINCT a FROM distinct_a WHERE b = 2; +SELECT DISTINCT on (a, b) a, b FROM distinct_a WHERE b = 2; + +EXPLAIN (COSTS OFF) +SELECT DISTINCT a FROM distinct_a WHERE b = 2; + +EXPLAIN (COSTS OFF) +SELECT DISTINCT on (a, b) a, b FROM distinct_a WHERE b = 2; + +DROP INDEX distinct_a_b_a; + +-- test opposite scan/index directions inside a cursor +-- forward/backward +BEGIN; +DECLARE c SCROLL CURSOR FOR +SELECT DISTINCT ON (a) a,b FROM distinct_a ORDER BY a, b; + +FETCH FROM c; +FETCH BACKWARD FROM c; + +FETCH 6 FROM c; +FETCH BACKWARD 6 FROM c; + +FETCH 6 FROM c; +FETCH BACKWARD 6 FROM c; + +END; + +-- backward/forward +BEGIN; +DECLARE c SCROLL CURSOR FOR +SELECT DISTINCT ON (a) a,b FROM distinct_a ORDER BY a DESC, b DESC; + +FETCH FROM c; +FETCH BACKWARD FROM c; + +FETCH 6 FROM c; +FETCH BACKWARD 6 FROM c; + +FETCH 6 FROM c; +FETCH BACKWARD 6 FROM c; + +END; + +-- test missing values and skipping from the end +CREATE TABLE distinct_abc(a int, b int, c int); +CREATE INDEX ON distinct_abc(a, b, c); +INSERT INTO distinct_abc + VALUES (1, 1, 1), + (1, 1, 2), + (1, 2, 2), + (1, 2, 3), + (2, 2, 1), + (2, 2, 3), + (3, 1, 1), + (3, 1, 2), + (3, 2, 2), + (3, 2, 3); + +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2; + +BEGIN; +DECLARE c SCROLL CURSOR FOR +SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2; + +FETCH ALL FROM c; +FETCH BACKWARD ALL FROM c; + +END; + +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2 +ORDER BY a DESC, b DESC; + +BEGIN; +DECLARE c SCROLL CURSOR FOR +SELECT DISTINCT ON (a) a,b,c FROM distinct_abc WHERE c = 2 +ORDER BY a DESC, b DESC; + +FETCH ALL FROM c; +FETCH BACKWARD ALL FROM c; + +END; + +DROP TABLE distinct_abc; + +-- index skip scan +SELECT DISTINCT ON (a) a, b, c +FROM distinct_a ORDER BY a; +SELECT DISTINCT ON (a) a, b, c +FROM distinct_a WHERE a = 1 ORDER BY a; + +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (a) a, b, c +FROM distinct_a ORDER BY a; +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (a) a, b, c +FROM distinct_a WHERE a = 1 ORDER BY a; + +-- check colums order +SELECT DISTINCT a FROM distinct_a WHERE b = 2 AND c = 10; + +EXPLAIN (COSTS OFF) +SELECT DISTINCT a FROM distinct_a WHERE b = 2 AND c = 10; + +-- check projection case +SELECT DISTINCT a, a FROM distinct_a WHERE b = 2; +SELECT DISTINCT a, 1 FROM distinct_a WHERE b = 2; + +-- test cursor forward/backward movements +BEGIN; +DECLARE c SCROLL CURSOR FOR SELECT DISTINCT a FROM distinct_a; + +FETCH FROM c; +FETCH BACKWARD FROM c; + +FETCH 6 FROM c; +FETCH BACKWARD 6 FROM c; + +FETCH 6 FROM c; +FETCH BACKWARD 6 FROM c; + +END; + +DROP TABLE distinct_a; + +-- test tuples visibility +CREATE TABLE distinct_visibility (a int, b int); +INSERT INTO distinct_visibility (select a, b from generate_series(1,5) a, generate_series(1, 10000) b); +CREATE INDEX ON distinct_visibility (a, b); +ANALYZE distinct_visibility; + +SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a, b; +DELETE FROM distinct_visibility WHERE a = 2 and b = 1; +SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a, b; + +SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a DESC, b DESC; +DELETE FROM distinct_visibility WHERE a = 2 and b = 10000; +SELECT DISTINCT ON (a) a, b FROM distinct_visibility ORDER BY a DESC, b DESC; +DROP TABLE distinct_visibility; + +-- test page boundaries +CREATE TABLE distinct_boundaries AS + SELECT a, b::int2 b, (b % 2)::int2 c FROM + generate_series(1, 5) a, + generate_series(1,366) b; + +CREATE INDEX ON distinct_boundaries (a, b, c); +ANALYZE distinct_boundaries; + +EXPLAIN (COSTS OFF) +SELECT DISTINCT ON (a) a, b, c from distinct_boundaries +WHERE b >= 1 and c = 0 ORDER BY a, b; + +SELECT DISTINCT ON (a) a, b, c from distinct_boundaries +WHERE b >= 1 and c = 0 ORDER BY a, b; + +DROP TABLE distinct_boundaries; + +-- test tuple killing + +-- DESC ordering +CREATE TABLE distinct_killed AS + SELECT a, b, b % 2 AS c, 10 AS d + FROM generate_series(1, 5) a, + generate_series(1,1000) b; + +CREATE INDEX ON distinct_killed (a, b, c, d); + +DELETE FROM distinct_killed where a = 3; + +BEGIN; + DECLARE c SCROLL CURSOR FOR + SELECT DISTINCT ON (a) a,b,c,d + FROM distinct_killed ORDER BY a DESC, b DESC; + FETCH FORWARD ALL FROM c; + FETCH BACKWARD ALL FROM c; +COMMIT; + +DROP TABLE distinct_killed; + +-- regular ordering +CREATE TABLE distinct_killed AS + SELECT a, b, b % 2 AS c, 10 AS d + FROM generate_series(1, 5) a, + generate_series(1,1000) b; + +CREATE INDEX ON distinct_killed (a, b, c, d); + +DELETE FROM distinct_killed where a = 3; + +BEGIN; + DECLARE c SCROLL CURSOR FOR + SELECT DISTINCT ON (a) a,b,c,d + FROM distinct_killed ORDER BY a, b; + FETCH FORWARD ALL FROM c; + FETCH BACKWARD ALL FROM c; +COMMIT; + +DROP TABLE distinct_killed; + +-- partial delete +CREATE TABLE distinct_killed AS + SELECT a, b, b % 2 AS c, 10 AS d + FROM generate_series(1, 5) a, + generate_series(1,1000) b; + +CREATE INDEX ON distinct_killed (a, b, c, d); + +DELETE FROM distinct_killed WHERE a = 3 AND b <= 999; + +BEGIN; + DECLARE c SCROLL CURSOR FOR + SELECT DISTINCT ON (a) a,b,c,d + FROM distinct_killed ORDER BY a DESC, b DESC; + FETCH FORWARD ALL FROM c; + FETCH BACKWARD ALL FROM c; +COMMIT; + +DROP TABLE distinct_killed; -- 2.29.2