DISTINCT with btree skip scan
Hello
As an exercise I hacked up the simplest code I could think of that would
demonstrate a faster DISTINCT based on skipping ahead to the next distinct
value in an index-only scan. Please see the attached (extremely buggy)
patch, and the example session below. (It's against my natural instinct to
send such half-baked early hacking phase code to the list, but thought it
would make sense to demo the concept and then seek advice, warnings, cease
and desist notices etc before pressing on down that route!) I would be
most grateful for any feedback you might have.
Best regards,
Thomas Munro
postgres=# create table foo (a int, b int, primary key (a, b));
CREATE TABLE
Time: 9.002 ms
postgres=# insert into foo select generate_series(1, 5000000) % 10,
generate_series(1, 5000000);
INSERT 0 5000000
Time: 35183.444 ms
postgres=# analyze foo;
ANALYZE
Time: 493.168 ms
postgres=# set enable_hashagg = true;
SET
Time: 0.274 ms
postgres=# explain select distinct a from foo;
┌───────────────────────────────────────────────────────────────────┐
│ QUERY PLAN │
├───────────────────────────────────────────────────────────────────┤
│ HashAggregate (cost=84624.00..84624.10 rows=10 width=4) │
│ Group Key: a │
│ -> Seq Scan on foo (cost=0.00..72124.00 rows=5000000 width=4) │
│ Planning time: 0.065 ms │
└───────────────────────────────────────────────────────────────────┘
(4 rows)
Time: 0.500 ms
postgres=# select distinct a from foo;
┌───┐
│ a │
├───┤
│ 6 │
│ 8 │
│ 1 │
│ 2 │
│ 3 │
│ 4 │
│ 5 │
│ 9 │
│ 7 │
│ 0 │
└───┘
(10 rows)
Time: 2017.019 ms
postgres=# set enable_hashagg = false;
SET
Time: 0.302 ms
postgres=# explain select distinct a from foo;
┌─────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ QUERY PLAN
│
├─────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ Index Only Scan for distinct prefix 1 using foo_pkey on foo
(cost=0.43..263354.20 rows=10 width=4) │
│ Planning time: 0.063 ms
│
└─────────────────────────────────────────────────────────────────────────────────────────────────────┘
(2 rows)
Time: 0.443 ms
postgres=# select distinct a from foo;
┌───┐
│ a │
├───┤
│ 0 │
│ 1 │
│ 2 │
│ 3 │
│ 4 │
│ 5 │
│ 6 │
│ 7 │
│ 8 │
│ 9 │
└───┘
(10 rows)
Time: 0.565 ms
Attachments:
distinct-skip.patch (text/x-patch; charset=US-ASCII; name=distinct-skip.patch)
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 53cf96f..5f10d7f 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -29,6 +29,7 @@
* index_can_return - does index support index-only scans?
* index_getprocid - get a support procedure OID
* index_getprocinfo - get a support procedure's lookup info
+ * index_skip - advance to the next key value in a scan
*
* NOTES
* This file contains the index_ routines which used
@@ -742,6 +743,37 @@ index_can_return(Relation indexRelation)
PointerGetDatum(indexRelation)));
}
+bool
+index_can_skip(Relation indexRelation)
+{
+ FmgrInfo *procedure;
+
+ RELATION_CHECKS;
+
+ /* amcanskip is optional; assume FALSE if not provided by AM */
+ if (!RegProcedureIsValid(indexRelation->rd_am->amcanskip))
+ return false;
+
+ GET_REL_PROCEDURE(amcanskip);
+
+ return DatumGetBool(FunctionCall1(procedure,
+ PointerGetDatum(indexRelation)));
+}
+
+bool
+index_skip(IndexScanDesc scan, ScanDirection direction, int prefix)
+{
+ FmgrInfo *procedure;
+ SCAN_CHECKS;
+ GET_SCAN_PROCEDURE(amskip);
+ Datum result =
+ DatumGetPointer(FunctionCall3(procedure,
+ PointerGetDatum(scan),
+ Int32GetDatum(direction),
+ Int32GetDatum(prefix)));
+ return DatumGetBool(result);
+}
+
/* ----------------
* index_getprocid
*
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 36dc6c2..2f987e4 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -434,6 +434,8 @@ btbeginscan(PG_FUNCTION_ARGS)
so->currPos.nextTupleOffset = 0;
so->markPos.nextTupleOffset = 0;
+ so->skipScanKey = NULL;
+
scan->xs_itupdesc = RelationGetDescr(rel);
scan->opaque = so;
@@ -509,6 +511,19 @@ btrescan(PG_FUNCTION_ARGS)
}
/*
+ * btskip() -- skip to the beginning of the next key prefix
+ */
+Datum
+btskip(PG_FUNCTION_ARGS)
+{
+ IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+ ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
+ int prefix = PG_GETARG_INT32(2);
+ bool result = _bt_skip(scan, dir, prefix);
+ PG_RETURN_BOOL(result);
+}
+
+/*
* btendscan() -- close down a scan
*/
Datum
@@ -1118,3 +1133,12 @@ btcanreturn(PG_FUNCTION_ARGS)
{
PG_RETURN_BOOL(true);
}
+
+/*
+ * btcanskip() -- Check whether btree indexes support skipping.
+ */
+Datum
+btcanskip(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_BOOL(true);
+}
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 203b969..b88f25a 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1082,6 +1082,84 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
}
/*
+ * _bt_skip() -- Skip items that have the same prefix as the current one.
+ */
+bool
+_bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix)
+{
+ /* TODO:TM For now, we use _bt_search to search from the root; in theory
+ * we should be able to do a local traversal ie from the current page, but
+ * I don't know if it would actually be better in general */
+
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ BTStack stack;
+ Buffer buf;
+ OffsetNumber offnum;
+ BTScanPosItem *currItem;
+
+ if (!scan->xs_want_itup)
+ elog(ERROR, "_bt_skip cannot skip if not returning tuples");
+ if (!scan->xs_itup)
+ elog(ERROR, "_bt_skip cannot skip if a tuple has not been fetched yet");
+ /*elog(NOTICE, "_bt_skip");*/
+
+ if (BTScanPosIsValid(so->currPos))
+ {
+ ReleaseBuffer(so->currPos.buf);
+ so->currPos.buf = InvalidBuffer;
+ }
+
+ /*
+ * TODO:TM lazily call _bt_mkscankey the first time and then just update
+ * the values in so->skipScanKey each time after that, instead of the repeated
+ * free/realloc
+ */
+ if (so->skipScanKey != NULL)
+ _bt_freeskey(so->skipScanKey);
+ so->skipScanKey = _bt_mkscankey(scan->indexRelation, scan->xs_itup);
+
+ /* TODO:TM share code with _bt_search? Much of this copied from there
+ * without good understanding of what is going on! */
+
+ stack =_bt_search(scan->indexRelation, prefix, so->skipScanKey, true, &buf, BT_READ);
+ _bt_freestack(stack);
+ so->currPos.buf = buf;
+ /*
+ * TODO:TM this doesn't work when we're scanning backwards! (ie with an
+ * ORDER BY ... DESC)
+ */
+ offnum = _bt_binsrch(scan->indexRelation, buf, prefix, so->skipScanKey, true);
+ PredicateLockPage(scan->indexRelation, BufferGetBlockNumber(buf),
+ scan->xs_snapshot);
+ if (ScanDirectionIsForward(dir))
+ {
+ so->currPos.moreLeft = false;
+ so->currPos.moreRight = true;
+ }
+ else
+ {
+ so->currPos.moreLeft = true;
+ so->currPos.moreRight = false;
+ }
+
+ if (!_bt_readpage(scan, dir, offnum))
+ {
+ if (!_bt_steppage(scan, dir))
+ {
+ /* elog(NOTICE, "reached the end"); */
+ return false;
+ }
+ }
+ LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+ currItem = &so->currPos.items[so->currPos.itemIndex];
+ scan->xs_ctup.t_self = currItem->heapTid;
+ if (scan->xs_want_itup)
+ scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+ return true;
+}
+
+/*
* _bt_readpage() -- Load data from current index page into so->currPos
*
* Caller must have pinned and read-locked so->currPos.buf; the buffer's state
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 0d9663c..332cf02 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -1065,7 +1065,20 @@ ExplainNode(PlanState *planstate, List *ancestors,
case T_IndexOnlyScan:
{
IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan;
-
+ if (indexonlyscan->distinctPrefix > 0)
+ {
+ /*
+ * TODO:TM show "for distinct prefix (a, b)" instead of
+ * just the column count?
+ */
+ if (es->format == EXPLAIN_FORMAT_TEXT)
+ appendStringInfo(es->str, " for distinct prefix %d",
+ indexonlyscan->distinctPrefix);
+ else
+ ExplainPropertyInteger("Distinct Prefix",
+ indexonlyscan->distinctPrefix,
+ es);
+ }
ExplainIndexScanDetails(indexonlyscan->indexid,
indexonlyscan->indexorderdir,
es);
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index afcd1ff..d46de44 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -74,6 +74,21 @@ IndexOnlyNext(IndexOnlyScanState *node)
slot = node->ss.ss_ScanTupleSlot;
/*
+ * Check if we need to skip to the next key prefix, because we've been
+ * asked to implement DISTINCT.
+ */
+ if (node->ioss_NumDistinctKeys > 0 && node->ioss_FirstTupleEmitted)
+ {
+ /*elog(NOTICE, "I WOULD LIKE TO SKIP");*/
+ if (!index_skip(scandesc, direction, node->ioss_NumDistinctKeys))
+ {
+ /* Reached end of index. */
+ /* TODO:TM is this appropriate cleanup? */
+ return ExecClearTuple(slot);
+ }
+ }
+
+ /*
* OK, now that we have what we need, fetch the next tuple.
*/
while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
@@ -176,6 +191,17 @@ IndexOnlyNext(IndexOnlyScanState *node)
PredicateLockPage(scandesc->heapRelation,
ItemPointerGetBlockNumber(tid),
estate->es_snapshot);
+ /*
+ * TODO:TM Do distinct scans break SSI? We don't visit every page
+ * anymore! But maybe that's OK, because we only "read" the fact that
+ * there is *at least one* row that has this value, so a conflicting
+ * write would be one that removes ALL rows having the same value, and
+ * therefore would touch this page that happens to hold the arbitrary
+ * row we looked at, in the case of a distinct skip scan. Does this
+ * make any sense?
+ */
+
+ node->ioss_FirstTupleEmitted = true;
return slot;
}
@@ -391,6 +417,8 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
indexstate->ss.ps.plan = (Plan *) node;
indexstate->ss.ps.state = estate;
indexstate->ioss_HeapFetches = 0;
+ indexstate->ioss_NumDistinctKeys = node->distinctPrefix;
+ indexstate->ioss_FirstTupleEmitted = false;
/*
* Miscellaneous initialization
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 8d3d5a7..2bca8fe 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -398,6 +398,7 @@ _copyIndexOnlyScan(const IndexOnlyScan *from)
COPY_NODE_FIELD(indexorderby);
COPY_NODE_FIELD(indextlist);
COPY_SCALAR_FIELD(indexorderdir);
+ COPY_SCALAR_FIELD(distinctPrefix);
return newnode;
}
diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index 42dcb11..3908899 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -982,6 +982,10 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
index_only_scan = (scantype != ST_BITMAPSCAN &&
check_index_only(rel, index));
+ /* TODO:TM -- if index_only_scan is true, AND we want DISTINCT, then
+ * ... perhaps this is the right place to check if we can use an index
+ * only scan in distinct mode? */
+
/*
* 4. Generate an indexscan path if there are relevant restriction clauses
* in the current clauses, OR the index ordering is potentially useful for
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index f2c9c99..7e1c625 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -1934,10 +1934,30 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
result_plan,
current_pathkeys,
-1.0);
+ result_plan = (Plan *) make_unique(result_plan,
+ parse->distinctClause);
+ }
+ else if IsA(result_plan, IndexOnlyScan)
+ {
+ /*
+ * The plan is appropriately sorted, and we can ask it to
+ * produce unique values by skipping.
+ *
+ * TODO:TM This is a quick and dirty hack to get something
+ * working by modifying the plan, but really we probably need
+ * help planning this much earlier, possibly in
+ * build_index_paths.
+ */
+ IndexOnlyScan *ios = (IndexOnlyScan *) result_plan;
+ /*elog(NOTICE, "using skip to implement DISTINCT");*/
+ ios->distinctPrefix = list_length(root->distinct_pathkeys);
+ }
+ else
+ {
+ /* The plan is appropriately sorted, but contains duplicates. */
+ result_plan = (Plan *) make_unique(result_plan,
+ parse->distinctClause);
}
-
- result_plan = (Plan *) make_unique(result_plan,
- parse->distinctClause);
result_plan->plan_rows = dNumDistinctRows;
/* The Unique node won't change sort ordering */
}
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index d99158f..349097d 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -157,6 +157,8 @@ extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info,
extern IndexBulkDeleteResult *index_vacuum_cleanup(IndexVacuumInfo *info,
IndexBulkDeleteResult *stats);
extern bool index_can_return(Relation indexRelation);
+extern bool index_can_skip(Relation indexRelation);
+extern bool index_skip(IndexScanDesc scan, ScanDirection direction, int prefix);
extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum,
uint16 procnum);
extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum,
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index ed6f697..0cc762a 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -606,6 +606,9 @@ typedef struct BTScanOpaqueData
/* keep these last in struct for efficiency */
BTScanPosData currPos; /* current position data */
BTScanPosData markPos; /* marked position, if any */
+
+ /* workspace for _bt_skip */
+ ScanKey skipScanKey; /* used to control skipping */
} BTScanOpaqueData;
typedef BTScanOpaqueData *BTScanOpaque;
@@ -638,6 +641,8 @@ extern Datum btbulkdelete(PG_FUNCTION_ARGS);
extern Datum btvacuumcleanup(PG_FUNCTION_ARGS);
extern Datum btcanreturn(PG_FUNCTION_ARGS);
extern Datum btoptions(PG_FUNCTION_ARGS);
+extern Datum btcanskip(PG_FUNCTION_ARGS);
+extern Datum btskip(PG_FUNCTION_ARGS);
/*
* prototypes for functions in nbtinsert.c
@@ -684,6 +689,8 @@ extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
+extern bool _bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix);
+
/*
* prototypes for functions in nbtutils.c
diff --git a/src/include/catalog/pg_am.h b/src/include/catalog/pg_am.h
index 759ea70..b3d835e 100644
--- a/src/include/catalog/pg_am.h
+++ b/src/include/catalog/pg_am.h
@@ -67,6 +67,8 @@ CATALOG(pg_am,2601)
regproc amcanreturn; /* can indexscan return IndexTuples? */
regproc amcostestimate; /* estimate cost of an indexscan */
regproc amoptions; /* parse AM-specific parameters */
+ regproc amcanskip; /* can indexscan skip to next prefix value? */
+ regproc amskip; /* skip to next prefix value function */
} FormData_pg_am;
/* ----------------
@@ -80,7 +82,7 @@ typedef FormData_pg_am *Form_pg_am;
* compiler constants for pg_am
* ----------------
*/
-#define Natts_pg_am 30
+#define Natts_pg_am 32
#define Anum_pg_am_amname 1
#define Anum_pg_am_amstrategies 2
#define Anum_pg_am_amsupport 3
@@ -111,25 +113,27 @@ typedef FormData_pg_am *Form_pg_am;
#define Anum_pg_am_amcanreturn 28
#define Anum_pg_am_amcostestimate 29
#define Anum_pg_am_amoptions 30
+#define Anum_pg_am_amcanskip 31
+#define Anum_pg_am_amskip 32
/* ----------------
* initial contents of pg_am
* ----------------
*/
-DATA(insert OID = 403 ( btree 5 2 t f t t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbuildempty btbulkdelete btvacuumcleanup btcanreturn btcostestimate btoptions ));
+DATA(insert OID = 403 ( btree 5 2 t f t t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbuildempty btbulkdelete btvacuumcleanup btcanreturn btcostestimate btoptions btcanskip btskip ));
DESCR("b-tree index access method");
#define BTREE_AM_OID 403
-DATA(insert OID = 405 ( hash 1 1 f f t f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbuildempty hashbulkdelete hashvacuumcleanup - hashcostestimate hashoptions ));
+DATA(insert OID = 405 ( hash 1 1 f f t f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbuildempty hashbulkdelete hashvacuumcleanup - hashcostestimate hashoptions - - ));
DESCR("hash index access method");
#define HASH_AM_OID 405
-DATA(insert OID = 783 ( gist 0 8 f t f f t t f t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbuildempty gistbulkdelete gistvacuumcleanup - gistcostestimate gistoptions ));
+DATA(insert OID = 783 ( gist 0 8 f t f f t t f t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbuildempty gistbulkdelete gistvacuumcleanup - gistcostestimate gistoptions - - ));
DESCR("GiST index access method");
#define GIST_AM_OID 783
-DATA(insert OID = 2742 ( gin 0 6 f f f f t t f f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbuildempty ginbulkdelete ginvacuumcleanup - gincostestimate ginoptions ));
+DATA(insert OID = 2742 ( gin 0 6 f f f f t t f f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbuildempty ginbulkdelete ginvacuumcleanup - gincostestimate ginoptions - - ));
DESCR("GIN index access method");
#define GIN_AM_OID 2742
-DATA(insert OID = 4000 ( spgist 0 5 f f f f f t f t f f f 0 spginsert spgbeginscan spggettuple spggetbitmap spgrescan spgendscan spgmarkpos spgrestrpos spgbuild spgbuildempty spgbulkdelete spgvacuumcleanup spgcanreturn spgcostestimate spgoptions ));
+DATA(insert OID = 4000 ( spgist 0 5 f f f f f t f t f f f 0 spginsert spgbeginscan spggettuple spggetbitmap spgrescan spgendscan spgmarkpos spgrestrpos spgbuild spgbuildempty spgbulkdelete spgvacuumcleanup spgcanreturn spgcostestimate spgoptions - - ));
DESCR("SP-GiST index access method");
#define SPGIST_AM_OID 4000
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index f44085c..792ec5b 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -564,6 +564,10 @@ DATA(insert OID = 1268 ( btcostestimate PGNSP PGUID 12 1 0 0 0 f f f f t f v
DESCR("btree(internal)");
DATA(insert OID = 2785 ( btoptions PGNSP PGUID 12 1 0 0 0 f f f f t f s 2 0 17 "1009 16" _null_ _null_ _null_ _null_ btoptions _null_ _null_ _null_ ));
DESCR("btree(internal)");
+DATA(insert OID = 4051 ( btcanskip PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 16 "2281" _null_ _null_ _null_ _null_ btcanskip _null_ _null_ _null_ ));
+DESCR("btree(internal)");
+DATA(insert OID = 4052 ( btskip PGNSP PGUID 12 1 0 0 0 f f f f t f v 3 0 16 "2281 2281 2281" _null_ _null_ _null_ _null_ btskip _null_ _null_ _null_ ));
+DESCR("btree(internal)");
DATA(insert OID = 339 ( poly_same PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_same _null_ _null_ _null_ ));
DATA(insert OID = 340 ( poly_contain PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_contain _null_ _null_ _null_ ));
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index b271f21..9c64c81 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1280,6 +1280,7 @@ typedef struct IndexScanState
* ScanDesc index scan descriptor
* VMBuffer buffer in use for visibility map testing, if any
* HeapFetches number of tuples we were forced to fetch from heap
+ * NumDistinctKeys number of keys for skip-based DISTINCT
* ----------------
*/
typedef struct IndexOnlyScanState
@@ -1298,6 +1299,8 @@ typedef struct IndexOnlyScanState
IndexScanDesc ioss_ScanDesc;
Buffer ioss_VMBuffer;
long ioss_HeapFetches;
+ int ioss_NumDistinctKeys;
+ bool ioss_FirstTupleEmitted;
} IndexOnlyScanState;
/* ----------------
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 3b9c683..6ae992a 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -340,6 +340,7 @@ typedef struct IndexOnlyScan
List *indexorderby; /* list of index ORDER BY exprs */
List *indextlist; /* TargetEntry list describing index's cols */
ScanDirection indexorderdir; /* forward or backward or don't care */
+ int distinctPrefix; /* the size of the prefix for distinct scans */
} IndexOnlyScan;
/* ----------------
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index af4f53f..ecc72f2 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -61,6 +61,8 @@ typedef struct RelationAmInfo
FmgrInfo ammarkpos;
FmgrInfo amrestrpos;
FmgrInfo amcanreturn;
+ FmgrInfo amcanskip;
+ FmgrInfo amskip;
} RelationAmInfo;
On 07/05/2014 02:17 AM, Thomas Munro wrote:
As an exercise I hacked up the simplest code I could think of that would
demonstrate a faster DISTINCT based on skipping ahead to the next
distinct value in an index-only scan. Please see the attached (extremely
buggy) patch, and the example session below. (It's against my natural
instinct to send such half-baked early hacking phase code to the list,
but thought it would make sense to demo the concept and then seek
advice, warnings, cease and desist notices etc before pressing on down
that route!) I would be most grateful for any feedback you might have.
This is called a Loose Index Scan in our wiki[1], which I believe is
based on the terminology used for the same feature in MySQL, although I
haven't and shan't check.
This is a feature I would really like to have.
Your benchmarks look promising but unfortunately it blows up for me so I
can't really test it. I have not yet read the patch nor debugged to see
why, but please do keep up work on this. People more expert than I can
tell you whether you're implementing it the right way.
[1]: http://wiki.postgresql.org/wiki/Loose_indexscan
--
Vik
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 5 July 2014 02:03, Vik Fearing <vik.fearing@dalibo.com> wrote:
Thanks. It talks about DISTINCT, and also about using index when you
don't have the leading column in your WHERE clause (as well as MySQL
("loose"), at least Oracle ("skip"), SQLite ("skip"), DB2 ("jump") can
do this). It looks like at least MySQL can also use loose index scans
to implement GROUP BY in certain cases involving MIN or MAX aggregate
functions (things like SELECT a, MIN(b) FROM foo GROUP BY a, given an
index on (a, b)).
But I'm only trying to implement the lowest hanging index skipping
plan: plain old DISTINCT. I think I see roughly how to cost, plan and
execute it... now to learn a lot more about PG internals...
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Sat, Jul 5, 2014 at 12:17 PM, Thomas Munro <munro@ip9.org> wrote:
postgres=# set enable_hashagg = false;
SET
Time: 0.302 ms
postgres=# explain select distinct a from foo;
┌─────────────────────────────────────────────────────────────────────────────────────────────────────┐
│                                             QUERY PLAN                                              │
├─────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ Index Only Scan for distinct prefix 1 using foo_pkey on foo  (cost=0.43..263354.20 rows=10 width=4) │
│ Planning time: 0.063 ms                                                                             │
└─────────────────────────────────────────────────────────────────────────────────────────────────────┘
(2 rows)
Time: 0.443 ms
postgres=# select distinct a from foo;
┌───┐
│ a │
├───┤
│ 0 │
│ 1 │
│ 2 │
│ 3 │
│ 4 │
│ 5 │
│ 6 │
│ 7 │
│ 8 │
│ 9 │
└───┘
(10 rows)
Time: 0.565 ms
Hi Thomas,
I've had a quick look at this and it seems like a great win! I'm quite
surprised that we've not got this already. I think this technology could
also really help performance of queries such as SELECT * from bigtable bt
WHERE EXISTS(SELECT 1 FROM otherbigtable obt WHERE bt.somecol =
obt.someIndexedButNonUniqueColumn); I know you're not proposing to improve
that first off, but it could be done later once the benefits of this are
more commonly realised.
I think our shortfalls in this area have not gone unnoticed. I was reading
this post
https://www.periscope.io/blog/count-distinct-in-mysql-postgres-sql-server-and-oracle.html
about comparing performance of COUNT(DISTINCT col). I think this would give
a big win for query 3 in that post. I'm trying to find some time make some
changes to transform queries to allow the group by to happen before the
joins when possible, so between that and your patch we'd be 2 steps closer
to making query 1 in the link above perform a little better on PostgreSQL.
Do you think you'll manage to get time to look at this a bit more? I'd be
keen to look into the costing side of it if you think that'll help you any?
Regards
David Rowley
On 27 October 2014 20:24, David Rowley <dgrowleyml@gmail.com> wrote:
I've had a quick look at this and it seems like a great win! I'm quite
surprised that we've not got this already. I think this technology could
also really help performance of queries such as SELECT * from bigtable bt
WHERE EXISTS(SELECT 1 FROM otherbigtable obt WHERE bt.somecol =
obt.someIndexedButNonUniqueColumn); I know you're not proposing to improve
that first off, but it could be done later once the benefits of this are
more commonly realised.
I think our shortfalls in this area have not gone unnoticed. I was reading
this post
https://www.periscope.io/blog/count-distinct-in-mysql-postgres-sql-server-and-oracle.html
about comparing performance of COUNT(DISTINCT col). I think this would give
a big win for query 3 in that post. I'm trying to find some time make some
changes to transform queries to allow the group by to happen before the
joins when possible, so between that and your patch we'd be 2 steps closer
to making query 1 in the link above perform a little better on PostgreSQL.
Do you think you'll manage to get time to look at this a bit more? I'd be
keen to look into the costing side of it if you think that'll help you any?
Hi David,
Sorry for the silence, I have been busy moving countries.
I am definitely interested in collaborating on a series of patches to
implement various kinds of skip-based plans as seen in other RDBMSs if
others think it could be useful. I see skip-based DISTINCT as a good
place to start. (I suspect the semi-related skip scan plan (for the
case when your WHERE clause doesn't have a qual for the leading
column(s)) is much harder to plan and execute and I also suspect it's
less generally useful).
Here is a rebased version of that patch which fixes a crashing
off-by-one error, and handles backward scans properly, I think. This
is still just a quick hack to play with the general idea at this
stage.
It works by adding a new index operation 'skip' which the executor
code can use during a scan to advance to the next value (for some
prefix of the index's columns). That may be a terrible idea and
totally unnecessary... but let me explain my
reasoning:
1. Perhaps some index implementations can do something better than a
search for the next key value from the root. Is it possible or
desirable to use the current position as a starting point for a btree
traversal? I don't know.
2. It seemed that I'd need to create a new search ScanKey to use the
'rescan' interface for skipping to the next value, but I already had
an insertion ScanKey so I wanted a way to just reuse that. But maybe
there is some other way to reuse existing index interfaces, or maybe
there is an easy way to make a new search ScanKey from the existing
insertion ScanKey?
Currently it uses the existing IndexOnlyScan plan node, adding an
extra member. I briefly tried making a different plan called
DistinctIndexScan but it seemed I was going to need to duplicate or
refactor/share a lot of IndexOnlyScan code (this might be the right
thing to do but at this stage I'm just trying to demo the general idea
with minimal changes).
As for costing, I haven't looked at that part of PG at all yet. If
you *can* do a distinct index scan, would you *always* want to? How
well could we estimate the cost using existing statistics? I suppose
the expected number of page fetches is proportional to the expected
number of distinct values (ie skip operations) times btree depth, and
the cost may be adjusted in some way for cache effects (?), but how
well can we estimate the number of distinct values (a), (a, b) or (a,
b, c) in some range for an index on (a, b, c, d)?
As before, you still need the kludge of disabling hashagg to reach the new plan:
postgres=# set enable_hashagg = true;
SET
Time: 0.213 ms
postgres=# select distinct a from foo;
┌───┐
│ a │
├───┤
│ 6 │
│ 8 │
│ 1 │
│ 2 │
│ 3 │
│ 4 │
│ 5 │
│ 9 │
│ 7 │
│ 0 │
└───┘
(10 rows)
Time: 890.166 ms
postgres=# set enable_hashagg = false;
SET
Time: 0.271 ms
postgres=# select distinct a from foo;
┌───┐
│ a │
├───┤
│ 0 │
│ 1 │
│ 2 │
│ 3 │
│ 4 │
│ 5 │
│ 6 │
│ 7 │
│ 8 │
│ 9 │
└───┘
(10 rows)
Time: 0.573 ms
Best regards,
Thomas Munro
Attachments:
distinct-skip-v2.patch (text/x-patch; charset=US-ASCII; name=distinct-skip-v2.patch)
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 53cf96f..5f10d7f 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -29,6 +29,7 @@
* index_can_return - does index support index-only scans?
* index_getprocid - get a support procedure OID
* index_getprocinfo - get a support procedure's lookup info
+ * index_skip - advance to the next key value in a scan
*
* NOTES
* This file contains the index_ routines which used
@@ -742,6 +743,37 @@ index_can_return(Relation indexRelation)
PointerGetDatum(indexRelation)));
}
+bool
+index_can_skip(Relation indexRelation)
+{
+ FmgrInfo *procedure;
+
+ RELATION_CHECKS;
+
+ /* amcanskip is optional; assume FALSE if not provided by AM */
+ if (!RegProcedureIsValid(indexRelation->rd_am->amcanskip))
+ return false;
+
+ GET_REL_PROCEDURE(amcanskip);
+
+ return DatumGetBool(FunctionCall1(procedure,
+ PointerGetDatum(indexRelation)));
+}
+
+bool
+index_skip(IndexScanDesc scan, ScanDirection direction, int prefix)
+{
+ FmgrInfo *procedure;
+ SCAN_CHECKS;
+ GET_SCAN_PROCEDURE(amskip);
+ Datum result =
+ DatumGetPointer(FunctionCall3(procedure,
+ PointerGetDatum(scan),
+ Int32GetDatum(direction),
+ Int32GetDatum(prefix)));
+ return DatumGetBool(result);
+}
+
/* ----------------
* index_getprocid
*
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 117b18e..bc09ea1 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -426,6 +426,8 @@ btbeginscan(PG_FUNCTION_ARGS)
so->currPos.nextTupleOffset = 0;
so->markPos.nextTupleOffset = 0;
+ so->skipScanKey = NULL;
+
scan->xs_itupdesc = RelationGetDescr(rel);
scan->opaque = so;
@@ -501,6 +503,19 @@ btrescan(PG_FUNCTION_ARGS)
}
/*
+ * btskip() -- skip to the beginning of the next key prefix
+ */
+Datum
+btskip(PG_FUNCTION_ARGS)
+{
+ IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
+ ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
+ int prefix = PG_GETARG_INT32(2);
+ bool result = _bt_skip(scan, dir, prefix);
+ PG_RETURN_BOOL(result);
+}
+
+/*
* btendscan() -- close down a scan
*/
Datum
@@ -1110,3 +1125,12 @@ btcanreturn(PG_FUNCTION_ARGS)
{
PG_RETURN_BOOL(true);
}
+
+/*
+ * btcanskip() -- Check whether btree indexes support skipping.
+ */
+Datum
+btcanskip(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_BOOL(true);
+}
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 203b969..8aaeb69 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1082,6 +1082,90 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
}
/*
+ * _bt_skip() -- Skip items that have the same prefix as the most recently
+ * fetched index tuple. The current position is set so that a subsequent call
+ * to _bt_next will fetch the first tuple that differs in the leading 'prefix'
+ * keys.
+ *
+ * TODO:TM The 'step back one tuple' behaviour is only necessary because I
+ * wanted the nodeIndexonlyscan code to be able to use its existing
+ * tuple-fetch-visibility-check loop without a special case for the first
+ * iteration. An alternative would be for skip to actually fetch the desired
+ * tuple immediately (ie without a subsequent call to _bt_next).
+ */
+bool
+_bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix)
+{
+ /* TODO:TM For now, we use _bt_search to search from the root; in theory
+ * we should be able to do a local traversal ie from the current page, but
+ * I don't know if it would actually be better in general */
+
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ BTStack stack;
+ Buffer buf;
+ OffsetNumber offnum;
+ BTScanPosItem *currItem;
+
+ if (!scan->xs_want_itup)
+ elog(ERROR, "_bt_skip cannot skip if not returning tuples");
+ if (!scan->xs_itup)
+ elog(ERROR, "_bt_skip cannot skip if a tuple has not been fetched yet");
+ /*elog(NOTICE, "_bt_skip");*/
+
+ if (BTScanPosIsValid(so->currPos))
+ {
+ ReleaseBuffer(so->currPos.buf);
+ so->currPos.buf = InvalidBuffer;
+ }
+
+ /*
+ * TODO:TM lazily call _bt_mkscankey the first time and then just update
+ * the values in so->skipScanKey each time after that, instead of the repeated
+ * free/realloc
+ */
+ if (so->skipScanKey != NULL)
+ _bt_freeskey(so->skipScanKey);
+ so->skipScanKey = _bt_mkscankey(scan->indexRelation, scan->xs_itup);
+
+ /* TODO:TM share code with _bt_search? Much of this copied from there
+ * without good understanding of what is going on! */
+
+ stack =_bt_search(scan->indexRelation, prefix, so->skipScanKey,
+ ScanDirectionIsForward(dir), &buf, BT_READ);
+ _bt_freestack(stack);
+ so->currPos.buf = buf;
+ offnum = _bt_binsrch(scan->indexRelation, buf, prefix, so->skipScanKey,
+ ScanDirectionIsForward(dir));
+ PredicateLockPage(scan->indexRelation, BufferGetBlockNumber(buf),
+ scan->xs_snapshot);
+ if (ScanDirectionIsForward(dir))
+ {
+ so->currPos.moreLeft = false;
+ so->currPos.moreRight = true;
+ }
+ else
+ {
+ so->currPos.moreLeft = true;
+ so->currPos.moreRight = false;
+ }
+ if (ScanDirectionIsForward(dir))
+ offnum = OffsetNumberPrev(offnum);
+ if (!_bt_readpage(scan, dir, offnum))
+ {
+ if (!_bt_steppage(scan, dir))
+ {
+ return false;
+ }
+ }
+ LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+ currItem = &so->currPos.items[so->currPos.itemIndex];
+ scan->xs_ctup.t_self = currItem->heapTid;
+ if (scan->xs_want_itup)
+ scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+ return true;
+}
+
+/*
* _bt_readpage() -- Load data from current index page into so->currPos
*
* Caller must have pinned and read-locked so->currPos.buf; the buffer's state
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 781a736..d94146b 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -1065,7 +1065,20 @@ ExplainNode(PlanState *planstate, List *ancestors,
case T_IndexOnlyScan:
{
IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan;
-
+ if (indexonlyscan->distinctPrefix > 0)
+ {
+ /*
+ * TODO:TM show "for distinct prefix (a, b)" instead of
+ * just the column count?
+ */
+ if (es->format == EXPLAIN_FORMAT_TEXT)
+ appendStringInfo(es->str, " for distinct %d column(s)",
+ indexonlyscan->distinctPrefix);
+ else
+ ExplainPropertyInteger("Distinct Prefix",
+ indexonlyscan->distinctPrefix,
+ es);
+ }
ExplainIndexScanDetails(indexonlyscan->indexid,
indexonlyscan->indexorderdir,
es);
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index afcd1ff..d46de44 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -74,6 +74,21 @@ IndexOnlyNext(IndexOnlyScanState *node)
slot = node->ss.ss_ScanTupleSlot;
/*
+ * Check if we need to skip to the next key prefix, because we've been
+ * asked to implement DISTINCT.
+ */
+ if (node->ioss_NumDistinctKeys > 0 && node->ioss_FirstTupleEmitted)
+ {
+ /*elog(NOTICE, "I WOULD LIKE TO SKIP");*/
+ if (!index_skip(scandesc, direction, node->ioss_NumDistinctKeys))
+ {
+ /* Reached end of index. */
+ /* TODO:TM is this appropriate cleanup? */
+ return ExecClearTuple(slot);
+ }
+ }
+
+ /*
* OK, now that we have what we need, fetch the next tuple.
*/
while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
@@ -176,6 +191,17 @@ IndexOnlyNext(IndexOnlyScanState *node)
PredicateLockPage(scandesc->heapRelation,
ItemPointerGetBlockNumber(tid),
estate->es_snapshot);
+ /*
+ * TODO:TM Do distinct scans break SSI? We don't visit every page
+ * anymore! But maybe that's OK, because we only "read" the fact that
+ * there is *at least one* row that has this value, so a conflicting
+ * write would be one that removes ALL rows having the same value, and
+ * therefore would touch this page that happens to hold the arbitrary
+ * row we looked at, in the case of a distinct skip scan. Does this
+ * make any sense?
+ */
+
+ node->ioss_FirstTupleEmitted = true;
return slot;
}
@@ -391,6 +417,8 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
indexstate->ss.ps.plan = (Plan *) node;
indexstate->ss.ps.state = estate;
indexstate->ioss_HeapFetches = 0;
+ indexstate->ioss_NumDistinctKeys = node->distinctPrefix;
+ indexstate->ioss_FirstTupleEmitted = false;
/*
* Miscellaneous initialization
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index aa053a0..fe351fd5 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -398,6 +398,7 @@ _copyIndexOnlyScan(const IndexOnlyScan *from)
COPY_NODE_FIELD(indexorderby);
COPY_NODE_FIELD(indextlist);
COPY_SCALAR_FIELD(indexorderdir);
+ COPY_SCALAR_FIELD(distinctPrefix);
return newnode;
}
diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index 42dcb11..3908899 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -982,6 +982,10 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
index_only_scan = (scantype != ST_BITMAPSCAN &&
check_index_only(rel, index));
+ /* TODO:TM -- if index_only_scan is true, AND we want DISTINCT, then
+ * ... perhaps this is the right place to check if we can use an index
+ * only scan in distinct mode? */
+
/*
* 4. Generate an indexscan path if there are relevant restriction clauses
* in the current clauses, OR the index ordering is potentially useful for
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index e1480cd..46105ca 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -1934,10 +1934,30 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
result_plan,
current_pathkeys,
-1.0);
+ result_plan = (Plan *) make_unique(result_plan,
+ parse->distinctClause);
+ }
+ else if IsA(result_plan, IndexOnlyScan)
+ {
+ /*
+ * The plan is appropriately sorted, and we can ask it to
+ * produce unique values by skipping.
+ *
+ * TODO:TM This is a quick and dirty hack to get something
+ * working by modifying the plan, but really we probably need
+ * help planning this much earlier, possibly in
+ * build_index_paths.
+ */
+ IndexOnlyScan *ios = (IndexOnlyScan *) result_plan;
+ /*elog(NOTICE, "using skip to implement DISTINCT");*/
+ ios->distinctPrefix = list_length(root->distinct_pathkeys);
+ }
+ else
+ {
+ /* The plan is appropriately sorted, but contains duplicates. */
+ result_plan = (Plan *) make_unique(result_plan,
+ parse->distinctClause);
}
-
- result_plan = (Plan *) make_unique(result_plan,
- parse->distinctClause);
result_plan->plan_rows = dNumDistinctRows;
/* The Unique node won't change sort ordering */
}
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index d99158f..349097d 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -157,6 +157,8 @@ extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info,
extern IndexBulkDeleteResult *index_vacuum_cleanup(IndexVacuumInfo *info,
IndexBulkDeleteResult *stats);
extern bool index_can_return(Relation indexRelation);
+extern bool index_can_skip(Relation indexRelation);
+extern bool index_skip(IndexScanDesc scan, ScanDirection direction, int prefix);
extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum,
uint16 procnum);
extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum,
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 9fa943f..0a2a693 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -606,6 +606,9 @@ typedef struct BTScanOpaqueData
/* keep these last in struct for efficiency */
BTScanPosData currPos; /* current position data */
BTScanPosData markPos; /* marked position, if any */
+
+ /* workspace for _bt_skip */
+ ScanKey skipScanKey; /* used to control skipping */
} BTScanOpaqueData;
typedef BTScanOpaqueData *BTScanOpaque;
@@ -638,6 +641,8 @@ extern Datum btbulkdelete(PG_FUNCTION_ARGS);
extern Datum btvacuumcleanup(PG_FUNCTION_ARGS);
extern Datum btcanreturn(PG_FUNCTION_ARGS);
extern Datum btoptions(PG_FUNCTION_ARGS);
+extern Datum btcanskip(PG_FUNCTION_ARGS);
+extern Datum btskip(PG_FUNCTION_ARGS);
/*
* prototypes for functions in nbtinsert.c
@@ -684,6 +689,8 @@ extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
+extern bool _bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix);
+
/*
* prototypes for functions in nbtutils.c
diff --git a/src/include/catalog/pg_am.h b/src/include/catalog/pg_am.h
index 759ea70..b3d835e 100644
--- a/src/include/catalog/pg_am.h
+++ b/src/include/catalog/pg_am.h
@@ -67,6 +67,8 @@ CATALOG(pg_am,2601)
regproc amcanreturn; /* can indexscan return IndexTuples? */
regproc amcostestimate; /* estimate cost of an indexscan */
regproc amoptions; /* parse AM-specific parameters */
+ regproc amcanskip; /* can indexscan skip to next prefix value? */
+ regproc amskip; /* skip to next prefix value function */
} FormData_pg_am;
/* ----------------
@@ -80,7 +82,7 @@ typedef FormData_pg_am *Form_pg_am;
* compiler constants for pg_am
* ----------------
*/
-#define Natts_pg_am 30
+#define Natts_pg_am 32
#define Anum_pg_am_amname 1
#define Anum_pg_am_amstrategies 2
#define Anum_pg_am_amsupport 3
@@ -111,25 +113,27 @@ typedef FormData_pg_am *Form_pg_am;
#define Anum_pg_am_amcanreturn 28
#define Anum_pg_am_amcostestimate 29
#define Anum_pg_am_amoptions 30
+#define Anum_pg_am_amcanskip 31
+#define Anum_pg_am_amskip 32
/* ----------------
* initial contents of pg_am
* ----------------
*/
-DATA(insert OID = 403 ( btree 5 2 t f t t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbuildempty btbulkdelete btvacuumcleanup btcanreturn btcostestimate btoptions ));
+DATA(insert OID = 403 ( btree 5 2 t f t t t t t t f t t 0 btinsert btbeginscan btgettuple btgetbitmap btrescan btendscan btmarkpos btrestrpos btbuild btbuildempty btbulkdelete btvacuumcleanup btcanreturn btcostestimate btoptions btcanskip btskip ));
DESCR("b-tree index access method");
#define BTREE_AM_OID 403
-DATA(insert OID = 405 ( hash 1 1 f f t f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbuildempty hashbulkdelete hashvacuumcleanup - hashcostestimate hashoptions ));
+DATA(insert OID = 405 ( hash 1 1 f f t f f f f f f f f 23 hashinsert hashbeginscan hashgettuple hashgetbitmap hashrescan hashendscan hashmarkpos hashrestrpos hashbuild hashbuildempty hashbulkdelete hashvacuumcleanup - hashcostestimate hashoptions - - ));
DESCR("hash index access method");
#define HASH_AM_OID 405
-DATA(insert OID = 783 ( gist 0 8 f t f f t t f t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbuildempty gistbulkdelete gistvacuumcleanup - gistcostestimate gistoptions ));
+DATA(insert OID = 783 ( gist 0 8 f t f f t t f t t t f 0 gistinsert gistbeginscan gistgettuple gistgetbitmap gistrescan gistendscan gistmarkpos gistrestrpos gistbuild gistbuildempty gistbulkdelete gistvacuumcleanup - gistcostestimate gistoptions - - ));
DESCR("GiST index access method");
#define GIST_AM_OID 783
-DATA(insert OID = 2742 ( gin 0 6 f f f f t t f f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbuildempty ginbulkdelete ginvacuumcleanup - gincostestimate ginoptions ));
+DATA(insert OID = 2742 ( gin 0 6 f f f f t t f f t f f 0 gininsert ginbeginscan - gingetbitmap ginrescan ginendscan ginmarkpos ginrestrpos ginbuild ginbuildempty ginbulkdelete ginvacuumcleanup - gincostestimate ginoptions - - ));
DESCR("GIN index access method");
#define GIN_AM_OID 2742
-DATA(insert OID = 4000 ( spgist 0 5 f f f f f t f t f f f 0 spginsert spgbeginscan spggettuple spggetbitmap spgrescan spgendscan spgmarkpos spgrestrpos spgbuild spgbuildempty spgbulkdelete spgvacuumcleanup spgcanreturn spgcostestimate spgoptions ));
+DATA(insert OID = 4000 ( spgist 0 5 f f f f f t f t f f f 0 spginsert spgbeginscan spggettuple spggetbitmap spgrescan spgendscan spgmarkpos spgrestrpos spgbuild spgbuildempty spgbulkdelete spgvacuumcleanup spgcanreturn spgcostestimate spgoptions - - ));
DESCR("SP-GiST index access method");
#define SPGIST_AM_OID 4000
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index a56a635..0d8e0d9 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -564,6 +564,10 @@ DATA(insert OID = 1268 ( btcostestimate PGNSP PGUID 12 1 0 0 0 f f f f t f v
DESCR("btree(internal)");
DATA(insert OID = 2785 ( btoptions PGNSP PGUID 12 1 0 0 0 f f f f t f s 2 0 17 "1009 16" _null_ _null_ _null_ _null_ btoptions _null_ _null_ _null_ ));
DESCR("btree(internal)");
+DATA(insert OID = 4051 ( btcanskip PGNSP PGUID 12 1 0 0 0 f f f f t f s 1 0 16 "2281" _null_ _null_ _null_ _null_ btcanskip _null_ _null_ _null_ ));
+DESCR("btree(internal)");
+DATA(insert OID = 4052 ( btskip PGNSP PGUID 12 1 0 0 0 f f f f t f v 3 0 16 "2281 2281 2281" _null_ _null_ _null_ _null_ btskip _null_ _null_ _null_ ));
+DESCR("btree(internal)");
DATA(insert OID = 339 ( poly_same PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_same _null_ _null_ _null_ ));
DATA(insert OID = 340 ( poly_contain PGNSP PGUID 12 1 0 0 0 f f f f t f i 2 0 16 "604 604" _null_ _null_ _null_ _null_ poly_contain _null_ _null_ _null_ ));
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index b271f21..9c64c81 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1280,6 +1280,7 @@ typedef struct IndexScanState
* ScanDesc index scan descriptor
* VMBuffer buffer in use for visibility map testing, if any
* HeapFetches number of tuples we were forced to fetch from heap
+ * NumDistinctKeys number of keys for skip-based DISTINCT
* ----------------
*/
typedef struct IndexOnlyScanState
@@ -1298,6 +1299,8 @@ typedef struct IndexOnlyScanState
IndexScanDesc ioss_ScanDesc;
Buffer ioss_VMBuffer;
long ioss_HeapFetches;
+ int ioss_NumDistinctKeys;
+ bool ioss_FirstTupleEmitted;
} IndexOnlyScanState;
/* ----------------
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 3b9c683..6ae992a 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -340,6 +340,7 @@ typedef struct IndexOnlyScan
List *indexorderby; /* list of index ORDER BY exprs */
List *indextlist; /* TargetEntry list describing index's cols */
ScanDirection indexorderdir; /* forward or backward or don't care */
+ int distinctPrefix; /* the size of the prefix for distinct scans */
} IndexOnlyScan;
/* ----------------
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index 37b6cbb..ce811f1 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -61,6 +61,8 @@ typedef struct RelationAmInfo
FmgrInfo ammarkpos;
FmgrInfo amrestrpos;
FmgrInfo amcanreturn;
+ FmgrInfo amcanskip;
+ FmgrInfo amskip;
} RelationAmInfo;
On Sat, Nov 1, 2014 at 10:35 AM, Thomas Munro <munro@ip9.org> wrote:
I am definitely interested in collaborating on a series of patches to
implement various kinds of skip-based plans as seen in other RDBMSs if
others think it could be useful. I see skip-based DISTINCT as a good
place to start. (I suspect the semi-related skip scan plan (for the
case when your WHERE clause doesn't have a qual for the leading
column(s)) is much harder to plan and execute and I also suspect it's
less generally useful).
There's an interesting thread about merge joins with a skip
optimisation[1] ("leapfrog trie-join", though I haven't read the paper
yet). That reminded me to go and rebase this experimental patch which
is somewhat related. Here it is, now split into two parts: one patch
to add an amskip operation, and another to consider using it to
implement DISTINCT when it was already has an index only scan path on
an index that supports skipping. As you can see it's still sketchy
experiment-grade code, but it now has a first attempt at costing.
postgres=# select count(a) from foo;
┌─────────┐
│ count │
├─────────┤
│ 5000000 │
└─────────┘
(1 row)
Time: 493.501 ms
postgres=# select distinct a from foo;
┌───┐
│ a │
├───┤
│ 0 │
│ 1 │
│ 2 │
│ 3 │
│ 4 │
│ 5 │
│ 6 │
│ 7 │
│ 8 │
│ 9 │
└───┘
(10 rows)
Time: 0.516 ms
postgres=# explain select distinct a from foo;
┌─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ QUERY PLAN
│
├─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ Index Only Scan for distinct values of leading 1 column(s) using
foo_pkey on foo (cost=0.43..4.33 rows=10 width=4) │
└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
(1 row)
Time: 0.378 ms
[1]: /messages/by-id/CAFQtOwp4e1hU6gKi1R_nHgOLTzUJko8EcaC1FdAV0rQFo1ARzA@mail.gmail.com
--
Thomas Munro
http://www.enterprisedb.com
Attachments:
0001-indexam-skip-v3.patchapplication/octet-stream; name=0001-indexam-skip-v3.patchDownload
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 54b71cb..bdd99f2 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -29,6 +29,7 @@
* index_can_return - does index support index-only scans?
* index_getprocid - get a support procedure OID
* index_getprocinfo - get a support procedure's lookup info
+ * index_skip - advance past duplicate key values in a scan
*
* NOTES
* This file contains the index_ routines which used
@@ -666,6 +667,21 @@ index_can_return(Relation indexRelation, int attno)
}
/* ----------------
+ * index_skip
+ *
+ * Skip past all tuples where the first 'prefix' columns have the
+ * same value as the last tuple returned in the current scan.
+ * ----------------
+ */
+bool
+index_skip(IndexScanDesc scan, ScanDirection direction, int prefix)
+{
+ SCAN_CHECKS;
+
+ return scan->indexRelation->rd_amroutine->amskip(scan, direction, prefix);
+}
+
+/* ----------------
* index_getprocid
*
* Index access methods typically require support routines that are
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 128744c..1a1cc6e 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -106,6 +106,7 @@ bthandler(PG_FUNCTION_ARGS)
amroutine->ambulkdelete = btbulkdelete;
amroutine->amvacuumcleanup = btvacuumcleanup;
amroutine->amcanreturn = btcanreturn;
+ amroutine->amskip = btskip;
amroutine->amcostestimate = btcostestimate;
amroutine->amoptions = btoptions;
amroutine->amproperty = btproperty;
@@ -454,6 +455,8 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
*/
so->currTuples = so->markTuples = NULL;
+ so->skipScanKey = NULL;
+
scan->xs_itupdesc = RelationGetDescr(rel);
scan->opaque = so;
@@ -521,6 +524,15 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
}
/*
+ * btskip() -- skip to the beginning of the next key prefix
+ */
+bool
+btskip(IndexScanDesc scan, ScanDirection direction, int prefix)
+{
+ return _bt_skip(scan, direction, prefix);
+}
+
+/*
* btendscan() -- close down a scan
*/
void
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index ee46023..9cd58ae 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1120,6 +1120,90 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
}
/*
+ * _bt_skip() -- Skip items that have the same prefix as the most recently
+ * fetched index tuple. The current position is set so that a subsequent call
+ * to _bt_next will fetch the first tuple that differs in the leading 'prefix'
+ * keys.
+ *
+ * TODO:TM The 'step back one tuple' behaviour is only necessary because I
+ * wanted the nodeIndexonlyscan code to be able to use its existing
+ * tuple-fetch-visibility-check loop without a special case for the first
+ * iteration. An alternative would be for skip to actually fetch the desired
+ * tuple immediately (ie without a subsequent call to _bt_next).
+ */
+bool
+_bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix)
+{
+ /*
+ * TODO:TM For now, we use _bt_search to search from the root; in theory
+ * we should be able to do a local traversal ie from the current page, but
+ * I don't know if it would actually be better in general.
+ */
+
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ BTStack stack;
+ Buffer buf;
+ OffsetNumber offnum;
+ BTScanPosItem *currItem;
+
+ if (!scan->xs_want_itup)
+ elog(ERROR, "_bt_skip cannot skip if not returning tuples");
+ if (!scan->xs_itup)
+ elog(ERROR, "_bt_skip cannot skip if a tuple has not been fetched yet");
+
+ if (BTScanPosIsValid(so->currPos))
+ {
+ ReleaseBuffer(so->currPos.buf);
+ so->currPos.buf = InvalidBuffer;
+ }
+
+ /*
+ * TODO:TM lazily call _bt_mkscankey the first time and then just update
+ * the values in so->skipScanKey each time after that, instead of the
+ * repeated free/realloc
+ */
+ if (so->skipScanKey != NULL)
+ _bt_freeskey(so->skipScanKey);
+ so->skipScanKey = _bt_mkscankey(scan->indexRelation, scan->xs_itup);
+
+ /* TODO:TM share some of the code below with _bt_search? */
+ stack =_bt_search(scan->indexRelation, prefix, so->skipScanKey,
+ ScanDirectionIsForward(dir), &buf, BT_READ,
+ scan->xs_snapshot);
+ _bt_freestack(stack);
+ so->currPos.buf = buf;
+ offnum = _bt_binsrch(scan->indexRelation, buf, prefix, so->skipScanKey,
+ ScanDirectionIsForward(dir));
+ PredicateLockPage(scan->indexRelation, BufferGetBlockNumber(buf),
+ scan->xs_snapshot);
+ if (ScanDirectionIsForward(dir))
+ {
+ so->currPos.moreLeft = false;
+ so->currPos.moreRight = true;
+ }
+ else
+ {
+ so->currPos.moreLeft = true;
+ so->currPos.moreRight = false;
+ }
+ if (ScanDirectionIsForward(dir))
+ offnum = OffsetNumberPrev(offnum);
+ if (!_bt_readpage(scan, dir, offnum))
+ {
+ if (!_bt_steppage(scan, dir))
+ {
+ return false;
+ }
+ }
+ LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+ currItem = &so->currPos.items[so->currPos.itemIndex];
+ scan->xs_ctup.t_self = currItem->heapTid;
+ if (scan->xs_want_itup)
+ scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+ return true;
+}
+
+/*
* _bt_readpage() -- Load data from current index page into so->currPos
*
* Caller must have pinned and read-locked so->currPos.buf; the buffer's state
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 5d18206..824c418 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -239,6 +239,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
info->amoptionalkey = amroutine->amoptionalkey;
info->amsearcharray = amroutine->amsearcharray;
info->amsearchnulls = amroutine->amsearchnulls;
+ info->amcanskip = (amroutine->amskip != NULL);
info->amhasgettuple = (amroutine->amgettuple != NULL);
info->amhasgetbitmap = (amroutine->amgetbitmap != NULL);
info->amcostestimate = amroutine->amcostestimate;
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index 1036cca..09e08b7 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -124,6 +124,10 @@ typedef void (*amrescan_function) (IndexScanDesc scan,
typedef bool (*amgettuple_function) (IndexScanDesc scan,
ScanDirection direction);
+/* skip past duplicates in a given prefix */
+typedef bool (*amskip_function) (IndexScanDesc scan,
+ ScanDirection dir, int prefix);
+
/* fetch all valid tuples */
typedef int64 (*amgetbitmap_function) (IndexScanDesc scan,
TIDBitmap *tbm);
@@ -196,6 +200,7 @@ typedef struct IndexAmRoutine
amendscan_function amendscan;
ammarkpos_function ammarkpos; /* can be NULL */
amrestrpos_function amrestrpos; /* can be NULL */
+ amskip_function amskip; /* can be NULL */
} IndexAmRoutine;
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 81907d5..09f9aba 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -157,6 +157,7 @@ extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info,
extern IndexBulkDeleteResult *index_vacuum_cleanup(IndexVacuumInfo *info,
IndexBulkDeleteResult *stats);
extern bool index_can_return(Relation indexRelation, int attno);
+extern bool index_skip(IndexScanDesc scan, ScanDirection direction, int prefix);
extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum,
uint16 procnum);
extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum,
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index c580f51..836f8de 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -636,6 +636,9 @@ typedef struct BTScanOpaqueData
/* keep these last in struct for efficiency */
BTScanPosData currPos; /* current position data */
BTScanPosData markPos; /* marked position, if any */
+
+ /* workspace for _bt_skip */
+ ScanKey skipScanKey; /* used to control skipping */
} BTScanOpaqueData;
typedef BTScanOpaqueData *BTScanOpaque;
@@ -721,6 +724,7 @@ extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
Page page, OffsetNumber offnum);
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
+extern bool _bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix);
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
Snapshot snapshot);
@@ -748,6 +752,7 @@ extern void _bt_end_vacuum_callback(int code, Datum arg);
extern Size BTreeShmemSize(void);
extern void BTreeShmemInit(void);
extern bytea *btoptions(Datum reloptions, bool validate);
+extern bool btskip(IndexScanDesc scan, ScanDirection dir, int prefix);
extern bool btproperty(Oid index_oid, int attno,
IndexAMProperty prop, const char *propname,
bool *res, bool *isnull);
diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h
index 3a1255a..0e1c8d0 100644
--- a/src/include/nodes/relation.h
+++ b/src/include/nodes/relation.h
@@ -622,6 +622,7 @@ typedef struct IndexOptInfo
bool amsearchnulls; /* can AM search for NULL/NOT NULL entries? */
bool amhasgettuple; /* does AM have amgettuple interface? */
bool amhasgetbitmap; /* does AM have amgetbitmap interface? */
+ bool amcanskip; /* can AM skip duplicate values? */
/* Rather than include amapi.h here, we declare amcostestimate like this */
void (*amcostestimate) (); /* AM's cost estimator */
} IndexOptInfo;
0002-distinct-skip-v3.patchapplication/octet-stream; name=0002-distinct-skip-v3.patchDownload
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 1247433..ad13fc9 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -1103,6 +1103,20 @@ ExplainNode(PlanState *planstate, List *ancestors,
{
IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan;
+ if (indexonlyscan->distinctPrefix > 0)
+ {
+ /*
+ * TODO:TM figure out how to show "for distinct (a, b)"
+ * instead of just the column count...
+ */
+ if (es->format == EXPLAIN_FORMAT_TEXT)
+ appendStringInfo(es->str, " for distinct values of leading %d column(s)",
+ indexonlyscan->distinctPrefix);
+ else
+ ExplainPropertyInteger("Distinct Prefix",
+ indexonlyscan->distinctPrefix,
+ es);
+ }
ExplainIndexScanDetails(indexonlyscan->indexid,
indexonlyscan->indexorderdir,
es);
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index 4f6f91c..fddd382 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -74,6 +74,20 @@ IndexOnlyNext(IndexOnlyScanState *node)
slot = node->ss.ss_ScanTupleSlot;
/*
+ * Check if we need to skip to the next key prefix, because we've been
+ * asked to implement DISTINCT.
+ */
+ if (node->ioss_NumDistinctKeys > 0 && node->ioss_FirstTupleEmitted)
+ {
+ if (!index_skip(scandesc, direction, node->ioss_NumDistinctKeys))
+ {
+ /* Reached end of index. */
+ /* TODO:TM is this appropriate cleanup? */
+ return ExecClearTuple(slot);
+ }
+ }
+
+ /*
* OK, now that we have what we need, fetch the next tuple.
*/
while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
@@ -189,6 +203,17 @@ IndexOnlyNext(IndexOnlyScanState *node)
PredicateLockPage(scandesc->heapRelation,
ItemPointerGetBlockNumber(tid),
estate->es_snapshot);
+ /*
+ * TODO:TM Do distinct scans break SSI? We don't visit every page
+ * anymore! But maybe that's OK, because we only "read" the fact that
+ * there is *at least one* row that has this value, so a conflicting
+ * write would be one that removes ALL rows having the same value, and
+ * therefore would touch this page that happens to hold the arbitrary
+ * row we looked at, in the case of a distinct skip scan. Does this
+ * make any sense?
+ */
+
+ node->ioss_FirstTupleEmitted = true;
return slot;
}
@@ -404,6 +429,8 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
indexstate->ss.ps.plan = (Plan *) node;
indexstate->ss.ps.state = estate;
indexstate->ioss_HeapFetches = 0;
+ indexstate->ioss_NumDistinctKeys = node->distinctPrefix;
+ indexstate->ioss_FirstTupleEmitted = false;
/*
* Miscellaneous initialization
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 71714bc..cc7ce64 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -457,6 +457,7 @@ _copyIndexOnlyScan(const IndexOnlyScan *from)
COPY_NODE_FIELD(indexorderby);
COPY_NODE_FIELD(indextlist);
COPY_SCALAR_FIELD(indexorderdir);
+ COPY_SCALAR_FIELD(distinctPrefix);
return newnode;
}
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 47158f6..614452d 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -170,7 +170,8 @@ static IndexOnlyScan *make_indexonlyscan(List *qptlist, List *qpqual,
Index scanrelid, Oid indexid,
List *indexqual, List *indexorderby,
List *indextlist,
- ScanDirection indexscandir);
+ ScanDirection indexscandir,
+ int skipprefix);
static BitmapIndexScan *make_bitmap_indexscan(Index scanrelid, Oid indexid,
List *indexqual,
List *indexqualorig);
@@ -2501,7 +2502,8 @@ create_indexscan_plan(PlannerInfo *root,
fixed_indexquals,
fixed_indexorderbys,
best_path->indexinfo->indextlist,
- best_path->indexscandir);
+ best_path->indexscandir,
+ best_path->indexskipprefix);
else
scan_plan = (Scan *) make_indexscan(tlist,
qpqual,
@@ -4721,7 +4723,8 @@ make_indexonlyscan(List *qptlist,
List *indexqual,
List *indexorderby,
List *indextlist,
- ScanDirection indexscandir)
+ ScanDirection indexscandir,
+ int skipprefix)
{
IndexOnlyScan *node = makeNode(IndexOnlyScan);
Plan *plan = &node->scan.plan;
@@ -4736,6 +4739,7 @@ make_indexonlyscan(List *qptlist,
node->indexorderby = indexorderby;
node->indextlist = indextlist;
node->indexorderdir = indexscandir;
+ node->distinctPrefix = skipprefix;
return node;
}
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index f657ffc..dfb222b 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -4157,6 +4157,16 @@ create_distinct_paths(PlannerInfo *root,
path,
list_length(root->distinct_pathkeys),
numDistinctRows));
+
+ /* Also consider a skip scan, if possible. */
+ if (IsA(path, IndexPath) &&
+ path->pathtype == T_IndexOnlyScan &&
+ ((IndexPath *) path)->indexinfo->amcanskip)
+ add_path(distinct_rel, (Path *)
+ create_skipscan_unique_path(root, distinct_rel,
+ path,
+ list_length(root->distinct_pathkeys),
+ numDistinctRows));
}
}
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index abb7507..db674c8 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -2470,6 +2470,49 @@ create_upper_unique_path(PlannerInfo *root,
}
/*
+ * create_skipscan_unique_path
+ * Creates a pathnode the same as an existing IndexPath except based on
+ * skipping duplicate values. This may or may not be cheaper than using
+ * create_upper_unique_path.
+ *
+ * The input path must be an IndexPath for an index that supports amskip.
+ */
+IndexPath *
+create_skipscan_unique_path(PlannerInfo *root,
+ RelOptInfo *rel,
+ Path *subpath,
+ int numCols,
+ double numGroups)
+{
+ IndexPath *pathnode = makeNode(IndexPath);
+
+ Assert(IsA(subpath, IndexPath));
+
+ /*
+ * TODO: copyObject doesn't work on paths. But we don't want to modify
+ * the source path. memcpy may not be very sane here; need to look into
+ * how exactly to clone an IndexPath without breaking any rules.
+ */
+ memcpy(pathnode, subpath, sizeof(IndexPath));
+
+ /* The size of the prefix we'll use for skipping. */
+ Assert(pathnode->indexinfo->amcanskip);
+ Assert(numCols > 0);
+ pathnode->indexskipprefix = numCols;
+
+ /*
+ * The cost to skip to each distinct value should be roughly the same as
+ * the cost of finding the first key times the number of distinct values
+ * we expect to find.
+ */
+ pathnode->path.startup_cost = subpath->startup_cost;
+ pathnode->path.total_cost = subpath->startup_cost * numGroups;
+ pathnode->path.rows = numGroups;
+
+ return pathnode;
+}
+
+/*
* create_agg_path
* Creates a pathnode that represents performing aggregation/grouping
*
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 4fa3661..bb07e9a 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1369,6 +1369,7 @@ typedef struct IndexScanState
* ScanDesc index scan descriptor
* VMBuffer buffer in use for visibility map testing, if any
* HeapFetches number of tuples we were forced to fetch from heap
+ * NumDistinctKeys number of keys for skip-based DISTINCT
* ----------------
*/
typedef struct IndexOnlyScanState
@@ -1387,6 +1388,8 @@ typedef struct IndexOnlyScanState
IndexScanDesc ioss_ScanDesc;
Buffer ioss_VMBuffer;
long ioss_HeapFetches;
+ int ioss_NumDistinctKeys;
+ bool ioss_FirstTupleEmitted;
} IndexOnlyScanState;
/* ----------------
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index e2fbc7d..31e465d 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -377,6 +377,7 @@ typedef struct IndexOnlyScan
List *indexorderby; /* list of index ORDER BY exprs */
List *indextlist; /* TargetEntry list describing index's cols */
ScanDirection indexorderdir; /* forward or backward or don't care */
+ int distinctPrefix; /* the size of the prefix for distinct scans */
} IndexOnlyScan;
/* ----------------
diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h
index 0e1c8d0..774914f 100644
--- a/src/include/nodes/relation.h
+++ b/src/include/nodes/relation.h
@@ -955,6 +955,9 @@ typedef struct Path
* we need not recompute them when considering using the same index in a
* bitmap index/heap scan (see BitmapHeapPath). The costs of the IndexPath
* itself represent the costs of an IndexScan or IndexOnlyScan plan type.
+ *
+ * 'indexskipprefix' represents the number of columns to consider for skip
+ * scans.
*----------
*/
typedef struct IndexPath
@@ -969,6 +972,7 @@ typedef struct IndexPath
ScanDirection indexscandir;
Cost indextotalcost;
Selectivity indexselectivity;
+ int indexskipprefix;
} IndexPath;
/*
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index 71d9154..42b1002 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -161,6 +161,11 @@ extern UpperUniquePath *create_upper_unique_path(PlannerInfo *root,
Path *subpath,
int numCols,
double numGroups);
+extern IndexPath *create_skipscan_unique_path(PlannerInfo *root,
+ RelOptInfo *rel,
+ Path *subpath,
+ int numCols,
+ double numGroups);
extern AggPath *create_agg_path(PlannerInfo *root,
RelOptInfo *rel,
Path *subpath,
On Wed, Oct 12, 2016 at 4:19 PM, Thomas Munro
<thomas.munro@enterprisedb.com> wrote:
Here it is, now split into two parts: one patch
to add an amskip operation, and another to consider using it to
implement DISTINCT when it already has an index only scan path on
an index that supports skipping.
Those patches add an amskip operation and then teach the planner and
executor how to do this, given a table foo (a, b) with an index on (a)
or (a, b):
SELECT DISTINCT a FROM foo
Index Only Scan for distinct values of (a) using foo_pkey on foo
In just the right circumstances that is vastly faster than scanning
every tuple and uniquifying. I was thinking that the next step in
this experiment might be to introduce a kind of plan that can handle
queries where the index's leading column is not in the WHERE clause,
building on the above plan, and behaving conceptually a little bit
like a Nested Loop, like so:
SELECT * FROM foo WHERE b = 42
Index Skip Scan
-> Index Only Scan for distinct values of (a) using foo_pkey on foo
-> Index Only Scan using foo_pkey on foo
Index Cond: ((a = outer.a) AND (b = 42))
I suppose you could instead have a single node that knows how to
perform the whole scan itself so that you don't finish up searching
for each distinct value in both the inner and outer subplan, but it
occurred to me that building the plan out of Lego bricks like this
might generalise better to more complicated queries. I suppose it
might also parallelise nicely with a Parallel Index Only Scan as the
outer plan.
Maybe something similar is possible for handling "GROUP BY a".
Worth pursuing? Does amskip suck? Does anyone have better ideas,
either for how to do the low level skip or the higher level Index Skip
Scan, or perhaps a completely different way of looking at this?
--
Thomas Munro
http://www.enterprisedb.com
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On 23 November 2016 at 21:19, Thomas Munro
<thomas.munro@enterprisedb.com> wrote:
Worth pursuing? Does amskip suck? Does anyone have better ideas,
either for how to do the low level skip or the higher level Index Skip
Scan, or perhaps a completely different way of looking at this?
I have no helpful suggestions with how to achieve it, but can I add a
voice of encouragement: there have been a good few occasions in the
past year (we moved from another db to PG) where the lack of skip
scans has bitten us; in that case it was using MIN() and GROUP BY,
where the grouping column was the first element in the compound index
and the aggregate column was the second: in the Other DB that sort of
query was extremely quick, not so much here.
I was also idly pondering (in one of those moments of conceited
self-delusion where I thought I might actually have enough spare time
to try to work on it myself) whether it would be possible to implement
that sort of skip with two indexes (so MIN(a) GROUP BY b with separate
indexes on (a) and (b) rather than a single index (a,b)); I never got
much further than idle musings though.
Geoff
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers
On Wed, Nov 23, 2016 at 4:19 PM, Thomas Munro
<thomas.munro@enterprisedb.com> wrote:
On Wed, Oct 12, 2016 at 4:19 PM, Thomas Munro
<thomas.munro@enterprisedb.com> wrote:Here it is, now split into two parts: one patch
to add an amskip operation, and another to consider using it to
implement DISTINCT when it already has an index only scan path on
an index that supports skipping.Those patches add an amskip operation and then teach the planner and
executor how to do this, given a table foo (a, b) with an index on (a)
or (a, b):SELECT DISTINCT a FROM foo
Index Only Scan for distinct values of (a) using foo_pkey on foo
Cool!
In just the right circumstances that is vastly faster than scanning
every tuple and uniquifying. I was thinking that the next step in
this experiment might be to introduce a kind of plan that can handle
queries where the index's leading column is not in the WHERE clause,
building on the above plan, and behaving conceptually a little bit
like a Nested Loop, like so:SELECT * FROM foo WHERE b = 42
Index Skip Scan
-> Index Only Scan for distinct values of (a) using foo_pkey on foo
-> Index Only Scan using foo_pkey on foo
Index Cond: ((a = outer.a) AND (b = 42))
Blech, that seems really ugly and probably inefficient.
I suppose you could instead have a single node that knows how to
perform the whole scan itself so that you don't finish up searching
for each distinct value in both the inner and outer subplan, but it
occurred to me that building the plan out of Lego bricks like this
might generalise better to more complicated queries. I suppose it
might also parallelise nicely with a Parallel Index Only Scan as the
outer plan.
I think putting all the smarts in a single node is likely to be
better, faster, and cleaner.
This patch has gotten favorable comments from several people, but
somebody really needs to REVIEW it.
Oh, well, since I'm here...
+ if (ScanDirectionIsForward(dir))
+ {
+ so->currPos.moreLeft = false;
+ so->currPos.moreRight = true;
+ }
+ else
+ {
+ so->currPos.moreLeft = true;
+ so->currPos.moreRight = false;
+ }
The lack of comments makes it hard for me to understand what the
motivation for this is, but I bet it's wrong. Suppose there's a
cursor involved here and the user tries to back up. Instead of having
a separate amskip operation, maybe there should be a flag attached to
a scan indicating whether it should return only distinct results.
Otherwise, you're allowing for the possibility that the same scan
might sometimes skip and other times not skip, but then it seems hard
for the scan to survive cursor operations. Anyway, why would that be
useful?
+ if (ScanDirectionIsForward(dir))
+ offnum = OffsetNumberPrev(offnum);
+ if (!_bt_readpage(scan, dir, offnum))
+ {
+ if (!_bt_steppage(scan, dir))
+ {
+ return false;
+ }
+ }
As the TODO:TM comment at the top of the function sort of implies
without directly coming out and saying so, this is ugly and suggests
that you've picked the wrong API. If the idea I proposed above isn't
appealing, I still think we need to get rid of this by some other
means.
+ /*
+ * TODO:TM For now, we use _bt_search to search from the root; in theory
+ * we should be able to do a local traversal ie from the current page, but
+ * I don't know if it would actually be better in general.
+ */
I think that this depends a lot on the number of duplicates. If we
end up merely fast-forwarding within the same page to find the next
unique value, re-traversing from the root sucks. However, if the next
distinct key *isn't* on the same page, then traversing from the root
is a good bet. A typical btree is only a few levels deep (like 3) so
once you don't find what you're looking for on the same page it's
probably fairly sound to re-traverse from the root. Of course you
lose at least a little bit in the case where you would have found the
next distinct key within a page or two but in other cases you win big.
I wonder if that would be a suitable heuristic: first check the
current page, if all keys there are equal then retraverse from the
root.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers