From e3bc921ad94c82c677886c2baef9a4876f6faeea Mon Sep 17 00:00:00 2001
From: Floris van Nee <floris.vannee@gmail.com>
Date: Thu, 19 Mar 2020 10:27:47 +0100
Subject: [PATCH 5/5] Support skip scan for non-distinct scans

Adds planner support to choose a skip scan for regular
non-distinct queries like:
SELECT * FROM t1 WHERE b=1 (with index on (a,b))
---
 src/backend/optimizer/path/indxpath.c | 181 +++++++++++++++++++++++++-
 src/backend/optimizer/plan/planner.c  |   2 +-
 src/backend/optimizer/util/pathnode.c |   4 +-
 src/backend/utils/adt/selfuncs.c      | 153 ++++++++++++++++++++--
 src/include/optimizer/pathnode.h      |   3 +-
 5 files changed, 327 insertions(+), 16 deletions(-)

diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index ba2dd30a13..d4d3e1c7eb 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -192,6 +192,17 @@ static Expr *match_clause_to_ordering_op(IndexOptInfo *index,
 static bool ec_member_matches_indexcol(PlannerInfo *root, RelOptInfo *rel,
 									   EquivalenceClass *ec, EquivalenceMember *em,
 									   void *arg);
+static List* add_possible_index_skip_paths(List* result,
+										  PlannerInfo *root,
+										  IndexOptInfo *index,
+										  List *indexclauses,
+										  List *indexorderbys,
+										  List *indexorderbycols,
+										  List *pathkeys,
+										  ScanDirection indexscandir,
+										  bool indexonly,
+										  Relids required_outer,
+										  double loop_count);
 
 
 /*
@@ -820,6 +831,136 @@ get_index_paths(PlannerInfo *root, RelOptInfo *rel,
 	}
 }
 
+/*
+ * Find available index skip paths and add them to the path list
+ */
+static List* add_possible_index_skip_paths(List* result,
+										  PlannerInfo *root,
+										  IndexOptInfo *index,
+										  List *indexclauses,
+										  List *indexorderbys,
+										  List *indexorderbycols,
+										  List *pathkeys,
+										  ScanDirection indexscandir,
+										  bool indexonly,
+										  Relids required_outer,
+										  double loop_count)
+{
+	int			indexcol;
+	bool		eqQualHere;
+	bool		eqQualPrev;
+	bool		eqSoFar;
+	ListCell   *lc;
+
+	/*
+	 * We need to find possible prefixes to use for the skip scan
+	 * Any useful prefix is one just before an index clause, unless
+	 * all clauses so far have been equal.
+	 * For example, on an index (a,b,c), the qual b=1 would
+	 * mean that an interesting skip prefix could be 1.
+	 * For qual a=1 AND b=1, it is not interesting to skip with
+	 * prefix 1, because the value of a is fixed already.
+	 */
+	indexcol = 0;
+	eqQualHere = false;
+	eqQualPrev = false;
+	eqSoFar = true;
+	foreach(lc, indexclauses)
+	{
+		IndexClause *iclause = lfirst_node(IndexClause, lc);
+		ListCell   *lc2;
+
+		if (indexcol != iclause->indexcol)
+		{
+			if (!eqQualHere || indexcol != iclause->indexcol - 1)
+				eqSoFar = false;
+
+			/* Beginning of a new column's quals */
+			if (!eqQualPrev && !eqSoFar)
+			{
+				/* We have a qual on current column,
+				 * there is no equality qual on the previous column,
+				 * not all of the previous quals are equality so far
+				 * (last one is special case for the first column in the index).
+				 * Optimal conditions to try an index skip path.
+				 */
+				IndexPath *ipath = create_index_path(root, index,
+										  indexclauses,
+										  indexorderbys,
+										  indexorderbycols,
+										  pathkeys,
+										  indexscandir,
+										  indexonly,
+										  required_outer,
+										  loop_count,
+										  false,
+										  iclause->indexcol);
+				result = lappend(result, ipath);
+			}
+
+			eqQualPrev = eqQualHere;
+			eqQualHere = false;
+			indexcol++;
+			/* if the clause is not for this index col, increment until it is */
+			while (indexcol != iclause->indexcol)
+			{
+				eqQualPrev = false;
+				eqSoFar = false;
+				indexcol++;
+			}
+		}
+
+		/* Examine each indexqual associated with this index clause */
+		foreach(lc2, iclause->indexquals)
+		{
+			RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc2);
+			Expr	   *clause = rinfo->clause;
+			Oid			clause_op = InvalidOid;
+			int			op_strategy;
+
+			if (IsA(clause, OpExpr))
+			{
+				OpExpr	   *op = (OpExpr *) clause;
+				clause_op = op->opno;
+			}
+			else if (IsA(clause, RowCompareExpr))
+			{
+				RowCompareExpr *rc = (RowCompareExpr *) clause;
+				clause_op = linitial_oid(rc->opnos);
+			}
+			else if (IsA(clause, ScalarArrayOpExpr))
+			{
+				ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause;
+				clause_op = saop->opno;
+			}
+			else if (IsA(clause, NullTest))
+			{
+				NullTest   *nt = (NullTest *) clause;
+
+				if (nt->nulltesttype == IS_NULL)
+				{
+					/* IS NULL is like = for selectivity purposes */
+					eqQualHere = true;
+				}
+			}
+			else
+				elog(ERROR, "unsupported indexqual type: %d",
+					 (int) nodeTag(clause));
+
+			/* check for equality operator */
+			if (OidIsValid(clause_op))
+			{
+				op_strategy = get_op_opfamily_strategy(clause_op,
+													   index->opfamily[indexcol]);
+				Assert(op_strategy != 0);	/* not a member of opfamily?? */
+				if (op_strategy == BTEqualStrategyNumber)
+					eqQualHere = true;
+			}
+		}
+	}
+	return result;
+}
+
 /*
  * build_index_paths
  *	  Given an index and a set of index clauses for it, construct zero
@@ -1055,9 +1196,25 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 								  index_only_scan,
 								  outer_relids,
 								  loop_count,
-								  false);
+								  false,
+								  0);
 		result = lappend(result, ipath);
 
+		if (can_skip)
+		{
+			result = add_possible_index_skip_paths(result, root, index,
+												   index_clauses,
+												   orderbyclauses,
+												   orderbyclausecols,
+												   useful_pathkeys,
+												   index_is_ordered ?
+												   ForwardScanDirection :
+												   NoMovementScanDirection,
+												   index_only_scan,
+												   outer_relids,
+												   loop_count);
+		}
+
 		/* Consider index skip scan as well */
 		if (root->query_uniquekeys != NULL && can_skip)
 		{
@@ -1104,7 +1261,8 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 									  index_only_scan,
 									  outer_relids,
 									  loop_count,
-									  true);
+									  true,
+									  0);
 
 			/*
 			 * if, after costing the path, we find that it's not worth using
@@ -1137,9 +1295,23 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 									  index_only_scan,
 									  outer_relids,
 									  loop_count,
-									  false);
+									  false,
+									  0);
 			result = lappend(result, ipath);
 
+			if (can_skip)
+			{
+				result = add_possible_index_skip_paths(result, root, index,
+													   index_clauses,
+													   NIL,
+													   NIL,
+													   useful_pathkeys,
+													   BackwardScanDirection,
+													   index_only_scan,
+													   outer_relids,
+													   loop_count);
+			}
+
 			/* Consider index skip scan as well */
 			if (root->query_uniquekeys != NULL && can_skip)
 			{
@@ -1181,7 +1353,8 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 										  index_only_scan,
 										  outer_relids,
 										  loop_count,
-										  true);
+										  true,
+										  0);
 
 				/*
 				 * if, after costing the path, we find that it's not worth
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 91b1ca2634..affeee390c 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -6013,7 +6013,7 @@ plan_cluster_use_sort(Oid tableOid, Oid indexOid)
 	indexScanPath = create_index_path(root, indexInfo,
 									  NIL, NIL, NIL, NIL,
 									  ForwardScanDirection, false,
-									  NULL, 1.0, false);
+									  NULL, 1.0, false, 0);
 
 	return (seqScanAndSortPath.total_cost < indexScanPath->path.total_cost);
 }
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 16633fd672..2b1c64fb51 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -1051,7 +1051,8 @@ create_index_path(PlannerInfo *root,
 				  bool indexonly,
 				  Relids required_outer,
 				  double loop_count,
-				  bool partial_path)
+				  bool partial_path,
+				  int skip_prefix)
 {
 	IndexPath  *pathnode = makeNode(IndexPath);
 	RelOptInfo *rel = index->rel;
@@ -1071,6 +1072,7 @@ create_index_path(PlannerInfo *root,
 	pathnode->indexorderbys = indexorderbys;
 	pathnode->indexorderbycols = indexorderbycols;
 	pathnode->indexscandir = indexscandir;
+	pathnode->indexskipprefix = skip_prefix;
 
 	cost_index(pathnode, root, loop_count, partial_path);
 
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 10895fb287..45f4fc814b 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -210,7 +210,9 @@ static bool get_actual_variable_endpoint(Relation heapRel,
 										 MemoryContext outercontext,
 										 Datum *endpointDatum);
 static RelOptInfo *find_join_input_rel(PlannerInfo *root, Relids relids);
-
+static double estimate_num_groups_internal(PlannerInfo *root, List *groupExprs,
+									double input_rows, double rel_input_rows,
+									List **pgset, EstimationInfo *estinfo);
 
 /*
  *		eqsel			- Selectivity of "=" for any data types.
@@ -3367,6 +3369,19 @@ add_unique_group_var(PlannerInfo *root, List *varinfos,
 double
 estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 					List **pgset, EstimationInfo *estinfo)
+{
+	return estimate_num_groups_internal(root, groupExprs, input_rows, -1, pgset, estinfo);
+}
+
+/*
+ * Same as estimate_num_groups, but with an extra argument to control
+ * the estimation used for the input rows of the relation. If
+ * rel_input_rows < 0, it uses the the original planner estimation for the
+ * individual rels, else if uses the estimation as provided to the function.
+ */
+static double
+estimate_num_groups_internal(PlannerInfo *root, List *groupExprs, double input_rows, double rel_input_rows,
+					List **pgset, EstimationInfo *estinfo)
 {
 	List	   *varinfos = NIL;
 	double		srf_multiplier = 1.0;
@@ -3533,6 +3548,12 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 		int			relvarcount = 0;
 		List	   *newvarinfos = NIL;
 		List	   *relvarinfos = NIL;
+		double this_rel_input_rows;
+
+		if (rel_input_rows < 0.0)
+			this_rel_input_rows = rel->rows;
+		else
+			this_rel_input_rows = rel_input_rows;
 
 		/*
 		 * Split the list of varinfos in two - one for the current rel, one
@@ -3638,7 +3659,7 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 			 * guarding against division by zero when reldistinct is zero.
 			 * Also skip this if we know that we are returning all rows.
 			 */
-			if (reldistinct > 0 && rel->rows < rel->tuples)
+			if (reldistinct > 0 && this_rel_input_rows < rel->tuples)
 			{
 				/*
 				 * Given a table containing N rows with n distinct values in a
@@ -3675,7 +3696,7 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 				 * works well even when n is small.
 				 */
 				reldistinct *=
-					(1 - pow((rel->tuples - rel->rows) / rel->tuples,
+					(1 - pow((rel->tuples - this_rel_input_rows) / rel->tuples,
 							 rel->tuples / reldistinct));
 			}
 			reldistinct = clamp_row_est(reldistinct);
@@ -6621,8 +6642,10 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 	double		numIndexTuples;
 	Cost		descentCost;
 	List	   *indexBoundQuals;
+	List	   *prefixBoundQuals;
 	int			indexcol;
 	bool		eqQualHere;
+	bool		stillEq;
 	bool		found_saop;
 	bool		found_is_null_op;
 	double		num_sa_scans;
@@ -6646,9 +6669,11 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 	 * considered to act the same as it normally does.
 	 */
 	indexBoundQuals = NIL;
+	prefixBoundQuals = NIL;
 	indexcol = 0;
 	eqQualHere = false;
 	found_saop = false;
+	stillEq = true;
 	found_is_null_op = false;
 	num_sa_scans = 1;
 	foreach(lc, path->indexclauses)
@@ -6660,11 +6685,18 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 		{
 			/* Beginning of a new column's quals */
 			if (!eqQualHere)
-				break;			/* done if no '=' qual for indexcol */
+			{
+				stillEq = false;
+				/* done if no '=' qual for indexcol and we're past the skip prefix */
+				if (path->indexskipprefix <= indexcol)
+					break;
+			}
 			eqQualHere = false;
 			indexcol++;
+			while (indexcol != iclause->indexcol && path->indexskipprefix > indexcol)
+				indexcol++;
 			if (indexcol != iclause->indexcol)
-				break;			/* no quals at all for indexcol */
+				break; /* no quals at all for indexcol */
 		}
 
 		/* Examine each indexqual associated with this index clause */
@@ -6696,7 +6728,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 				clause_op = saop->opno;
 				found_saop = true;
 				/* count number of SA scans induced by indexBoundQuals only */
-				if (alength > 1)
+				if (alength > 1 && stillEq)
 					num_sa_scans *= alength;
 			}
 			else if (IsA(clause, NullTest))
@@ -6724,7 +6756,14 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 					eqQualHere = true;
 			}
 
-			indexBoundQuals = lappend(indexBoundQuals, rinfo);
+			/* we keep two lists here, one with all quals up until the prefix
+			 * and one with only the quals until the first inequality.
+			 * we need the list with prefixes later
+			 */
+			if (stillEq)
+				indexBoundQuals = lappend(indexBoundQuals, rinfo);
+			if (path->indexskipprefix > 0)
+				prefixBoundQuals = lappend(prefixBoundQuals, rinfo);
 		}
 	}
 
@@ -6750,7 +6789,10 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 		 * index-bound quals to produce a more accurate idea of the number of
 		 * rows covered by the bound conditions.
 		 */
-		selectivityQuals = add_predicate_to_index_quals(index, indexBoundQuals);
+		if (path->indexskipprefix > 0)
+			selectivityQuals = add_predicate_to_index_quals(index, prefixBoundQuals);
+		else
+			selectivityQuals = add_predicate_to_index_quals(index, indexBoundQuals);
 
 		btreeSelectivity = clauselist_selectivity(root, selectivityQuals,
 												  index->rel->relid,
@@ -6760,7 +6802,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 
 		/*
 		 * As in genericcostestimate(), we have to adjust for any
-		 * ScalarArrayOpExpr quals included in indexBoundQuals, and then round
+		 * ScalarArrayOpExpr quals included in prefixBoundQuals, and then round
 		 * to integer.
 		 */
 		numIndexTuples = rint(numIndexTuples / num_sa_scans);
@@ -6806,6 +6848,99 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 	costs.indexStartupCost += descentCost;
 	costs.indexTotalCost += costs.num_sa_scans * descentCost;
 
+	/*
+	 * Add extra costs for using an index skip scan.
+	 * The index skip scan could have significantly lower cost until now,
+	 * due to the different row estimation used (all the quals up to prefix,
+	 * rather than all the quals up to the first non-equality operator).
+	 * However, there are extra costs incurred for
+	 * a) setting up the scan
+	 * b) doing additional scans from root
+	 * c) small extra cost per tuple comparison
+	 * We add those here
+	 */
+	if (path->indexskipprefix > 0)
+	{
+		List *exprlist = NULL;
+		double numgroups_estimate;
+		int i = 0;
+		ListCell *indexpr_item = list_head(path->indexinfo->indexprs);
+		List	   *selectivityQuals;
+		Selectivity btreeSelectivity;
+		double estimatedIndexTuplesNoPrefix;
+
+		/* some rather arbitrary extra cost for preprocessing structures needed for skip scan */
+		costs.indexStartupCost += 200.0 * cpu_operator_cost;
+		costs.indexTotalCost += 200.0 * cpu_operator_cost;
+
+		/*
+		 * In order to reliably get a cost estimation for the number of scans we have to do from root,
+		 * we need some estimation on the number of distinct prefixes that exist. Therefore, we need
+		 * a different selectivity approximation (this time we do need to use the clauses until the first
+		 * non-equality operator). Using that, we can estimate the number of groups
+		 */
+		for (i = 0; i < path->indexinfo->nkeycolumns && i < path->indexskipprefix; i++)
+		{
+			Expr *expr = NULL;
+			int attr = path->indexinfo->indexkeys[i];
+			if(attr > 0)
+			{
+				TargetEntry *tentry = get_tle_by_resno(path->indexinfo->indextlist, i + 1);
+				Assert(tentry != NULL);
+				expr = tentry->expr;
+			}
+			else if (attr == 0)
+			{
+				/* Expression index */
+				expr = lfirst(indexpr_item);
+				indexpr_item = lnext(path->indexinfo->indexprs, indexpr_item);
+			}
+			else /* attr < 0 */
+			{
+				/* Index on system column is not supported */
+				Assert(false);
+			}
+
+			exprlist = lappend(exprlist, expr);
+		}
+
+		selectivityQuals = add_predicate_to_index_quals(index, indexBoundQuals);
+
+		btreeSelectivity = clauselist_selectivity(root, selectivityQuals,
+												  index->rel->relid,
+												  JOIN_INNER,
+												  NULL);
+		estimatedIndexTuplesNoPrefix = btreeSelectivity * index->rel->tuples;
+
+		/*
+		 * As in genericcostestimate(), we have to adjust for any
+		 * ScalarArrayOpExpr quals included in prefixBoundQuals, and then round
+		 * to integer.
+		 */
+		estimatedIndexTuplesNoPrefix = rint(estimatedIndexTuplesNoPrefix / num_sa_scans);
+
+		numgroups_estimate = estimate_num_groups_internal(
+					root, exprlist, estimatedIndexTuplesNoPrefix,
+					estimatedIndexTuplesNoPrefix, NULL, NULL);
+
+		/*
+		 * For each distinct prefix value we add descending cost as.
+		 * This is similar to the startup cost calculation for regular scans.
+		 * We can do at most 2 scans from root per distinct prefix, so multiply by 2.
+		 * Also add some CPU processing cost per page that we need to process, plus
+		 * some additional one-time cost for scanning the leaf page. This is a more
+		 * expensive estimation than the per-page cpu cost for the regular index scan.
+		 * This is intentional, because the index skip scan does more processing on
+		 * the leaf page.
+		 */
+		if (index->tuples > 0)
+			descentCost = ceil(log(index->tuples) / log(2.0)) * cpu_operator_cost * 2;
+		else
+			descentCost = 0;
+		descentCost += (index->tree_height + 1) * 50.0 * cpu_operator_cost * 2 + 200 * cpu_operator_cost;
+		costs.indexTotalCost += costs.num_sa_scans * descentCost * numgroups_estimate;
+	}
+
 	/*
 	 * If we can get an estimate of the first column's ordering correlation C
 	 * from pg_statistic, estimate the index correlation as C for a
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index 0343b2e1f6..da4c933166 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -49,7 +49,8 @@ extern IndexPath *create_index_path(PlannerInfo *root,
 									bool indexonly,
 									Relids required_outer,
 									double loop_count,
-									bool partial_path);
+									bool partial_path,
+									int skip_prefix);
 extern BitmapHeapPath *create_bitmap_heap_path(PlannerInfo *root,
 											   RelOptInfo *rel,
 											   Path *bitmapqual,
-- 
2.29.2