From 074fc8a41b43dd8b10ab29eb880c9d161a1638d5 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Mon, 17 May 2021 11:47:07 +0200
Subject: [PATCH v39 1/5] Unique expressions

Extend PlannerInfo and Path structures with the list of relevant unique
expressions. It specifies which keys must be unique on the query
level, and allows to leverage this into on the path level. At the moment
only distinctClause makes use of such mechanism, which enables potential
use of index skip scan.

Originally proposed by David Rowley, based on the UniqueKey patch
implementation from Andy Fan, contains few bits out of previous version
from Jesper Pedersen, Floris Van Nee.
---
 src/backend/nodes/list.c                | 31 +++++++++
 src/backend/optimizer/path/Makefile     |  3 +-
 src/backend/optimizer/path/pathkeys.c   | 62 +++++++++++++++++
 src/backend/optimizer/path/uniquekeys.c | 92 +++++++++++++++++++++++++
 src/backend/optimizer/plan/planner.c    | 36 +++++++++-
 src/backend/optimizer/util/pathnode.c   | 32 ++++++---
 src/include/nodes/pathnodes.h           |  5 ++
 src/include/nodes/pg_list.h             |  1 +
 src/include/optimizer/pathnode.h        |  1 +
 src/include/optimizer/paths.h           |  9 +++
 10 files changed, 261 insertions(+), 11 deletions(-)
 create mode 100644 src/backend/optimizer/path/uniquekeys.c

diff --git a/src/backend/nodes/list.c b/src/backend/nodes/list.c
index 94fb236daf..9f2c408a4e 100644
--- a/src/backend/nodes/list.c
+++ b/src/backend/nodes/list.c
@@ -1537,3 +1537,34 @@ list_oid_cmp(const ListCell *p1, const ListCell *p2)
 		return 1;
 	return 0;
 }
+
+/*
+ * Return true iff every entry in "members" list is also present
+ * in the "target" list.
+ */
+bool
+list_is_subset(const List *members, const List *target)
+{
+	const ListCell	*lc1, *lc2;
+
+	Assert(IsPointerList(members));
+	Assert(IsPointerList(target));
+	check_list_invariants(members);
+	check_list_invariants(target);
+
+	foreach(lc1, members)
+	{
+		bool found = false;
+		foreach(lc2, target)
+		{
+			if (equal(lfirst(lc1), lfirst(lc2)))
+			{
+				found = true;
+				break;
+			}
+		}
+		if (!found)
+			return false;
+	}
+	return true;
+}
diff --git a/src/backend/optimizer/path/Makefile b/src/backend/optimizer/path/Makefile
index 1e199ff66f..7b9820c25f 100644
--- a/src/backend/optimizer/path/Makefile
+++ b/src/backend/optimizer/path/Makefile
@@ -21,6 +21,7 @@ OBJS = \
 	joinpath.o \
 	joinrels.o \
 	pathkeys.o \
-	tidpath.o
+	tidpath.o \
+	uniquekeys.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/optimizer/path/pathkeys.c b/src/backend/optimizer/path/pathkeys.c
index bd9a176d7d..f28547148d 100644
--- a/src/backend/optimizer/path/pathkeys.c
+++ b/src/backend/optimizer/path/pathkeys.c
@@ -29,6 +29,7 @@
 #include "utils/lsyscache.h"
 
 
+static bool pathkey_is_unique(PathKey *new_pathkey, List *pathkeys);
 static bool pathkey_is_redundant(PathKey *new_pathkey, List *pathkeys);
 static bool matches_boolean_partition_clause(RestrictInfo *rinfo,
 											 RelOptInfo *partrel,
@@ -96,6 +97,29 @@ make_canonical_pathkey(PlannerInfo *root,
 	return pk;
 }
 
+/*
+ * pathkey_is_unique
+ *		Checks if the new pathkey's equivalence class is the same as that of
+ *		any existing member of the pathkey list.
+ */
+static bool
+pathkey_is_unique(PathKey *new_pathkey, List *pathkeys)
+{
+	EquivalenceClass *new_ec = new_pathkey->pk_eclass;
+	ListCell   *lc;
+
+	/* If same EC already is already in the list, then not unique */
+	foreach(lc, pathkeys)
+	{
+		PathKey    *old_pathkey = (PathKey *) lfirst(lc);
+
+		if (new_ec == old_pathkey->pk_eclass)
+			return false;
+	}
+
+	return true;
+}
+
 /*
  * pathkey_is_redundant
  *	   Is a pathkey redundant with one already in the given list?
@@ -1152,6 +1176,44 @@ make_pathkeys_for_sortclauses(PlannerInfo *root,
 	return pathkeys;
 }
 
+/*
+ * make_pathkeys_for_uniquekeyclauses
+ *		Generate a pathkeys list to be used for uniquekey clauses
+ */
+List *
+make_pathkeys_for_uniquekeys(PlannerInfo *root,
+							 List *sortclauses,
+							 List *tlist)
+{
+	List	   *pathkeys = NIL;
+	ListCell   *l;
+
+	foreach(l, sortclauses)
+	{
+		SortGroupClause *sortcl = (SortGroupClause *) lfirst(l);
+		Expr	   *sortkey;
+		PathKey    *pathkey;
+
+		sortkey = (Expr *) get_sortgroupclause_expr(sortcl, tlist);
+		Assert(OidIsValid(sortcl->sortop));
+		pathkey = make_pathkey_from_sortop(root,
+										   sortkey,
+										   root->nullable_baserels,
+										   sortcl->sortop,
+										   sortcl->nulls_first,
+										   sortcl->tleSortGroupRef,
+										   true);
+
+		if (EC_MUST_BE_REDUNDANT(pathkey->pk_eclass))
+			continue;
+
+		if (pathkey_is_unique(pathkey, pathkeys))
+			pathkeys = lappend(pathkeys, pathkey);
+	}
+
+	return pathkeys;
+}
+
 /****************************************************************************
  *		PATHKEYS AND MERGECLAUSES
  ****************************************************************************/
diff --git a/src/backend/optimizer/path/uniquekeys.c b/src/backend/optimizer/path/uniquekeys.c
new file mode 100644
index 0000000000..d2525771e3
--- /dev/null
+++ b/src/backend/optimizer/path/uniquekeys.c
@@ -0,0 +1,92 @@
+/*-------------------------------------------------------------------------
+ *
+ * uniquekeys.c
+ *	  Utilities for matching and building unique keys
+ *
+ * Portions Copyright (c) 2020, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/optimizer/path/uniquekeys.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "optimizer/paths.h"
+
+/*
+ * build_uniquekeys
+ * 		Preparing list of pathkeys keys which are considered to be unique for
+ * 		this query.
+ *
+ * For now used only for distinct clauses, where redundant keys	need to be
+ * preserved e.g. for skip scan. Justification for this function existence is
+ * future plans to make it produce actual UniqueKey list. 
+ */
+List*
+build_uniquekeys(PlannerInfo *root, List *sortclauses)
+{
+	List *result = NIL;
+	List *sortkeys;
+	ListCell *l;
+	List *exprs = NIL;
+
+	sortkeys = make_pathkeys_for_uniquekeys(root,
+											sortclauses,
+											root->processed_tlist);
+
+	/* Create a uniquekey and add it to the list */
+	foreach(l, sortkeys)
+	{
+		PathKey    *pathkey = (PathKey *) lfirst(l);
+		EquivalenceClass *ec = pathkey->pk_eclass;
+		EquivalenceMember *mem = (EquivalenceMember*) lfirst(list_head(ec->ec_members));
+		exprs = lappend(exprs, mem->em_expr);
+	}
+
+	result = lappend(result, exprs);
+
+	return result;
+}
+
+/*
+ * query_has_uniquekeys_for
+ * 		Check if the specified unique keys matching all query level unique
+ * 		keys.
+ *
+ * The main use is to verify that unique keys for some path are covering all
+ * requested query unique keys. Based on this information a path could be
+ * rejected if it satisfy uniqueness only partially.
+ */
+bool
+query_has_uniquekeys_for(PlannerInfo *root, List *path_uniquekeys,
+						 bool allow_multinulls)
+{
+	ListCell *lc;
+	ListCell *lc2;
+
+	/* root->query_uniquekeys are the requested DISTINCT clauses on query level
+	 * path_uniquekeys are the unique keys on current path. All requested
+	 * query_uniquekeys must be satisfied by the path_uniquekeys.
+	 */
+	foreach(lc, root->query_uniquekeys)
+	{
+		List *query_ukey = lfirst_node(List, lc);
+		bool satisfied = false;
+		foreach(lc2, path_uniquekeys)
+		{
+			List *ukey = lfirst_node(List, lc2);
+			if (list_length(ukey) == 0 &&
+				list_length(query_ukey) != 0)
+				continue;
+			if (list_is_subset(ukey, query_ukey))
+			{
+				satisfied = true;
+				break;
+			}
+		}
+		if (!satisfied)
+			return false;
+	}
+	return true;
+}
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 1868c4eff4..4b2bed7e68 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -3094,12 +3094,18 @@ standard_qp_callback(PlannerInfo *root, void *extra)
 
 	if (parse->distinctClause &&
 		grouping_is_sortable(parse->distinctClause))
+	{
 		root->distinct_pathkeys =
 			make_pathkeys_for_sortclauses(root,
 										  parse->distinctClause,
 										  tlist);
+		root->query_uniquekeys = build_uniquekeys(root, parse->distinctClause);
+	}
 	else
+	{
 		root->distinct_pathkeys = NIL;
+		root->query_uniquekeys = NIL;
+	}
 
 	root->sort_pathkeys =
 		make_pathkeys_for_sortclauses(root,
@@ -4314,13 +4320,19 @@ create_distinct_paths(PlannerInfo *root,
 			Path	   *path = (Path *) lfirst(lc);
 
 			if (pathkeys_contained_in(needed_pathkeys, path->pathkeys))
-			{
 				add_path(distinct_rel, (Path *)
 						 create_upper_unique_path(root, distinct_rel,
 												  path,
 												  list_length(root->distinct_pathkeys),
 												  numDistinctRows));
-			}
+		}
+
+		foreach(lc, input_rel->unique_pathlist)
+		{
+			Path	   *path = (Path *) lfirst(lc);
+
+			if (query_has_uniquekeys_for(root, path->uniquekeys, false))
+				add_path(distinct_rel, path);
 		}
 
 		/* For explicit-sort case, always use the more rigorous clause */
@@ -6955,6 +6967,26 @@ apply_scanjoin_target_to_paths(PlannerInfo *root,
 		}
 	}
 
+	foreach(lc, rel->unique_pathlist)
+	{
+		Path	   *subpath = (Path *) lfirst(lc);
+
+		/* Shouldn't have any parameterized paths anymore */
+		Assert(subpath->param_info == NULL);
+
+		if (tlist_same_exprs)
+			subpath->pathtarget->sortgrouprefs =
+				scanjoin_target->sortgrouprefs;
+		else
+		{
+			Path	   *newpath;
+
+			newpath = (Path *) create_projection_path(root, rel, subpath,
+													  scanjoin_target);
+			lfirst(lc) = newpath;
+		}
+	}
+
 	/*
 	 * Now, if final scan/join target contains SRFs, insert ProjectSetPath(s)
 	 * atop each existing path.  (Note that this function doesn't look at the
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index b248b038e0..e190b20675 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -416,10 +416,10 @@ set_cheapest(RelOptInfo *parent_rel)
  * 'parent_rel' is the relation entry to which the path corresponds.
  * 'new_path' is a potential path for parent_rel.
  *
- * Returns nothing, but modifies parent_rel->pathlist.
+ * Returns modified pathlist.
  */
-void
-add_path(RelOptInfo *parent_rel, Path *new_path)
+static List *
+add_path_to(RelOptInfo *parent_rel, List *pathlist, Path *new_path)
 {
 	bool		accept_new = true;	/* unless we find a superior old path */
 	int			insert_at = 0;	/* where to insert new item */
@@ -440,7 +440,7 @@ add_path(RelOptInfo *parent_rel, Path *new_path)
 	 * for more than one old path to be tossed out because new_path dominates
 	 * it.
 	 */
-	foreach(p1, parent_rel->pathlist)
+	foreach(p1, pathlist)
 	{
 		Path	   *old_path = (Path *) lfirst(p1);
 		bool		remove_old = false; /* unless new proves superior */
@@ -584,8 +584,7 @@ add_path(RelOptInfo *parent_rel, Path *new_path)
 		 */
 		if (remove_old)
 		{
-			parent_rel->pathlist = foreach_delete_current(parent_rel->pathlist,
-														  p1);
+			pathlist = foreach_delete_current(pathlist, p1);
 
 			/*
 			 * Delete the data pointed-to by the deleted cell, if possible
@@ -612,8 +611,7 @@ add_path(RelOptInfo *parent_rel, Path *new_path)
 	if (accept_new)
 	{
 		/* Accept the new path: insert it at proper place in pathlist */
-		parent_rel->pathlist =
-			list_insert_nth(parent_rel->pathlist, insert_at, new_path);
+		pathlist = list_insert_nth(pathlist, insert_at, new_path);
 	}
 	else
 	{
@@ -621,6 +619,23 @@ add_path(RelOptInfo *parent_rel, Path *new_path)
 		if (!IsA(new_path, IndexPath))
 			pfree(new_path);
 	}
+
+	return pathlist;
+}
+
+void
+add_path(RelOptInfo *parent_rel, Path *new_path)
+{
+	parent_rel->pathlist = add_path_to(parent_rel,
+									   parent_rel->pathlist, new_path);
+}
+
+void
+add_unique_path(RelOptInfo *parent_rel, Path *new_path)
+{
+	parent_rel->unique_pathlist = add_path_to(parent_rel,
+											  parent_rel->unique_pathlist,
+											  new_path);
 }
 
 /*
@@ -2648,6 +2663,7 @@ create_projection_path(PlannerInfo *root,
 	pathnode->path.pathkeys = subpath->pathkeys;
 
 	pathnode->subpath = subpath;
+	pathnode->path.uniquekeys = subpath->uniquekeys;
 
 	/*
 	 * We might not need a separate Result node.  If the input plan node type
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index a65bda7e3c..6c04ba2bb2 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -292,6 +292,7 @@ struct PlannerInfo
 
 	List	   *query_pathkeys; /* desired pathkeys for query_planner() */
 
+	List	   *query_uniquekeys; /* unique keys required for the query */
 	List	   *group_pathkeys; /* groupClause pathkeys, if any */
 	List	   *window_pathkeys;	/* pathkeys of bottom window, if any */
 	List	   *distinct_pathkeys;	/* distinctClause pathkeys, if any */
@@ -690,6 +691,7 @@ typedef struct RelOptInfo
 	List	   *pathlist;		/* Path structures */
 	List	   *ppilist;		/* ParamPathInfos used in pathlist */
 	List	   *partial_pathlist;	/* partial Paths */
+	List       *unique_pathlist;    /* unique Paths */
 	struct Path *cheapest_startup_path;
 	struct Path *cheapest_total_path;
 	struct Path *cheapest_unique_path;
@@ -875,6 +877,7 @@ struct IndexOptInfo
 	bool		amsearchnulls;	/* can AM search for NULL/NOT NULL entries? */
 	bool		amhasgettuple;	/* does AM have amgettuple interface? */
 	bool		amhasgetbitmap; /* does AM have amgetbitmap interface? */
+	bool		amcanskip;		/* can AM skip duplicate values? */
 	bool		amcanparallel;	/* does AM support parallel scan? */
 	bool		amcanmarkpos;	/* does AM support mark/restore? */
 	/* Rather than include amapi.h here, we declare amcostestimate like this */
@@ -1187,6 +1190,8 @@ typedef struct Path
 
 	List	   *pathkeys;		/* sort ordering of path's output */
 	/* pathkeys is a List of PathKey nodes; see above */
+
+	List	   *uniquekeys;	/* the unique keys, or NIL if none */
 } Path;
 
 /* Macro for extracting a path's parameterization relids; beware double eval */
diff --git a/src/include/nodes/pg_list.h b/src/include/nodes/pg_list.h
index 30f98c4595..cdee029389 100644
--- a/src/include/nodes/pg_list.h
+++ b/src/include/nodes/pg_list.h
@@ -566,6 +566,7 @@ extern pg_nodiscard List *list_delete_first(List *list);
 extern pg_nodiscard List *list_delete_last(List *list);
 extern pg_nodiscard List *list_delete_nth_cell(List *list, int n);
 extern pg_nodiscard List *list_delete_cell(List *list, ListCell *cell);
+extern bool list_is_subset(const List *members, const List *target);
 
 extern List *list_union(const List *list1, const List *list2);
 extern List *list_union_ptr(const List *list1, const List *list2);
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index 53261ee91f..b976f2963e 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -27,6 +27,7 @@ extern int	compare_fractional_path_costs(Path *path1, Path *path2,
 										  double fraction);
 extern void set_cheapest(RelOptInfo *parent_rel);
 extern void add_path(RelOptInfo *parent_rel, Path *new_path);
+extern void add_unique_path(RelOptInfo *parent_rel, Path *new_path);
 extern bool add_path_precheck(RelOptInfo *parent_rel,
 							  Cost startup_cost, Cost total_cost,
 							  List *pathkeys, Relids required_outer);
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index f1d111063c..b613392e2f 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -229,6 +229,9 @@ extern List *build_join_pathkeys(PlannerInfo *root,
 extern List *make_pathkeys_for_sortclauses(PlannerInfo *root,
 										   List *sortclauses,
 										   List *tlist);
+extern List *make_pathkeys_for_uniquekeys(PlannerInfo *root,
+										  List *sortclauses,
+										  List *tlist);
 extern void initialize_mergeclause_eclasses(PlannerInfo *root,
 											RestrictInfo *restrictinfo);
 extern void update_mergeclause_eclasses(PlannerInfo *root,
@@ -255,4 +258,10 @@ extern PathKey *make_canonical_pathkey(PlannerInfo *root,
 extern void add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel,
 									List *live_childrels);
 
+extern bool query_has_uniquekeys_for(PlannerInfo *root,
+									 List *exprs,
+									 bool allow_multinulls);
+
+extern List *build_uniquekeys(PlannerInfo *root, List *sortclauses);
+
 #endif							/* PATHS_H */
-- 
2.26.3

