From 79532b0e1bcf5faa35c65743b6afc869de8efafc Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Thu, 19 Dec 2024 12:27:13 -0500
Subject: [PATCH v5 5/6] Teach generate_nonunion_paths to consider pre-sorted
 child paths.

Much of this patch is just re-ordering existing code so that we can
compute the desired pathkeys before we call build_setop_child_paths.

I don't think we need any new test cases: the existing cases that
change plan are enough to show that this works.

One test case in union.sql would switch from HashSetOp Except to
sorted SetOp Except if we left it alone, since the cheapest inputs
are already sorted.  But the point of that set of tests is to
exercise HashSetOp, so swing a bigger hammer.  (The same query is
re-tested with sorting further down, so this seems fine.)

Discussion: https://postgr.es/m/1850138.1731549611@sss.pgh.pa.us
---
 src/backend/optimizer/prep/prepunion.c | 144 ++++++++++++++++---------
 src/test/regress/expected/union.out    |  37 +++----
 src/test/regress/sql/union.sql         |   5 +
 3 files changed, 115 insertions(+), 71 deletions(-)

diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index e92a088334..b3a26738d0 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -1010,6 +1010,7 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root,
 	bool		lpath_trivial_tlist,
 				rpath_trivial_tlist,
 				result_trivial_tlist;
+	List	   *nonunion_pathkeys = NIL;
 	double		dLeftGroups,
 				dRightGroups,
 				dNumGroups,
@@ -1030,11 +1031,6 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root,
 								  refnames_tlist,
 								  &lpath_tlist,
 								  &lpath_trivial_tlist);
-	if (lrel->rtekind == RTE_SUBQUERY)
-		build_setop_child_paths(root, lrel, lpath_trivial_tlist, lpath_tlist,
-								NIL, &dLeftGroups);
-	else
-		dLeftGroups = lrel->rows;
 
 	rrel = recurse_set_operations(op->rarg, root,
 								  op,
@@ -1042,9 +1038,57 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root,
 								  refnames_tlist,
 								  &rpath_tlist,
 								  &rpath_trivial_tlist);
+
+	/*
+	 * Generate tlist for SetOp plan node.
+	 *
+	 * The tlist for a SetOp plan isn't important so far as the SetOp is
+	 * concerned, but we must make it look real anyway for the benefit of the
+	 * next plan level up.
+	 */
+	tlist = generate_setop_tlist(op->colTypes, op->colCollations,
+								 0, false, lpath_tlist, refnames_tlist,
+								 &result_trivial_tlist);
+
+	/* We should not have needed any type coercions in the tlist */
+	Assert(result_trivial_tlist);
+
+	*pTargetList = tlist;
+
+	/* Identify the grouping semantics */
+	groupList = generate_setop_grouplist(op, tlist);
+
+	/* Check whether the operators support sorting or hashing */
+	can_sort = grouping_is_sortable(groupList);
+	can_hash = grouping_is_hashable(groupList);
+	if (!can_sort && !can_hash)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+		/* translator: %s is INTERSECT or EXCEPT */
+				 errmsg("could not implement %s",
+						(op->op == SETOP_INTERSECT) ? "INTERSECT" : "EXCEPT"),
+				 errdetail("Some of the datatypes only support hashing, while others only support sorting.")));
+
+	if (can_sort)
+	{
+		/* Determine the pathkeys for sorting by the whole target list */
+		nonunion_pathkeys = make_pathkeys_for_sortclauses(root, groupList,
+														  tlist);
+
+		root->query_pathkeys = nonunion_pathkeys;
+	}
+
+	/*
+	 * Now that we've got all that info, we can build the child paths.
+	 */
+	if (lrel->rtekind == RTE_SUBQUERY)
+		build_setop_child_paths(root, lrel, lpath_trivial_tlist, lpath_tlist,
+								nonunion_pathkeys, &dLeftGroups);
+	else
+		dLeftGroups = lrel->rows;
 	if (rrel->rtekind == RTE_SUBQUERY)
 		build_setop_child_paths(root, rrel, rpath_trivial_tlist, rpath_tlist,
-								NIL, &dRightGroups);
+								nonunion_pathkeys, &dRightGroups);
 	else
 		dRightGroups = rrel->rows;
 
@@ -1080,30 +1124,11 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root,
 	lpath = lrel->cheapest_total_path;
 	rpath = rrel->cheapest_total_path;
 
-	/*
-	 * Generate tlist for SetOp plan node.
-	 *
-	 * The tlist for a SetOp plan isn't important so far as the SetOp is
-	 * concerned, but we must make it look real anyway for the benefit of the
-	 * next plan level up.
-	 */
-	tlist = generate_setop_tlist(op->colTypes, op->colCollations,
-								 0, false, lpath_tlist, refnames_tlist,
-								 &result_trivial_tlist);
-
-	/* We should not have needed any type coercions in the tlist */
-	Assert(result_trivial_tlist);
-
-	*pTargetList = tlist;
-
 	/* Build result relation. */
 	result_rel = fetch_upper_rel(root, UPPERREL_SETOP,
 								 bms_union(lrel->relids, rrel->relids));
 	result_rel->reltarget = create_pathtarget(root, tlist);
 
-	/* Identify the grouping semantics */
-	groupList = generate_setop_grouplist(op, tlist);
-
 	/*
 	 * Estimate number of distinct groups that we'll need hashtable entries
 	 * for; this is the size of the left-hand input for EXCEPT, or the smaller
@@ -1139,17 +1164,6 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root,
 			break;
 	}
 
-	/* Check whether the operators support sorting or hashing */
-	can_sort = grouping_is_sortable(groupList);
-	can_hash = grouping_is_hashable(groupList);
-	if (!can_sort && !can_hash)
-		ereport(ERROR,
-				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-		/* translator: %s is INTERSECT or EXCEPT */
-				 errmsg("could not implement %s",
-						(op->op == SETOP_INTERSECT) ? "INTERSECT" : "EXCEPT"),
-				 errdetail("Some of the datatypes only support hashing, while others only support sorting.")));
-
 	/*
 	 * If we can hash, that just requires a SetOp atop the cheapest inputs.
 	 */
@@ -1189,35 +1203,63 @@ generate_nonunion_paths(SetOperationStmt *op, PlannerInfo *root,
 	}
 
 	/*
-	 * If we can sort, sort the inputs if needed before adding SetOp.
+	 * If we can sort, generate the cheapest sorted input paths, and add a
+	 * SetOp atop those.
 	 */
 	if (can_sort)
 	{
 		List	   *pathkeys;
+		Path	   *slpath,
+				   *srpath;
 
+		/* First the left input ... */
 		pathkeys = make_pathkeys_for_sortclauses(root,
 												 groupList,
 												 lpath_tlist);
-		if (!pathkeys_contained_in(pathkeys, lpath->pathkeys))
-			lpath = (Path *) create_sort_path(root,
-											  lpath->parent,
-											  lpath,
-											  pathkeys,
-											  -1.0);
+		if (pathkeys_contained_in(pathkeys, lpath->pathkeys))
+			slpath = lpath;		/* cheapest path is already sorted */
+		else
+		{
+			slpath = get_cheapest_path_for_pathkeys(lrel->pathlist,
+													nonunion_pathkeys,
+													NULL,
+													TOTAL_COST,
+													false);
+			/* Subquery failed to produce any presorted paths? */
+			if (slpath == NULL)
+				slpath = (Path *) create_sort_path(root,
+												   lpath->parent,
+												   lpath,
+												   pathkeys,
+												   -1.0);
+		}
+
+		/* and now the same for the right. */
 		pathkeys = make_pathkeys_for_sortclauses(root,
 												 groupList,
 												 rpath_tlist);
-		if (!pathkeys_contained_in(pathkeys, rpath->pathkeys))
-			rpath = (Path *) create_sort_path(root,
-											  rpath->parent,
-											  rpath,
-											  pathkeys,
-											  -1.0);
+		if (pathkeys_contained_in(pathkeys, rpath->pathkeys))
+			srpath = rpath;		/* cheapest path is already sorted */
+		else
+		{
+			srpath = get_cheapest_path_for_pathkeys(rrel->pathlist,
+													nonunion_pathkeys,
+													NULL,
+													TOTAL_COST,
+													false);
+			/* Subquery failed to produce any presorted paths? */
+			if (srpath == NULL)
+				srpath = (Path *) create_sort_path(root,
+												   rpath->parent,
+												   rpath,
+												   pathkeys,
+												   -1.0);
+		}
 
 		path = (Path *) create_setop_path(root,
 										  result_rel,
-										  lpath,
-										  rpath,
+										  slpath,
+										  srpath,
 										  cmd,
 										  SETOP_SORTED,
 										  groupList,
diff --git a/src/test/regress/expected/union.out b/src/test/regress/expected/union.out
index 16d0807e7f..caa8fe70a0 100644
--- a/src/test/regress/expected/union.out
+++ b/src/test/regress/expected/union.out
@@ -385,13 +385,15 @@ select count(*) from
   5000
 (1 row)
 
+-- this query will prefer a sorted setop unless we force it.
+set enable_indexscan to off;
 explain (costs off)
 select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10;
-                         QUERY PLAN                         
-------------------------------------------------------------
+           QUERY PLAN            
+---------------------------------
  HashSetOp Except
-   ->  Index Only Scan using tenk1_unique1 on tenk1
-   ->  Index Only Scan using tenk1_unique2 on tenk1 tenk1_1
+   ->  Seq Scan on tenk1
+   ->  Seq Scan on tenk1 tenk1_1
          Filter: (unique2 <> 10)
 (4 rows)
 
@@ -401,6 +403,7 @@ select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10;
       10
 (1 row)
 
+reset enable_indexscan;
 -- the hashed implementation is sensitive to child plans' tuple slot types
 explain (costs off)
 select * from int8_tbl intersect select q2, q1 from int8_tbl order by 1, 2;
@@ -455,17 +458,15 @@ select count(*) from
 explain (costs off)
 select count(*) from
   ( select unique1 from tenk1 intersect select fivethous from tenk1 ) ss;
-                               QUERY PLAN                               
-------------------------------------------------------------------------
+                            QUERY PLAN                            
+------------------------------------------------------------------
  Aggregate
    ->  SetOp Intersect
          ->  Sort
                Sort Key: tenk1.fivethous
                ->  Seq Scan on tenk1
-         ->  Sort
-               Sort Key: tenk1_1.unique1
-               ->  Index Only Scan using tenk1_unique1 on tenk1 tenk1_1
-(8 rows)
+         ->  Index Only Scan using tenk1_unique1 on tenk1 tenk1_1
+(6 rows)
 
 select count(*) from
   ( select unique1 from tenk1 intersect select fivethous from tenk1 ) ss;
@@ -476,17 +477,13 @@ select count(*) from
 
 explain (costs off)
 select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10;
-                            QUERY PLAN                            
-------------------------------------------------------------------
+                         QUERY PLAN                         
+------------------------------------------------------------
  SetOp Except
-   ->  Sort
-         Sort Key: tenk1.unique1
-         ->  Index Only Scan using tenk1_unique1 on tenk1
-   ->  Sort
-         Sort Key: tenk1_1.unique2
-         ->  Index Only Scan using tenk1_unique2 on tenk1 tenk1_1
-               Filter: (unique2 <> 10)
-(8 rows)
+   ->  Index Only Scan using tenk1_unique1 on tenk1
+   ->  Index Only Scan using tenk1_unique2 on tenk1 tenk1_1
+         Filter: (unique2 <> 10)
+(4 rows)
 
 select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10;
  unique1 
diff --git a/src/test/regress/sql/union.sql b/src/test/regress/sql/union.sql
index db2aae3cc5..13700a6bfc 100644
--- a/src/test/regress/sql/union.sql
+++ b/src/test/regress/sql/union.sql
@@ -134,10 +134,15 @@ select count(*) from
 select count(*) from
   ( select unique1 from tenk1 intersect select fivethous from tenk1 ) ss;
 
+-- this query will prefer a sorted setop unless we force it.
+set enable_indexscan to off;
+
 explain (costs off)
 select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10;
 select unique1 from tenk1 except select unique2 from tenk1 where unique2 != 10;
 
+reset enable_indexscan;
+
 -- the hashed implementation is sensitive to child plans' tuple slot types
 explain (costs off)
 select * from int8_tbl intersect select q2, q1 from int8_tbl order by 1, 2;
-- 
2.43.5