From f66de0ee842a20ced46d2661b4bf80914e9e8847 Mon Sep 17 00:00:00 2001
From: Ashutosh Bapat <ashutosh.bapat@enterprisedb.com>
Date: Mon, 15 Apr 2024 12:03:01 +0530
Subject: [PATCH] Fix partitioned join case in apply_scanjoin_target_to_paths()

apply_scanjoin_target_to_paths() assumes that the cheapest paths for a
partitioned relation can be computed by Append'ing appropriate paths from the
partitions. This is not true for partitioned join relations which have two
types of paths: a. Append paths produced by partitionwise join, b. Join paths
produced by joining partitioned relations. Partitionwise join paths may not be
optimal always. But only those are recomputed by
apply_scanjoin_target_to_paths() after applying the new targetlist. Fix
apply_scanjoin_target_to_paths() not to wipe out all existing paths for a
partitioned join relation.

Author: Ashutosh Bapat
Review and initial investigation: Jacob Wartak
Discussion: https://www.postgresql.org/message-id/CAExHW5toze58+jL-454J3ty11sqJyU13Sz5rJPQZDmASwZgWiA@mail.gmail.com
---
 src/backend/optimizer/plan/planner.c         | 33 ++++---
 src/test/regress/expected/partition_join.out | 96 +++++++++++++-------
 src/test/regress/sql/partition_join.sql      | 15 ++-
 3 files changed, 93 insertions(+), 51 deletions(-)

diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 5320da51a0..4714fa41a0 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -7515,17 +7515,24 @@ apply_scanjoin_target_to_paths(PlannerInfo *root,
 	check_stack_depth();
 
 	/*
-	 * If the rel is partitioned, we want to drop its existing paths and
-	 * generate new ones.  This function would still be correct if we kept the
-	 * existing paths: we'd modify them to generate the correct target above
-	 * the partitioning Append, and then they'd compete on cost with paths
-	 * generating the target below the Append.  However, in our current cost
-	 * model the latter way is always the same or cheaper cost, so modifying
-	 * the existing paths would just be useless work.  Moreover, when the cost
-	 * is the same, varying roundoff errors might sometimes allow an existing
-	 * path to be picked, resulting in undesirable cross-platform plan
-	 * variations.  So we drop old paths and thereby force the work to be done
-	 * below the Append, except in the case of a non-parallel-safe target.
+	 * If the rel is partitioned simple rel, we want to drop its existing
+	 * paths and generate new ones.  This function would still be correct if
+	 * we kept the existing paths: we'd modify them to generate the correct
+	 * target above the partitioning Append, and then they'd compete on cost
+	 * with paths generating the target below the Append.  However, in our
+	 * current cost model the latter way is always the same or cheaper cost,
+	 * so modifying the existing paths would just be useless work.  Moreover,
+	 * when the cost is the same, varying roundoff errors might sometimes
+	 * allow an existing path to be picked, resulting in undesirable
+	 * cross-platform plan variations.  So we drop old paths and thereby force
+	 * the work to be done below the Append, except in the case of a
+	 * non-parallel-safe target.
+	 *
+	 * Partitioned join relations have two types of paths: 1. Append paths
+	 * created by partitionwise join and 2. Join paths joining the partitioned
+	 * relations. The paths in the second set are similar to the paths for a
+	 * non-partitioned relation and should not be wiped out since they may
+	 * contain an optimal path.
 	 *
 	 * Some care is needed, because we have to allow
 	 * generate_useful_gather_paths to see the old partial paths in the next
@@ -7533,7 +7540,7 @@ apply_scanjoin_target_to_paths(PlannerInfo *root,
 	 * generate_useful_gather_paths to add path(s) to the main list, and
 	 * finally zap the partial pathlist.
 	 */
-	if (rel_is_partitioned)
+	if (rel_is_partitioned && IS_SIMPLE_REL(rel))
 		rel->pathlist = NIL;
 
 	/*
@@ -7559,7 +7566,7 @@ apply_scanjoin_target_to_paths(PlannerInfo *root,
 	}
 
 	/* Finish dropping old paths for a partitioned rel, per comment above */
-	if (rel_is_partitioned)
+	if (rel_is_partitioned && IS_SIMPLE_REL(rel))
 		rel->partial_pathlist = NIL;
 
 	/* Extract SRF-free scan/join target. */
diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out
index 6d07f86b9b..513ffbdfac 100644
--- a/src/test/regress/expected/partition_join.out
+++ b/src/test/regress/expected/partition_join.out
@@ -4777,38 +4777,35 @@ CREATE TABLE plt3_adv_p1 PARTITION OF plt3_adv FOR VALUES IN ('0001');
 CREATE TABLE plt3_adv_p2 PARTITION OF plt3_adv FOR VALUES IN ('0003', '0004');
 INSERT INTO plt3_adv SELECT i, i, to_char(i % 5, 'FM0000') FROM generate_series(0, 24) i WHERE i % 5 IN (1, 3, 4);
 ANALYZE plt3_adv;
--- This tests that when merging partitions from plt1_adv and plt2_adv in
--- merge_list_bounds(), process_outer_partition() returns an already-assigned
--- merged partition when re-called with plt1_adv_p1 for the second list value
--- '0001' of that partition
+-- Both the queries test that when merging partitions from plt1_adv and
+-- plt2_adv in merge_list_bounds(), process_outer_partition() returns an
+-- already-assigned merged partition when re-called with plt1_adv_p1 for the
+-- second list value '0001' of that partition. But the first query doesn't end
+-- up using partitionwise join since it's more costly whereas the second one
+-- use partitionwise join.
 EXPLAIN (COSTS OFF)
 SELECT t1.a, t1.c, t2.a, t2.c, t3.a, t3.c FROM (plt1_adv t1 LEFT JOIN plt2_adv t2 ON (t1.c = t2.c)) FULL JOIN plt3_adv t3 ON (t1.c = t3.c) WHERE coalesce(t1.a, 0) % 5 != 3 AND coalesce(t1.a, 0) % 5 != 4 ORDER BY t1.c, t1.a, t2.a, t3.a;
-                                          QUERY PLAN                                           
------------------------------------------------------------------------------------------------
+                                     QUERY PLAN                                      
+-------------------------------------------------------------------------------------
  Sort
    Sort Key: t1.c, t1.a, t2.a, t3.a
-   ->  Append
-         ->  Hash Full Join
-               Hash Cond: (t1_1.c = t3_1.c)
-               Filter: (((COALESCE(t1_1.a, 0) % 5) <> 3) AND ((COALESCE(t1_1.a, 0) % 5) <> 4))
-               ->  Hash Left Join
-                     Hash Cond: (t1_1.c = t2_1.c)
+   ->  Hash Full Join
+         Hash Cond: (t1.c = t3.c)
+         Filter: (((COALESCE(t1.a, 0) % 5) <> 3) AND ((COALESCE(t1.a, 0) % 5) <> 4))
+         ->  Hash Left Join
+               Hash Cond: (t1.c = t2.c)
+               ->  Append
                      ->  Seq Scan on plt1_adv_p1 t1_1
-                     ->  Hash
-                           ->  Seq Scan on plt2_adv_p1 t2_1
-               ->  Hash
-                     ->  Seq Scan on plt3_adv_p1 t3_1
-         ->  Hash Full Join
-               Hash Cond: (t1_2.c = t3_2.c)
-               Filter: (((COALESCE(t1_2.a, 0) % 5) <> 3) AND ((COALESCE(t1_2.a, 0) % 5) <> 4))
-               ->  Hash Left Join
-                     Hash Cond: (t1_2.c = t2_2.c)
                      ->  Seq Scan on plt1_adv_p2 t1_2
-                     ->  Hash
-                           ->  Seq Scan on plt2_adv_p2 t2_2
                ->  Hash
+                     ->  Append
+                           ->  Seq Scan on plt2_adv_p1 t2_1
+                           ->  Seq Scan on plt2_adv_p2 t2_2
+         ->  Hash
+               ->  Append
+                     ->  Seq Scan on plt3_adv_p1 t3_1
                      ->  Seq Scan on plt3_adv_p2 t3_2
-(23 rows)
+(18 rows)
 
 SELECT t1.a, t1.c, t2.a, t2.c, t3.a, t3.c FROM (plt1_adv t1 LEFT JOIN plt2_adv t2 ON (t1.c = t2.c)) FULL JOIN plt3_adv t3 ON (t1.c = t3.c) WHERE coalesce(t1.a, 0) % 5 != 3 AND coalesce(t1.a, 0) % 5 != 4 ORDER BY t1.c, t1.a, t2.a, t3.a;
  a  |  c   | a  |  c   | a  |  c   
@@ -4870,6 +4867,42 @@ SELECT t1.a, t1.c, t2.a, t2.c, t3.a, t3.c FROM (plt1_adv t1 LEFT JOIN plt2_adv t
  22 | 0002 | 22 | 0002 |    | 
 (55 rows)
 
+EXPLAIN (COSTS OFF)
+SELECT t1.a, t1.c, t2.a, t2.c FROM (plt1_adv t1 LEFT JOIN plt2_adv t2 ON (t1.c = t2.c)) WHERE coalesce(t1.a, t2.a) IN (3, 4) ORDER BY t1.c, t1.a, t2.a;
+                                 QUERY PLAN                                  
+-----------------------------------------------------------------------------
+ Sort
+   Sort Key: t1.c, t1.a, t2.a
+   ->  Append
+         ->  Hash Left Join
+               Hash Cond: (t1_1.c = t2_1.c)
+               Filter: (COALESCE(t1_1.a, t2_1.a) = ANY ('{3,4}'::integer[]))
+               ->  Seq Scan on plt1_adv_p1 t1_1
+               ->  Hash
+                     ->  Seq Scan on plt2_adv_p1 t2_1
+         ->  Hash Left Join
+               Hash Cond: (t1_2.c = t2_2.c)
+               Filter: (COALESCE(t1_2.a, t2_2.a) = ANY ('{3,4}'::integer[]))
+               ->  Seq Scan on plt1_adv_p2 t1_2
+               ->  Hash
+                     ->  Seq Scan on plt2_adv_p2 t2_2
+(15 rows)
+
+SELECT t1.a, t1.c, t2.a, t2.c FROM (plt1_adv t1 LEFT JOIN plt2_adv t2 ON (t1.c = t2.c)) WHERE coalesce(t1.a, t2.a) IN (3, 4) ORDER BY t1.c, t1.a, t2.a;
+ a |  c   | a  |  c   
+---+------+----+------
+ 3 | 0003 |  3 | 0003
+ 3 | 0003 |  8 | 0003
+ 3 | 0003 | 13 | 0003
+ 3 | 0003 | 18 | 0003
+ 3 | 0003 | 23 | 0003
+ 4 | 0004 |  4 | 0004
+ 4 | 0004 |  9 | 0004
+ 4 | 0004 | 14 | 0004
+ 4 | 0004 | 19 | 0004
+ 4 | 0004 | 24 | 0004
+(10 rows)
+
 DROP TABLE plt1_adv;
 DROP TABLE plt2_adv;
 DROP TABLE plt3_adv;
@@ -5094,23 +5127,20 @@ INSERT INTO fract_t (id) (SELECT generate_series(0, 1999));
 ANALYZE fract_t;
 -- verify plan; nested index only scans
 SET max_parallel_workers_per_gather = 0;
-SET enable_partitionwise_join = on;
 EXPLAIN (COSTS OFF)
 SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10;
                               QUERY PLAN                               
 -----------------------------------------------------------------------
  Limit
-   ->  Merge Append
-         Sort Key: x.id
-         ->  Merge Left Join
-               Merge Cond: (x_1.id = y_1.id)
+   ->  Merge Left Join
+         Merge Cond: (x.id = y.id)
+         ->  Append
                ->  Index Only Scan using fract_t0_pkey on fract_t0 x_1
-               ->  Index Only Scan using fract_t0_pkey on fract_t0 y_1
-         ->  Merge Left Join
-               Merge Cond: (x_2.id = y_2.id)
                ->  Index Only Scan using fract_t1_pkey on fract_t1 x_2
+         ->  Append
+               ->  Index Only Scan using fract_t0_pkey on fract_t0 y_1
                ->  Index Only Scan using fract_t1_pkey on fract_t1 y_2
-(11 rows)
+(9 rows)
 
 EXPLAIN (COSTS OFF)
 SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id DESC LIMIT 10;
diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql
index 128ce8376e..3bde5b54c1 100644
--- a/src/test/regress/sql/partition_join.sql
+++ b/src/test/regress/sql/partition_join.sql
@@ -1136,14 +1136,20 @@ CREATE TABLE plt3_adv_p2 PARTITION OF plt3_adv FOR VALUES IN ('0003', '0004');
 INSERT INTO plt3_adv SELECT i, i, to_char(i % 5, 'FM0000') FROM generate_series(0, 24) i WHERE i % 5 IN (1, 3, 4);
 ANALYZE plt3_adv;
 
--- This tests that when merging partitions from plt1_adv and plt2_adv in
--- merge_list_bounds(), process_outer_partition() returns an already-assigned
--- merged partition when re-called with plt1_adv_p1 for the second list value
--- '0001' of that partition
+-- Both the queries test that when merging partitions from plt1_adv and
+-- plt2_adv in merge_list_bounds(), process_outer_partition() returns an
+-- already-assigned merged partition when re-called with plt1_adv_p1 for the
+-- second list value '0001' of that partition. But the first query doesn't end
+-- up using partitionwise join since it's more costly whereas the second one
+-- use partitionwise join.
 EXPLAIN (COSTS OFF)
 SELECT t1.a, t1.c, t2.a, t2.c, t3.a, t3.c FROM (plt1_adv t1 LEFT JOIN plt2_adv t2 ON (t1.c = t2.c)) FULL JOIN plt3_adv t3 ON (t1.c = t3.c) WHERE coalesce(t1.a, 0) % 5 != 3 AND coalesce(t1.a, 0) % 5 != 4 ORDER BY t1.c, t1.a, t2.a, t3.a;
 SELECT t1.a, t1.c, t2.a, t2.c, t3.a, t3.c FROM (plt1_adv t1 LEFT JOIN plt2_adv t2 ON (t1.c = t2.c)) FULL JOIN plt3_adv t3 ON (t1.c = t3.c) WHERE coalesce(t1.a, 0) % 5 != 3 AND coalesce(t1.a, 0) % 5 != 4 ORDER BY t1.c, t1.a, t2.a, t3.a;
 
+EXPLAIN (COSTS OFF)
+SELECT t1.a, t1.c, t2.a, t2.c FROM (plt1_adv t1 LEFT JOIN plt2_adv t2 ON (t1.c = t2.c)) WHERE coalesce(t1.a, t2.a) IN (3, 4) ORDER BY t1.c, t1.a, t2.a;
+SELECT t1.a, t1.c, t2.a, t2.c FROM (plt1_adv t1 LEFT JOIN plt2_adv t2 ON (t1.c = t2.c)) WHERE coalesce(t1.a, t2.a) IN (3, 4) ORDER BY t1.c, t1.a, t2.a;
+
 DROP TABLE plt1_adv;
 DROP TABLE plt2_adv;
 DROP TABLE plt3_adv;
@@ -1203,7 +1209,6 @@ ANALYZE fract_t;
 
 -- verify plan; nested index only scans
 SET max_parallel_workers_per_gather = 0;
-SET enable_partitionwise_join = on;
 
 EXPLAIN (COSTS OFF)
 SELECT x.id, y.id FROM fract_t x LEFT JOIN fract_t y USING (id) ORDER BY x.id ASC LIMIT 10;
-- 
2.34.1