From b911333078fad71d4509adab1b0473828409b000 Mon Sep 17 00:00:00 2001
From: Alena Rybakina <a.rybakina@postgrespro.ru>
Date: Tue, 12 Nov 2024 12:16:42 +0300
Subject: [PATCH] Teach the planner to convert EXISTS and NOT EXISTS subqueries
 into semi and anti joins, respectively, if subquery's join expressions are
 independent and vars have the level no more than the parent. In addtion the
 transformation will be alowed if the expressions are constant. To do this, we
 put all potential expressions from the qual list and join list into the
 common list and check each expression one by one to see if they are suitable
 for transformation. In particular, we need to increment the level of
 expresions's vars to the parent query level. We condider expressions only for
 INNER JOIN type of join in subquery, otherwice the transformation is not
 available.
 
 Authors: Alena Rybakina <lena.ribackina@yandex.ru>
 Reviewed-by: Ranier Vilela <ranier.vf@gmail.com>, Ilia Evdokimov <ilya.evdokimov@tantorlabs.com>

---
 src/backend/optimizer/plan/subselect.c  | 135 ++++++++++----
 src/test/regress/expected/subselect.out | 232 ++++++++++++++++++++++++
 src/test/regress/sql/subselect.sql      |  99 ++++++++++
 3 files changed, 433 insertions(+), 33 deletions(-)

diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c
index eaaf8c1b49a..957d6cd36be 100644
--- a/src/backend/optimizer/plan/subselect.c
+++ b/src/backend/optimizer/plan/subselect.c
@@ -1376,6 +1376,18 @@ convert_EXISTS_sublink_to_join(PlannerInfo *root, SubLink *sublink,
 	int			varno;
 	Relids		clause_varnos;
 	Relids		upper_varnos;
+	ListCell *lc;
+	List *clauses = NIL;
+	List *all_clauses = NIL;
+	int first_elem = true;
+	Const *const_var = makeConst(BOOLOID,
+									-1,
+									InvalidOid,
+									sizeof(bool),
+									(Datum) 1,
+									false,
+									true);
+
 
 	Assert(sublink->subLinkType == EXISTS_SUBLINK);
 
@@ -1403,40 +1415,15 @@ convert_EXISTS_sublink_to_join(PlannerInfo *root, SubLink *sublink,
 	 * with noplace to evaluate the targetlist.
 	 */
 	if (!simplify_EXISTS_query(root, subselect))
-		return NULL;
+			return NULL;
 
-	/*
-	 * Separate out the WHERE clause.  (We could theoretically also remove
-	 * top-level plain JOIN/ON clauses, but it's probably not worth the
-	 * trouble.)
-	 */
-	whereClause = subselect->jointree->quals;
-	subselect->jointree->quals = NULL;
+	if (subselect->jointree->quals)
+		all_clauses = lappend(all_clauses, subselect->jointree->quals);
 
-	/*
-	 * The rest of the sub-select must not refer to any Vars of the parent
-	 * query.  (Vars of higher levels should be okay, though.)
-	 */
-	if (contain_vars_of_level((Node *) subselect, 1))
-		return NULL;
-
-	/*
-	 * On the other hand, the WHERE clause must contain some Vars of the
-	 * parent query, else it's not gonna be a join.
-	 */
-	if (!contain_vars_of_level(whereClause, 1))
-		return NULL;
-
-	/*
-	 * We don't risk optimizing if the WHERE clause is volatile, either.
-	 */
-	if (contain_volatile_functions(whereClause))
-		return NULL;
+	subselect->jointree->quals = NULL;
 
-	/*
-	 * The subquery must have a nonempty jointree, but we can make it so.
-	 */
-	replace_empty_jointree(subselect);
+	/* Gather all clauses in main list for the further consideration */
+	all_clauses = list_concat(all_clauses, subselect->jointree->fromlist);
 
 	/*
 	 * Prepare to pull up the sub-select into top range table.
@@ -1455,7 +1442,90 @@ convert_EXISTS_sublink_to_join(PlannerInfo *root, SubLink *sublink,
 	 */
 	rtoffset = list_length(parse->rtable);
 	OffsetVarNodes((Node *) subselect, rtoffset, 0);
-	OffsetVarNodes(whereClause, rtoffset, 0);
+
+	/*
+	 * We will able to remove top-level plain JOIN/ON clauses if they are not outer join.
+	 */
+	foreach (lc, all_clauses)
+	{
+		Node *je = ((Node *) lfirst(lc));
+		bool const_type_expr = false;
+
+		whereClause = je;
+
+		if (IsA(je, RangeTblRef))
+			goto end;
+
+		if ((IsA(je, JoinExpr) && ((JoinExpr *)je)->jointype != JOIN_INNER))
+			goto end;
+
+		if (IsA(je, JoinExpr) && ((JoinExpr *)je)->quals != NULL)
+			whereClause = ((JoinExpr *)je)->quals;
+
+		if (IsA(whereClause, OpExpr) &&
+		   (IsA(get_rightop(whereClause), Const) || IsA(get_leftop(whereClause), Const)))
+			const_type_expr = true;
+
+		/*
+		* On the other hand, the WHERE clause must contain some Vars of the
+		* parent query, else it's not gonna be a join.
+		*/
+		if (!const_type_expr && !contain_vars_of_level(whereClause, 1))
+			goto end;
+
+		/*
+		* We don't risk optimizing if the WHERE clause is volatile, either.
+		*/
+		if (contain_volatile_functions(whereClause))
+			goto end;
+
+		/*
+		 * In case of a successful attempt, replaces it with the correct condition
+		 */
+		if (IsA(je, JoinExpr))
+			((JoinExpr *)je)->quals = (Node *) const_var;
+
+		if(!const_type_expr || (const_type_expr && contain_vars_of_level(whereClause, 1)))
+		{
+			OffsetVarNodes(whereClause, rtoffset, 0);
+			IncrementVarSublevelsUp(whereClause, -1, 1);
+		}
+
+		clauses = lappend(clauses, whereClause);
+
+		first_elem = false;
+		subselect->jointree->fromlist = list_delete_ptr(subselect->jointree->fromlist, lc);
+
+		end:
+			if (first_elem)
+				return NULL;
+	}
+
+	list_free(all_clauses);
+
+	/* We don't have any cluses for pull-up creation */
+	if (clauses == NIL)
+		return NULL;
+	else
+		/* We can easily combine clauses through AND operator because they are independent */
+		whereClause = list_length(clauses) > 1 ?
+							(Node *) makeBoolExpr(AND_EXPR, clauses, -1) :
+							(Node *) linitial(clauses);
+
+
+	subselect->jointree->quals = NULL;
+
+	/*
+	 * The rest of the sub-select must not refer to any Vars of the parent
+	 * query.  (Vars of higher levels should be okay, though.)
+	 */
+	if (contain_vars_of_level((Node *) subselect, 1))
+		return NULL;
+
+	/*
+	 * The subquery must have a nonempty jointree, but we can make it so.
+	 */
+	replace_empty_jointree(subselect);
 
 	/*
 	 * Upper-level vars in subquery will now be one level closer to their
@@ -1463,7 +1533,6 @@ convert_EXISTS_sublink_to_join(PlannerInfo *root, SubLink *sublink,
 	 * becomes level zero.
 	 */
 	IncrementVarSublevelsUp((Node *) subselect, -1, 1);
-	IncrementVarSublevelsUp(whereClause, -1, 1);
 
 	/*
 	 * Now that the WHERE clause is adjusted to match the parent query
diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out
index ebc545e2461..f6dbe99df90 100644
--- a/src/test/regress/expected/subselect.out
+++ b/src/test/regress/expected/subselect.out
@@ -812,6 +812,238 @@ where exists (
       from text_tbl ) ss
   where road.name = ss.f1 );
 rollback;
+-- Test case for exist sublink where we can consider some undependent expression
+-- with outer link
+--
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tb.id);
+                               QUERY PLAN                                
+-------------------------------------------------------------------------
+ Nested Loop Semi Join (actual rows=2 loops=1)
+   ->  Seq Scan on ta (actual rows=2 loops=1)
+   ->  Nested Loop (actual rows=1 loops=2)
+         ->  Index Only Scan using tc_pkey on tc (actual rows=1 loops=2)
+               Index Cond: (id = ta.id)
+               Heap Fetches: 2
+         ->  Seq Scan on tb (actual rows=1 loops=2)
+(7 rows)
+
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE NOT EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tb.id);
+                               QUERY PLAN                                
+-------------------------------------------------------------------------
+ Nested Loop Anti Join (actual rows=0 loops=1)
+   ->  Seq Scan on ta (actual rows=2 loops=1)
+   ->  Nested Loop (actual rows=1 loops=2)
+         ->  Index Only Scan using tc_pkey on tc (actual rows=1 loops=2)
+               Index Cond: (id = ta.id)
+               Heap Fetches: 2
+         ->  Seq Scan on tb (actual rows=1 loops=2)
+(7 rows)
+
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tc.id);
+                               QUERY PLAN                                
+-------------------------------------------------------------------------
+ Nested Loop Semi Join (actual rows=2 loops=1)
+   ->  Seq Scan on ta (actual rows=2 loops=1)
+   ->  Nested Loop (actual rows=1 loops=2)
+         ->  Index Only Scan using tb_pkey on tb (actual rows=1 loops=2)
+               Index Cond: (id = ta.id)
+               Heap Fetches: 2
+         ->  Seq Scan on tc (actual rows=1 loops=2)
+(7 rows)
+
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE NOT EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tb.id);
+                               QUERY PLAN                                
+-------------------------------------------------------------------------
+ Nested Loop Anti Join (actual rows=0 loops=1)
+   ->  Seq Scan on ta (actual rows=2 loops=1)
+   ->  Nested Loop (actual rows=1 loops=2)
+         ->  Index Only Scan using tc_pkey on tc (actual rows=1 loops=2)
+               Index Cond: (id = ta.id)
+               Heap Fetches: 2
+         ->  Seq Scan on tb (actual rows=1 loops=2)
+(7 rows)
+
+-- Join compound expression
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tc.id and
+                       ta.id = tb.id);
+                         QUERY PLAN                          
+-------------------------------------------------------------
+ Hash Right Semi Join (actual rows=2 loops=1)
+   Hash Cond: (tb.id = ta.id)
+   ->  Hash Join (actual rows=2 loops=1)
+         Hash Cond: (tb.id = tc.id)
+         ->  Seq Scan on tb (actual rows=4 loops=1)
+         ->  Hash (actual rows=2 loops=1)
+               Buckets: 4096  Batches: 1  Memory Usage: 33kB
+               ->  Seq Scan on tc (actual rows=2 loops=1)
+   ->  Hash (actual rows=2 loops=1)
+         Buckets: 4096  Batches: 1  Memory Usage: 33kB
+         ->  Seq Scan on ta (actual rows=2 loops=1)
+(11 rows)
+
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE NOT EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tc.id and
+                       ta.id = tb.id);
+                               QUERY PLAN                                
+-------------------------------------------------------------------------
+ Nested Loop Anti Join (actual rows=0 loops=1)
+   ->  Seq Scan on ta (actual rows=2 loops=1)
+   ->  Nested Loop (actual rows=1 loops=2)
+         ->  Index Only Scan using tb_pkey on tb (actual rows=1 loops=2)
+               Index Cond: (id = ta.id)
+               Heap Fetches: 2
+         ->  Index Only Scan using tc_pkey on tc (actual rows=1 loops=2)
+               Index Cond: (id = ta.id)
+               Heap Fetches: 2
+(9 rows)
+
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tc.id and
+                       ta.id = tb.id);
+                         QUERY PLAN                          
+-------------------------------------------------------------
+ Hash Right Semi Join (actual rows=2 loops=1)
+   Hash Cond: (tb.id = ta.id)
+   ->  Hash Join (actual rows=2 loops=1)
+         Hash Cond: (tb.id = tc.id)
+         ->  Seq Scan on tb (actual rows=4 loops=1)
+         ->  Hash (actual rows=2 loops=1)
+               Buckets: 4096  Batches: 1  Memory Usage: 33kB
+               ->  Seq Scan on tc (actual rows=2 loops=1)
+   ->  Hash (actual rows=2 loops=1)
+         Buckets: 4096  Batches: 1  Memory Usage: 33kB
+         ->  Seq Scan on ta (actual rows=2 loops=1)
+(11 rows)
+
+-- Compound expression with const type
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tc.id and
+                       ta.id = 1);
+                               QUERY PLAN                                
+-------------------------------------------------------------------------
+ Nested Loop Semi Join (actual rows=1 loops=1)
+   ->  Index Only Scan using ta_pkey on ta (actual rows=1 loops=1)
+         Index Cond: (id = 1)
+         Heap Fetches: 1
+   ->  Nested Loop (actual rows=1 loops=1)
+         ->  Index Only Scan using tb_pkey on tb (actual rows=1 loops=1)
+               Index Cond: (id = 1)
+               Heap Fetches: 1
+         ->  Seq Scan on tc (actual rows=1 loops=1)
+(9 rows)
+
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE NOT EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tc.id and
+                       ta.id = 1);
+                        QUERY PLAN                        
+----------------------------------------------------------
+ Hash Right Anti Join (actual rows=1 loops=1)
+   Hash Cond: (tb.id = ta.id)
+   Join Filter: (ta.id = 1)
+   Rows Removed by Join Filter: 2
+   ->  Nested Loop (actual rows=8 loops=1)
+         ->  Seq Scan on tb (actual rows=4 loops=1)
+         ->  Materialize (actual rows=2 loops=4)
+               Storage: Memory  Maximum Storage: 17kB
+               ->  Seq Scan on tc (actual rows=2 loops=1)
+   ->  Hash (actual rows=2 loops=1)
+         Buckets: 4096  Batches: 1  Memory Usage: 33kB
+         ->  Seq Scan on ta (actual rows=2 loops=1)
+(12 rows)
+
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON tb.id = 1 and
+                       ta.id = 1);
+                               QUERY PLAN                                
+-------------------------------------------------------------------------
+ Nested Loop Semi Join (actual rows=1 loops=1)
+   ->  Index Only Scan using ta_pkey on ta (actual rows=1 loops=1)
+         Index Cond: (id = 1)
+         Heap Fetches: 1
+   ->  Nested Loop (actual rows=1 loops=1)
+         ->  Index Only Scan using tc_pkey on tc (actual rows=1 loops=1)
+               Index Cond: (id = 1)
+               Heap Fetches: 1
+         ->  Seq Scan on tb (actual rows=1 loops=1)
+(9 rows)
+
+-- Disabled pull up because it is applcapable for INNER JOIN connection
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  RIGHT JOIN tc
+                    ON ta.id = tc.id);
+                         QUERY PLAN                         
+------------------------------------------------------------
+ Seq Scan on ta (actual rows=2 loops=1)
+   Filter: EXISTS(SubPlan 1)
+   SubPlan 1
+     ->  Nested Loop Left Join (actual rows=1 loops=2)
+           Join Filter: (ta.id = tc.id)
+           Rows Removed by Join Filter: 2
+           ->  Seq Scan on tc (actual rows=1 loops=2)
+           ->  Materialize (actual rows=2 loops=2)
+                 Storage: Memory  Maximum Storage: 17kB
+                 ->  Seq Scan on tb (actual rows=4 loops=1)
+(10 rows)
+
 --
 -- Test case for sublinks pushed down into subselects via join alias expansion
 --
diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql
index 6ed3636a9e4..13d7066a823 100644
--- a/src/test/regress/sql/subselect.sql
+++ b/src/test/regress/sql/subselect.sql
@@ -439,6 +439,105 @@ where exists (
 
 rollback;
 
+-- Test case for exist sublink where we can consider some undependent expression
+-- with outer link
+--
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tb.id);
+
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE NOT EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tb.id);
+
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tc.id);
+
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE NOT EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tb.id);
+
+-- Join compound expression
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tc.id and
+                       ta.id = tb.id);
+
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE NOT EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tc.id and
+                       ta.id = tb.id);
+
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tc.id and
+                       ta.id = tb.id);
+
+-- Compound expression with const type
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tc.id and
+                       ta.id = 1);
+
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE NOT EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON ta.id = tc.id and
+                       ta.id = 1);
+
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  JOIN tc
+                    ON tb.id = 1 and
+                       ta.id = 1);
+-- Disabled pull up because it is applcapable for INNER JOIN connection
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF, BUFFERS OFF)
+ SELECT 1
+   FROM ta
+  WHERE EXISTS (SELECT 1
+                  FROM tb
+                  RIGHT JOIN tc
+                    ON ta.id = tc.id);
+
 --
 -- Test case for sublinks pushed down into subselects via join alias expansion
 --
-- 
2.34.1

