Useless LEFT JOIN breaks MIN/MAX optimization

Started by Alena Rybakina11 months ago5 messages

a.rybakina@postgrespro.ru

11 months ago

1 attachment(s)

Hi hackers!

My colleague gave me an interesting case related to min max
optimization. Adding a useless left join to the select min from t query
breaks the min/max read optimization from the index.
What is meant is shown in the example below:

drop table if exists t1;
drop table if exists t2;

create table t1 (id int not null, mod text);
insert into t1 select id, (id % 10)::text from generate_series(1,100000) id;
create unique index on t1(id);
create index on t1(mod);

This is the best plan for this query, since we only need one minimum
value for this index. And it works perfectly:
explain select min(mod) from t1;
explain select min(mod) from t1;
QUERY PLAN
------------------------------------------------------------------------------------------------
Result (cost=0.33..0.34 rows=1 width=32)
InitPlan 1 (returns $0)
-> Limit (cost=0.29..0.33 rows=1 width=32)
-> Index Only Scan using t1_mod_idx on t1 (cost=0.29..3861.54
rows=99500 width=32)
Index Cond: (mod IS NOT NULL)
(5 rows)

create table t2 (id int not null);
insert into t2 select id from generate_series(1,100000) id;
create unique index on t2(id);

But if we add a join, we fall into a sec scan without options:
explain select min(t1.mod) from t1 left join t2 on t1.id = t2.id;
postgres=# explain select min(t1.mod) from t1 left join t2 on t1.id = t2.id;
QUERY PLAN
-----------------------------------------------------------------
Aggregate (cost=1693.00..1693.01 rows=1 width=32)
-> Seq Scan on t1 (cost=0.00..1443.00 rows=100000 width=32)

I have implemented a patch that solves this problem - allowing to
consider and join expressions for trial optimization. I am glad for
feedback and review!

--
Regards,
Alena Rybakina
Postgres Professional

Attachments:

0001-Apply-min_max_transformation-for-eliminated-join-rel.patchtext/x-patch; charset=UTF-8; name=0001-Apply-min_max_transformation-for-eliminated-join-rel.patchDownload

From eb8fe49f68e198217a8f3e92ee424ff897f9e21e Mon Sep 17 00:00:00 2001
From: Alena Rybakina <a.rybakina@postgrespro.ru>
Date: Thu, 27 Feb 2025 11:26:44 +0300
Subject: [PATCH] Apply min_max_transformation for eliminated join relations
 after successfuly applying join removal optimization.

---
 src/backend/optimizer/plan/planagg.c     | 14 +++++++--
 src/test/regress/expected/aggregates.out | 40 ++++++++++++++++++++++++
 src/test/regress/sql/aggregates.sql      | 17 ++++++++++
 3 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/src/backend/optimizer/plan/planagg.c b/src/backend/optimizer/plan/planagg.c
index 64605be3178..b1d44987fb1 100644
--- a/src/backend/optimizer/plan/planagg.c
+++ b/src/backend/optimizer/plan/planagg.c
@@ -125,9 +125,19 @@ preprocess_minmax_aggregates(PlannerInfo *root)
 			return;
 		jtnode = linitial(jtnode->fromlist);
 	}
-	if (!IsA(jtnode, RangeTblRef))
+
+	/*
+	 * Let's consider applying optimization to the remaining
+	 * single relations after join removal optimization.
+	 * To do this, we need to make sure that fromlist is empty and
+	 * quals contains only one RTE relation.aggs_list.
+	 */
+	if (IsA(jtnode, JoinExpr) && jtnode->fromlist == NIL && IsA(jtnode->quals, RangeTblRef))
+		rtr = (RangeTblRef *) (jtnode->quals);
+	else if (IsA(jtnode, RangeTblRef))
+		rtr = (RangeTblRef *) jtnode;
+	else
 		return;
-	rtr = (RangeTblRef *) jtnode;
 	rte = planner_rt_fetch(rtr->rtindex, root);
 	if (rte->rtekind == RTE_RELATION)
 		 /* ordinary relation, ok */ ;
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 8c4f8ce27ed..79fe2fcce4d 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -1329,6 +1329,46 @@ from int4_tbl t0;
 (5 rows)
 
 rollback;
+--Check min/max optimization for join expressions
+create unique index tenk1_unique1_idx on tenk1(unique1);
+create index tenk1_unique2_idx on tenk1(unique2);
+create unique index INT4_TBL_f1_idx on INT4_TBL(f1);
+explain (COSTS OFF)
+select min(t1.unique2) from tenk1 t1 left join INT4_TBL t2 on t1.unique1 = t2.f1;
+                          QUERY PLAN                          
+--------------------------------------------------------------
+ Result
+   InitPlan 1
+     ->  Limit
+           ->  Index Scan using tenk1_unique2_idx on tenk1 t1
+                 Index Cond: (unique2 IS NOT NULL)
+(5 rows)
+
+explain (COSTS OFF)
+select min(t1.unique2) from tenk1 t1 inner join INT4_TBL t2 on t1.unique1 = t2.f1;
+                         QUERY PLAN                         
+------------------------------------------------------------
+ Aggregate
+   ->  Nested Loop
+         ->  Seq Scan on int4_tbl t2
+         ->  Index Scan using tenk1_unique1_idx on tenk1 t1
+               Index Cond: (unique1 = t2.f1)
+(5 rows)
+
+explain (COSTS OFF)
+select min(t1.unique2) from tenk1 t1 left outer join INT4_TBL t2 on t1.unique1 = t2.f1;
+                          QUERY PLAN                          
+--------------------------------------------------------------
+ Result
+   InitPlan 1
+     ->  Limit
+           ->  Index Scan using tenk1_unique2_idx on tenk1 t1
+                 Index Cond: (unique2 IS NOT NULL)
+(5 rows)
+
+DROP INDEX tenk1_unique1_idx;
+DROP INDEX tenk1_unique2_idx;
+DROP INDEX INT4_TBL_f1_idx;
 -- check for correct detection of nested-aggregate errors
 select max(min(unique1)) from tenk1;
 ERROR:  aggregate function calls cannot be nested
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index a1dc94bff57..c489f71bfc3 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -452,6 +452,23 @@ select f1, (select distinct min(t1.f1) from int4_tbl t1 where t1.f1 = t0.f1)
 from int4_tbl t0;
 rollback;
 
+--Check min/max optimization for join expressions
+create unique index tenk1_unique1_idx on tenk1(unique1);
+create index tenk1_unique2_idx on tenk1(unique2);
+create unique index INT4_TBL_f1_idx on INT4_TBL(f1);
+explain (COSTS OFF)
+select min(t1.unique2) from tenk1 t1 left join INT4_TBL t2 on t1.unique1 = t2.f1;
+
+explain (COSTS OFF)
+select min(t1.unique2) from tenk1 t1 inner join INT4_TBL t2 on t1.unique1 = t2.f1;
+
+explain (COSTS OFF)
+select min(t1.unique2) from tenk1 t1 left outer join INT4_TBL t2 on t1.unique1 = t2.f1;
+
+DROP INDEX tenk1_unique1_idx;
+DROP INDEX tenk1_unique2_idx;
+DROP INDEX INT4_TBL_f1_idx;
+
 -- check for correct detection of nested-aggregate errors
 select max(min(unique1)) from tenk1;
 select (select max(min(unique1)) from int8_tbl) from tenk1;
-- 
2.34.1

Robert Haas

robertmhaas@gmail.com

8 months ago

In reply to: Alena Rybakina (#1)

Re: Useless LEFT JOIN breaks MIN/MAX optimization

Hi Alena,

If I understand correctly, the problem here is that join removal and
minmax aggregates don't work well together: after join removal runs,
we end up with a state that doesn't permit the minmax-aggregate code
to work.

I agree that would be good to fix but the patch doesn't seem right to me.

I'm looking at this line from the patch, in particular:

+ if (IsA(jtnode, JoinExpr) && jtnode->fromlist == NIL &&
IsA(jtnode->quals, RangeTblRef))

jtnode is declared to be of type FromExpr. But here you check that it
is a JoinExpr, and if it is, then you dereference it as if it were a
FromExpr. So, if I'm understanding correctly, the reference to
jtnode->fromlist is actually looking at some completely unrelated
field that is part of a JoinExpr, like maybe larg, and jtnode->quals
is looking at some other field, maybe jtnode->rarg. That's pretty
crazy coding -- the right thing to do would be to cast the jtnode to
the proper type and then access it using the proper field names.

Backing up a step, it seems to me that it might be a good idea to
rewrite the preceding loop, which can descend through any number of
levels of FromExpr expressions, to be able to descend through either
FromExpr or JoinExpr expressions, stopping when it can't descend
further using either method. The way you've coded it supposes that it
only ever needs to descend through one JoinExpr expression and that
the JoinExpr will always be beneath all the FromExprs, which might not
be a correct assumption. It would probably be a good idea to write
some more complex test cases where multiple joins get removed at
various levels, and where there are also subquery levels that produce
FromExprs, in various configurations, and test that the patch works in
all of those cases.

But that's also assuming that you're correct here about how to descend
through a JoinExpr, which I'm not quite sure whether is true. It's
also assuming that we should solve the problem here rather than in
some other part of the code e.g. the join removal code, and I'm not
sure about that either.

I'm just studying this for the first time so apologies if this review
is not quite up to standard.

Thanks,

--
Robert Haas
EDB: http://www.enterprisedb.com

Alena Rybakina

a.rybakina@postgrespro.ru

8 months ago

In reply to: Robert Haas (#2)

Re: Useless LEFT JOIN breaks MIN/MAX optimization

Hi, Robert!

On 09.05.2025 20:12, Robert Haas wrote:

If I understand correctly, the problem here is that join removal and
minmax aggregates don't work well together: after join removal runs,
we end up with a state that doesn't permit the minmax-aggregate code
to work.

Yes, it is correct.

I agree that would be good to fix but the patch doesn't seem right to me.

I'm looking at this line from the patch, in particular:

+ if (IsA(jtnode, JoinExpr) && jtnode->fromlist == NIL &&
IsA(jtnode->quals, RangeTblRef))

jtnode is declared to be of type FromExpr. But here you check that it
is a JoinExpr, and if it is, then you dereference it as if it were a
FromExpr. So, if I'm understanding correctly, the reference to
jtnode->fromlist is actually looking at some completely unrelated
field that is part of a JoinExpr, like maybe larg, and jtnode->quals
is looking at some other field, maybe jtnode->rarg. That's pretty
crazy coding -- the right thing to do would be to cast the jtnode to
the proper type and then access it using the proper field names.

Yes, you are right, my mistake. I'll correct it. Thank you)

Backing up a step, it seems to me that it might be a good idea to
rewrite the preceding loop, which can descend through any number of
levels of FromExpr expressions, to be able to descend through either
FromExpr or JoinExpr expressions, stopping when it can't descend
further using either method. The way you've coded it supposes that it
only ever needs to descend through one JoinExpr expression and that
the JoinExpr will always be beneath all the FromExprs, which might not
be a correct assumption. It would probably be a good idea to write
some more complex test cases where multiple joins get removed at
various levels, and where there are also subquery levels that produce
FromExprs, in various configurations, and test that the patch works in
all of those cases.

You are right, there are not enough tests here and we need to add
queries with more complex semantics and you are right about the approach.

I'll implement it. Thanks for pointing this out, I missed it.

But that's also assuming that you're correct here about how to descend
through a JoinExpr, which I'm not quite sure whether is true. It's
also assuming that we should solve the problem here rather than in
some other part of the code e.g. the join removal code, and I'm not
sure about that either.

I don’t think it’s necessary to move this code into the join removal
optimization phase. There’s no guarantee that there will not appear
future optimizations that will make impossible to apply min/max
optimization afterward. Keeping the min/max optimization in a single,
consistent location also improves clarity and maintainability of the code.

The simplest solution, in my opinion, is to extend the current logic to
include type checking for the JoinExpr types as I did in my patch.
Specifically, we can verify that it now involves only a single relation
or partitioned relations without formation join algorithms.

I'm just studying this for the first time so apologies if this review
is not quite up to standard.

No need to apologize - you raised important points during both the patch
and issue reviews. Thanks for your valuable and helpful feedback!

--
Regards,
Alena Rybakina
Postgres Professional

Tom Lane

tgl@sss.pgh.pa.us

8 months ago

In reply to: Robert Haas (#2)

Re: Useless LEFT JOIN breaks MIN/MAX optimization

Robert Haas <robertmhaas@gmail.com> writes:

But that's also assuming that you're correct here about how to descend
through a JoinExpr, which I'm not quite sure whether is true. It's
also assuming that we should solve the problem here rather than in
some other part of the code e.g. the join removal code, and I'm not
sure about that either.

The actual problem here is that remove_useless_joins hasn't run yet.
It's called inside query_planner which happens only after we do
preprocess_minmax_aggregates.

So I think this patch is a dead end. It's not possible for it to
correctly predict whether remove_useless_joins will remove the join,
short of repeating all that work which we surely don't want.
(I'm a bit surprised that it hasn't visibly broken existing test cases.)

It might be possible to move preprocess_minmax_aggregates to happen
after join removal, but I fear it'd require some pretty fundamental
rethinking of how it generates indexscan paths --- recursively
calling query_planner seems dubious. (But maybe that'd work?
The modified query should no longer contain aggs, so we wouldn't
recurse again.)

preprocess_minmax_aggregates is pretty much of a hack anyway.
If you read the comments, it's just full of weird stuff that it
has to duplicate from other places, or things that magically work
because the relevant stuff isn't possible in this query, etc.
Maybe it's time to think about nuking it from orbit and doing a
fresh implementation in some other place that's a better fit.
I have no immediate ideas about what that should look like, other
than it'd be better if it happened after join removal.

regards, tom lane

Alena Rybakina

a.rybakina@postgrespro.ru

8 months ago

In reply to: Tom Lane (#4)

1 attachment(s)

Re: Useless LEFT JOIN breaks MIN/MAX optimization

On 12.05.2025 14:05, Tom Lane wrote:

Robert Haas<robertmhaas@gmail.com> writes:

But that's also assuming that you're correct here about how to descend
through a JoinExpr, which I'm not quite sure whether is true. It's
also assuming that we should solve the problem here rather than in
some other part of the code e.g. the join removal code, and I'm not
sure about that either.

The actual problem here is that remove_useless_joins hasn't run yet.
It's called inside query_planner which happens only after we do
preprocess_minmax_aggregates.

So I think this patch is a dead end. It's not possible for it to
correctly predict whether remove_useless_joins will remove the join,
short of repeating all that work which we surely don't want.
(I'm a bit surprised that it hasn't visibly broken existing test cases.)

To be honest, I was not completely sure about my decision at first and
had no idea how to do it differently, so I submitted a request for
"Advanced session feedback" to consider this patch.

It might be possible to move preprocess_minmax_aggregates to happen
after join removal, but I fear it'd require some pretty fundamental
rethinking of how it generates indexscan paths --- recursively
calling query_planner seems dubious. (But maybe that'd work?
The modified query should no longer contain aggs, so we wouldn't
recurse again.)

I considered another approach using late optimization and ran into a
problem where the planner could not find a partitioned table.

It was a long time ago to be frankly, but the problem there was that the
planner stored this information at a higher level. I can try to finish this.

I attached a diff just in case.

preprocess_minmax_aggregates is pretty much of a hack anyway.
If you read the comments, it's just full of weird stuff that it
has to duplicate from other places, or things that magically work
because the relevant stuff isn't possible in this query, etc.
Maybe it's time to think about nuking it from orbit and doing a
fresh implementation in some other place that's a better fit.
I have no immediate ideas about what that should look like, other
than it'd be better if it happened after join removal.

I didn't consider this and I'll think about it.

Thanks for the feedback!

Regards,
Alena Rybakina
Postgres Professional

Attachments:

min_max.diff.no-cfbottext/plain; charset=UTF-8; name=min_max.diff.no-cfbotDownload

diff --git a/src/backend/optimizer/plan/planagg.c b/src/backend/optimizer/plan/planagg.c
index 64605be31781f9e841f02ea005e9220a721e45aa..c78f32e850bb2bf884a63971898de2c6a88675ea 100644
--- a/src/backend/optimizer/plan/planagg.c
+++ b/src/backend/optimizer/plan/planagg.c
@@ -125,16 +125,24 @@ preprocess_minmax_aggregates(PlannerInfo *root)
 			return;
 		jtnode = linitial(jtnode->fromlist);
 	}
-	if (!IsA(jtnode, RangeTblRef))
+	if (IsA(jtnode, JoinExpr) && jtnode->fromlist == NIL && root->join_info_list == NIL && IsA(jtnode->quals, RangeTblRef))
+		rtr = (RangeTblRef *) (jtnode->quals);
+	else if (IsA(jtnode, RangeTblRef))
+	{
+		rtr = (RangeTblRef *) jtnode;
+	}
+	else
+		return;
+
+	if(root->simple_rel_array[rtr->rtindex] == NULL)
 		return;
-	rtr = (RangeTblRef *) jtnode;
+
 	rte = planner_rt_fetch(rtr->rtindex, root);
+
 	if (rte->rtekind == RTE_RELATION)
-		 /* ordinary relation, ok */ ;
+		/* ordinary relation, ok */ ;
 	else if (rte->rtekind == RTE_SUBQUERY && rte->inh)
-		 /* flattened UNION ALL subquery, ok */ ;
-	else
-		return;
+		/* flattened UNION ALL subquery, ok */ ;
 
 	/*
 	 * Examine all the aggregates and verify all are MIN/MAX aggregates.  Stop
@@ -345,6 +353,7 @@ build_minmax_path(PlannerInfo *root, MinMaxAggInfo *mminfo,
 	subroot->init_plans = NIL;
 	subroot->agginfos = NIL;
 	subroot->aggtransinfos = NIL;
+	subroot->eq_classes = NIL;
 
 	subroot->parse = parse = copyObject(root->parse);
 	IncrementVarSublevelsUp((Node *) parse, 1, 1);
diff --git a/src/backend/optimizer/plan/planmain.c b/src/backend/optimizer/plan/planmain.c
index ade23fd9d56d2a96ff62ceeeea67869aa96e368a..563e689dfc0c3f2e8aad2dea367a928e0467598a 100644
--- a/src/backend/optimizer/plan/planmain.c
+++ b/src/backend/optimizer/plan/planmain.c
@@ -78,6 +78,7 @@ query_planner(PlannerInfo *root,
 	root->placeholder_array_size = 0;
 	root->fkey_list = NIL;
 	root->initial_rels = NIL;
+	root->placeholdersFrozen = false;
 
 	/*
 	 * Set up arrays for accessing base relations and AppendRelInfos.
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 8a474a50be73c2a73607ee2c775758ac95999b79..9627ea2aed196b3bcb46ff82f2a9d6535473a1d0 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -1535,15 +1535,6 @@ grouping_planner(PlannerInfo *root, double tuple_fraction,
 				parse->hasWindowFuncs = false;
 		}
 
-		/*
-		 * Preprocess MIN/MAX aggregates, if any.  Note: be careful about
-		 * adding logic between here and the query_planner() call.  Anything
-		 * that is needed in MIN/MAX-optimizable cases will have to be
-		 * duplicated in planagg.c.
-		 */
-		if (parse->hasAggs)
-			preprocess_minmax_aggregates(root);
-
 		/*
 		 * Figure out whether there's a hard limit on the number of rows that
 		 * query_planner's result subplan needs to return.  Even if we know a
@@ -1612,6 +1603,15 @@ grouping_planner(PlannerInfo *root, double tuple_fraction,
 			sort_input_target_parallel_safe = final_target_parallel_safe;
 		}
 
+		/*
+		 * Preprocess MIN/MAX aggregates, if any.  Note: be careful about
+		 * adding logic between here and the query_planner() call.  Anything
+		 * that is needed in MIN/MAX-optimizable cases will have to be
+		 * duplicated in planagg.c.
+		 */
+		if (parse->hasAggs)
+			preprocess_minmax_aggregates(root);
+
 		/*
 		 * If we have window functions to deal with, the output from any
 		 * grouping step needs to be what the window functions want;
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index f2fb66388cc91934bf90faeaea7bc06751b14670..4cab1539cffd075f870d04b2639a33a33cbb8f2d 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -1329,6 +1329,24 @@ from int4_tbl t0;
 (5 rows)
 
 rollback;
+--Check min/max optimization for join expressions
+create unique index tenk1_unique1_idx on tenk1(unique1);
+create index tenk1_unique2_idx on tenk1(unique2);
+create unique index INT4_TBL_f1_idx on INT4_TBL(f1);
+explain (COSTS OFF)
+select min(t1.unique2) from tenk1 t1 left join INT4_TBL t2 on t1.unique1 = t2.f1;
+                          QUERY PLAN                          
+--------------------------------------------------------------
+ Result
+   InitPlan 1
+     ->  Limit
+           ->  Index Scan using tenk1_unique2_idx on tenk1 t1
+                 Index Cond: (unique2 IS NOT NULL)
+(5 rows)
+
+DROP INDEX tenk1_unique1_idx;
+DROP INDEX tenk1_unique2_idx;
+DROP INDEX INT4_TBL_f1_idx;
 -- check for correct detection of nested-aggregate errors
 select max(min(unique1)) from tenk1;
 ERROR:  aggregate function calls cannot be nested
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 77168bcc7447c5d8dec11cd133c9355f82b33235..9b01b01e928af84f98484b17fd8c66134fa33a77 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -452,6 +452,16 @@ select f1, (select distinct min(t1.f1) from int4_tbl t1 where t1.f1 = t0.f1)
 from int4_tbl t0;
 rollback;
 
+--Check min/max optimization for join expressions
+create unique index tenk1_unique1_idx on tenk1(unique1);
+create index tenk1_unique2_idx on tenk1(unique2);
+create unique index INT4_TBL_f1_idx on INT4_TBL(f1);
+explain (COSTS OFF)
+select min(t1.unique2) from tenk1 t1 left join INT4_TBL t2 on t1.unique1 = t2.f1;
+DROP INDEX tenk1_unique1_idx;
+DROP INDEX tenk1_unique2_idx;
+DROP INDEX INT4_TBL_f1_idx;
+
 -- check for correct detection of nested-aggregate errors
 select max(min(unique1)) from tenk1;
 select (select max(min(unique1)) from int8_tbl) from tenk1;