diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c index 5783f90..615ae59 100644 --- a/src/backend/optimizer/plan/analyzejoins.c +++ b/src/backend/optimizer/plan/analyzejoins.c @@ -150,10 +150,12 @@ clause_sides_match_join(RestrictInfo *rinfo, Relids outerrelids, * it will just duplicate its left input. * * This is true for a left join for which the join condition cannot match - * more than one inner-side row. (There are other possibly interesting - * cases, but we don't have the infrastructure to prove them.) We also - * have to check that the inner side doesn't generate any variables needed - * above the join. + * more than one inner-side row. We can also allow removal of joins to + * relations that may match more than one inner-side row if a DISTINCT or + * GROUP BY clause would subsequently remove any duplicates caused by the + * join. (There are other possibly interesting cases, but we don't have the + * infrastructure to prove them.) We also have to check that the inner side + * doesn't generate any variables needed above the join. */ static bool join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo) @@ -181,9 +183,11 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo) /* * Before we go to the effort of checking whether any innerrel variables * are needed above the join, make a quick check to eliminate cases in - * which we will surely be unable to prove uniqueness of the innerrel. + * which we will surely be unable to remove the join. */ - if (!rel_supports_distinctness(root, innerrel)) + if (!rel_supports_distinctness(root, innerrel) && + root->parse->distinctClause == NIL && + (root->parse->groupClause == NIL || root->parse->hasAggs)) return false; /* Compute the relid set for the join we are considering */ @@ -237,6 +241,22 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo) } /* + * When a DISTINCT, DISTINCT ON or GROUP BY clause is present, the + * unreferenced relation's join has no ability to duplicate rows in final + * result set, as any duplicate rows would be removed by the + * DISTINCT/GROUP BY clause anyway. However, we're unable to apply this + * when aggregate functions are present as we must aggregate any + * duplicated rows. We needn't bother checking the actual distinct or + * grouping expressions here, as we already know from the above checks + * that no Var is present from the relation we're trying to remove. + */ + if (root->parse->distinctClause != NIL) + return true; + + if (root->parse->groupClause != NIL && !root->parse->hasAggs) + return true; + + /* * Search for mergejoinable clauses that constrain the inner rel against * either the outer rel or a pseudoconstant. If an operator is * mergejoinable then it behaves like equality for some btree opclass, so diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index f02ef3f..75ad4e1 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -4051,6 +4051,93 @@ select d.* from d left join (select id from a union select id from b) s Seq Scan on d (1 row) +-- check join removal works without a unique index on the left joined table +-- when a DISTINCT or GROUP BY is present which would remove row duplication +explain (costs off) +select distinct a.*,b.* from a left join b on a.b_id = b.id left join d + on a.b_id = d.a; + QUERY PLAN +----------------------------------------- + HashAggregate + Group Key: a.id, a.b_id, b.id, b.c_id + -> Hash Left Join + Hash Cond: (a.b_id = b.id) + -> Seq Scan on a + -> Hash + -> Seq Scan on b +(7 rows) + +-- as above but with DISTINCT ON +explain (costs off) +select distinct on (a.id,b.id) a.*,b.* from a left join b on a.b_id = b.id left join d + on a.b_id = d.a order by a.id,b.id,a.b_id; + QUERY PLAN +------------------------------------------ + Unique + -> Sort + Sort Key: a.id, b.id, a.b_id + -> Hash Left Join + Hash Cond: (a.b_id = b.id) + -> Seq Scan on a + -> Hash + -> Seq Scan on b +(8 rows) + +-- as above but with GROUP BY +explain (costs off) +select a.id,b.id from a left join b on a.b_id = b.id left join d + on a.b_id = d.a group by a.id,b.id; + QUERY PLAN +------------------------------------ + HashAggregate + Group Key: a.id, b.id + -> Hash Left Join + Hash Cond: (a.b_id = b.id) + -> Seq Scan on a + -> Hash + -> Seq Scan on b +(7 rows) + +-- also ensure the removal works with other relation types. +explain (costs off) +select distinct on (a.id,b.id) a.*,b.* from a +left join b on a.b_id = b.id +left join (values(1),(1)) d(a) + on a.b_id = d.a order by a.id,b.id,a.b_id; + QUERY PLAN +------------------------------------------ + Unique + -> Sort + Sort Key: a.id, b.id, a.b_id + -> Hash Left Join + Hash Cond: (a.b_id = b.id) + -> Seq Scan on a + -> Hash + -> Seq Scan on b +(8 rows) + +-- ensure no join removal when we must aggregate any duplicated rows +explain (costs off) +select a.id,b.id,count(*) from a left join b on a.b_id = b.id left join d + on a.b_id = d.a group by a.id,b.id; + QUERY PLAN +------------------------------------------------ + HashAggregate + Group Key: a.id, b.id + -> Merge Left Join + Merge Cond: (a.b_id = d.a) + -> Sort + Sort Key: a.b_id + -> Hash Left Join + Hash Cond: (a.b_id = b.id) + -> Seq Scan on a + -> Hash + -> Seq Scan on b + -> Sort + Sort Key: d.a + -> Seq Scan on d +(14 rows) + -- check join removal with a cross-type comparison operator explain (costs off) select i8.* from int8_tbl i8 left join (select f1 from int4_tbl group by f1) i4 diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index cbb71c5..31d6ff1 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -1331,6 +1331,34 @@ explain (costs off) select d.* from d left join (select id from a union select id from b) s on d.a = s.id; +-- check join removal works without a unique index on the left joined table +-- when a DISTINCT or GROUP BY is present which would remove row duplication +explain (costs off) +select distinct a.*,b.* from a left join b on a.b_id = b.id left join d + on a.b_id = d.a; + +-- as above but with DISTINCT ON +explain (costs off) +select distinct on (a.id,b.id) a.*,b.* from a left join b on a.b_id = b.id left join d + on a.b_id = d.a order by a.id,b.id,a.b_id; + +-- as above but with GROUP BY +explain (costs off) +select a.id,b.id from a left join b on a.b_id = b.id left join d + on a.b_id = d.a group by a.id,b.id; + +-- also ensure the removal works with other relation types. +explain (costs off) +select distinct on (a.id,b.id) a.*,b.* from a +left join b on a.b_id = b.id +left join (values(1),(1)) d(a) + on a.b_id = d.a order by a.id,b.id,a.b_id; + +-- ensure no join removal when we must aggregate any duplicated rows +explain (costs off) +select a.id,b.id,count(*) from a left join b on a.b_id = b.id left join d + on a.b_id = d.a group by a.id,b.id; + -- check join removal with a cross-type comparison operator explain (costs off) select i8.* from int8_tbl i8 left join (select f1 from int4_tbl group by f1) i4