diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c index 129fc3d..e65c21b 100644 --- a/src/backend/optimizer/plan/analyzejoins.c +++ b/src/backend/optimizer/plan/analyzejoins.c @@ -27,9 +27,13 @@ #include "optimizer/paths.h" #include "optimizer/planmain.h" #include "optimizer/var.h" +#include "optimizer/clauses.h" +#include "parser/parsetree.h" /* local functions */ static bool join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo); +static bool sortclause_is_unique_on_restrictinfo(Query *query, + List *clause_list, List *sortclause); static void remove_rel_from_query(PlannerInfo *root, int relid, Relids joinrelids); static List *remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved); @@ -154,11 +158,13 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo) /* * Currently, we only know how to remove left joins to a baserel with - * unique indexes. We can check most of these criteria pretty trivially - * to avoid doing useless extra work. But checking whether any of the - * indexes are unique would require iterating over the indexlist, so for - * now we just make sure there are indexes of some sort or other. If none - * of them are unique, join removal will still fail, just slightly later. + * unique indexes and left joins to a subquery where the subquery is + * unique on the join condition. We can check most of these criteria + * pretty trivially to avoid doing useless extra work. But checking + * whether any of the indexes are unique would require iterating over + * the indexlist, so for now, if we're joining to a relation, we'll just + * ensure that we have at least 1 index, it won't matter if that index + * is unique at this stage, we'll check those details later. */ if (sjinfo->jointype != JOIN_LEFT || sjinfo->delay_upper_joins || @@ -168,11 +174,17 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo) innerrelid = bms_singleton_member(sjinfo->min_righthand); innerrel = find_base_rel(root, innerrelid); - if (innerrel->reloptkind != RELOPT_BASEREL || - innerrel->rtekind != RTE_RELATION || - innerrel->indexlist == NIL) + if (innerrel->reloptkind != RELOPT_BASEREL) return false; + if (innerrel->rtekind == RTE_RELATION) + { + if (innerrel->indexlist == NIL) + return false; /* no possibility of a unique index */ + } + else if (innerrel->rtekind != RTE_SUBQUERY) + return false; /* unsupported rtekind */ + /* Compute the relid set for the join we are considering */ joinrelids = bms_union(sjinfo->min_lefthand, sjinfo->min_righthand); @@ -276,16 +288,128 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo) */ /* Now examine the indexes to see if we have a matching unique index */ - if (relation_has_unique_index_for(root, innerrel, clause_list, NIL, NIL)) + if (innerrel->rtekind == RTE_RELATION && + relation_has_unique_index_for(root, innerrel, clause_list, NIL, NIL)) return true; /* + * We can be certain that the sub query contains no duplicate values for the join + * clause if item in the sub query's GROUP BY clause is also used in the join clause + * using equality. This works the same way for the DISTINCT clause. We need not pay + * any attention to WHERE or HAVING clauses as these just restrict the results more + * and could not be the cause of duplication in the result set. + */ + if (innerrel->rtekind == RTE_SUBQUERY) + { + Query *query = root->simple_rte_array[innerrelid]->subquery; + + if (sortclause_is_unique_on_restrictinfo(query, clause_list, query->groupClause) || + sortclause_is_unique_on_restrictinfo(query, clause_list, query->distinctClause)) + return true; + } + /* XXX is this comment still needed?? * Some day it would be nice to check for other methods of establishing * distinctness. */ return false; } +/* + * sortclause_is_unique_on_restrictinfo + * + * Checks to see if all items in sortclause also exist in clause_list. + * The function will return true if clause_list is the same as or a superset + * of the sortclause. If the sortclause has columns that don't exist in the + * clause_list then the query can't be guaranteed unique on the clause_list + * columns. Also if the targetList expression contains any volatile functions + * then we return false as: + * SELECT DISTINCT a+random() FROM (VALUES(1),(1)) a(a); + * will most likely return more than 1 row. + */ +static bool +sortclause_is_unique_on_restrictinfo(Query *query, List *clause_list, List *sortclause) +{ + ListCell *l; + + /* + * if this sortclause is empty then the query can't be unique + * on the clause list. + */ + if (sortclause == NIL) + return false; + + /* + * Loop over each sort clause to ensure that we have + * an item in the join conditions that matches it. + * It does not matter if we have more items in the join + * condition than we have in the sort clause + */ + foreach(l, sortclause) + { + ListCell *ri; + SortGroupClause *scl = (SortGroupClause *) lfirst(l); + TargetEntry *sortTarget = NULL; + bool matched = false; + ListCell *l1; + + /* search the targetlist for the TargetEntry for this sort clause */ + /* XXX Surely there is a function to do this for us?? */ + foreach(l1, query->targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(l1); + + if (tle->ressortgroupref == scl->tleSortGroupRef) + { + sortTarget = tle; + break; + } + } + + if (sortTarget == NULL) + elog(ERROR, "Unable to find sort target in targetlist"); + + /* + * Since a constant only has 1 value the existence of one here will + * not cause any duplication of the results. We'll simply ignore it! + */ + if (IsA(sortTarget->expr, Const)) + continue; + + foreach(ri, clause_list) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(ri); + Node *rexpr; + + if (rinfo->outer_is_left) + rexpr = get_rightop(rinfo->clause); + else + rexpr = get_leftop(rinfo->clause); + + if (IsA(rexpr, Var)) + { + Var *var = (Var *)rexpr; + TargetEntry *tle = get_tle_by_resno(query->targetList, var->varattno); + + /* Can't remove join if the expression contains a volatile function */ + if (contain_volatile_functions((Node *) tle->expr)) + return false; + + if (tle->resorigtbl == sortTarget->resorigtbl && + tle->resno == sortTarget->resno) + { + matched = true; + break; /* match found */ + } + } + else /* XXX what else could it be? */ + return false; + } + + if (!matched) + return false; + } + return true; +} /* * Remove the target relid from the planner's data structures, having diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 934488a..1555aed 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -3098,6 +3098,123 @@ select id from a where id in ( -> Seq Scan on b (5 rows) +-- check optimization of outer join when joining a unique sub query using group by +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b GROUP BY b.id) b ON a.id = b.id; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- check optimization of outer join when joining a unique sub query using distinct +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.c_id FROM b) b ON a.id = b.c_id; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- check optimization of outer join when joining a unique sub query using distinct +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT a+b AS ab FROM d) d ON a.id = d.ab; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- optimization is not possible when distinct contains a volatile function +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT a+b+random() AS abr FROM d) d ON a.id = d.abr; + QUERY PLAN +------------------------------------------------------------------------------------------ + Hash Left Join + Hash Cond: ((a.id)::double precision = ((((d.a + d.b))::double precision + random()))) + -> Seq Scan on a + -> Hash + -> HashAggregate + Group Key: (((d.a + d.b))::double precision + random()) + -> Seq Scan on d +(7 rows) + +-- check optimization of outer join when joining a unique sub query using distinct +-- and a constant expression. +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.c_id,1 AS dummy FROM b) b ON a.id = b.c_id; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- check optimization of outer join when joining a unique sub query which contains 2 tables +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id = b.id; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- check optimization of outer join when joining a unique sub query which contains 2 tables +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id + 10 = b.id; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- check optimization of outer join when joining a unique sub query which contains 2 tables +-- and contains a redundant join clause +EXPLAIN (COSTS OFF) +SELECT a.id FROM a +LEFT JOIN (SELECT b.id,1 as dummy FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id = b.id AND b.dummy = 1; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- optimization is not possible when the group by contains a column which is not in the join condition. +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b GROUP BY b.id,b.c_id) b ON a.id = b.id; + QUERY PLAN +--------------------------------- + Hash Right Join + Hash Cond: (b.id = a.id) + -> HashAggregate + Group Key: b.id, b.c_id + -> Seq Scan on b + -> Hash + -> Seq Scan on a +(7 rows) + +-- optimization is not possible when distinct clause contains an item that is not in the join clause +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.id,random() AS r FROM b) b ON a.id = b.id; + QUERY PLAN +----------------------------------- + Hash Right Join + Hash Cond: (b.id = a.id) + -> HashAggregate + Group Key: b.id, random() + -> Seq Scan on b + -> Hash + -> Seq Scan on a +(7 rows) + +-- optimization is not possible when distinct contains a volatile function +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.id,random() AS r FROM b) b ON a.id = b.id AND r = random(); + QUERY PLAN +------------------------------------------------- + Hash Left Join + Hash Cond: (a.id = b.id) + -> Seq Scan on a + -> Hash + -> Subquery Scan on b + Filter: (b.r = random()) + -> HashAggregate + Group Key: b_1.id, random() + -> Seq Scan on b b_1 +(9 rows) + rollback; create temp table parent (k int primary key, pd int); create temp table child (k int unique, cd int); diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 275cb11..153a0bc 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -861,9 +861,11 @@ begin; CREATE TEMP TABLE a (id int PRIMARY KEY, b_id int); CREATE TEMP TABLE b (id int PRIMARY KEY, c_id int); CREATE TEMP TABLE c (id int PRIMARY KEY); +CREATE TEMP TABLE d (a INT, b INT); INSERT INTO a VALUES (0, 0), (1, NULL); INSERT INTO b VALUES (0, 0), (1, NULL); INSERT INTO c VALUES (0), (1); +INSERT INTO d VALUES (1,3),(2,2),(3,1); -- all three cases should be optimizable into a simple seqscan explain (costs off) SELECT a.* FROM a LEFT JOIN b ON a.b_id = b.id; @@ -878,6 +880,53 @@ select id from a where id in ( select b.id from b left join c on b.id = c.id ); +-- check optimization of outer join when joining a unique sub query using group by +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b GROUP BY b.id) b ON a.id = b.id; + +-- check optimization of outer join when joining a unique sub query using distinct +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.c_id FROM b) b ON a.id = b.c_id; + +-- check optimization of outer join when joining a unique sub query using distinct +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT a+b AS ab FROM d) d ON a.id = d.ab; + +-- optimization is not possible when distinct contains a volatile function +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT a+b+random() AS abr FROM d) d ON a.id = d.abr; + +-- check optimization of outer join when joining a unique sub query using distinct +-- and a constant expression. +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.c_id,1 AS dummy FROM b) b ON a.id = b.c_id; + +-- check optimization of outer join when joining a unique sub query which contains 2 tables +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id = b.id; + +-- check optimization of outer join when joining a unique sub query which contains 2 tables +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id + 10 = b.id; + +-- check optimization of outer join when joining a unique sub query which contains 2 tables +-- and contains a redundant join clause +EXPLAIN (COSTS OFF) +SELECT a.id FROM a +LEFT JOIN (SELECT b.id,1 as dummy FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id = b.id AND b.dummy = 1; + +-- optimization is not possible when the group by contains a column which is not in the join condition. +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b GROUP BY b.id,b.c_id) b ON a.id = b.id; + +-- optimization is not possible when distinct clause contains an item that is not in the join clause +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.id,random() AS r FROM b) b ON a.id = b.id; + +-- optimization is not possible when distinct contains a volatile function +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.id,random() AS r FROM b) b ON a.id = b.id AND r = random(); + rollback; create temp table parent (k int primary key, pd int);