diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c index 129fc3d..83cb70c 100644 --- a/src/backend/optimizer/plan/analyzejoins.c +++ b/src/backend/optimizer/plan/analyzejoins.c @@ -27,9 +27,15 @@ #include "optimizer/paths.h" #include "optimizer/planmain.h" #include "optimizer/var.h" +#include "optimizer/clauses.h" +#include "parser/parsetree.h" +#include "optimizer/tlist.h" +#include "nodes/nodeFuncs.h" /* local functions */ static bool join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo); +static bool sortclause_is_unique_on_restrictinfo(Query *query, + List *clause_list, List *sortclause); static void remove_rel_from_query(PlannerInfo *root, int relid, Relids joinrelids); static List *remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved); @@ -154,11 +160,13 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo) /* * Currently, we only know how to remove left joins to a baserel with - * unique indexes. We can check most of these criteria pretty trivially - * to avoid doing useless extra work. But checking whether any of the - * indexes are unique would require iterating over the indexlist, so for - * now we just make sure there are indexes of some sort or other. If none - * of them are unique, join removal will still fail, just slightly later. + * unique indexes and left joins to a subquery where the subquery is + * unique on the join condition. We can check most of these criteria + * pretty trivially to avoid doing useless extra work. But checking + * whether any of the indexes are unique would require iterating over + * the indexlist, so for now, if we're joining to a relation, we'll just + * ensure that we have at least 1 index, it won't matter if that index + * is unique at this stage, we'll check those details later. */ if (sjinfo->jointype != JOIN_LEFT || sjinfo->delay_upper_joins || @@ -168,11 +176,17 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo) innerrelid = bms_singleton_member(sjinfo->min_righthand); innerrel = find_base_rel(root, innerrelid); - if (innerrel->reloptkind != RELOPT_BASEREL || - innerrel->rtekind != RTE_RELATION || - innerrel->indexlist == NIL) + if (innerrel->reloptkind != RELOPT_BASEREL) return false; + if (innerrel->rtekind == RTE_RELATION) + { + if (innerrel->indexlist == NIL) + return false; /* no possibility of a unique index */ + } + else if (innerrel->rtekind != RTE_SUBQUERY) + return false; /* unsupported rtekind */ + /* Compute the relid set for the join we are considering */ joinrelids = bms_union(sjinfo->min_lefthand, sjinfo->min_righthand); @@ -276,16 +290,143 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo) */ /* Now examine the indexes to see if we have a matching unique index */ - if (relation_has_unique_index_for(root, innerrel, clause_list, NIL, NIL)) + if (innerrel->rtekind == RTE_RELATION && + relation_has_unique_index_for(root, innerrel, clause_list, NIL, NIL)) return true; /* + * We can be certain that the sub query contains no duplicate values for + * the join clause if item in the sub query's GROUP BY clause is also used + * in the join clause using equality. This works the same way for the + * DISTINCT clause. We need not pay any attention to WHERE or HAVING + * clauses as these just restrict the results more and could not be the + * cause of duplication in the result set. However there are a number of + * pre-checks we must perform before we even look at the GROUP BY or DISTINCT + * clauses. These are described below. + */ + if (innerrel->rtekind == RTE_SUBQUERY) + { + Query *subquery = root->simple_rte_array[innerrelid]->subquery; + + /* + * We cannot remove the subquery if the target list contains any set returning + * functions as these may cause the query not to be unique on the grouping + * columns, as per the following example: + * select a.a,generate_series(1,10) from (values(1)) a(a) group by a + */ + if (expression_returns_set((Node *) subquery->targetList)) + return false; + + /* + * Don't remove the join if the target list contains any volatile functions. + * Doing so may remove desired side affects that calls to the function may + * cause. + */ + if (contain_volatile_functions((Node *) subquery->targetList)) + return false; + + /* + * It should be safe to remove the join if all the group by expressions have matching + * items in the join condition. + */ + if (subquery->groupClause != NIL && + sortclause_is_unique_on_restrictinfo(subquery, clause_list, subquery->groupClause)) + return true; + + /* + * It should be safe to remove the join if all the distinct column list have matching + * items in the join condition. + */ + if (subquery->distinctClause != NIL && + sortclause_is_unique_on_restrictinfo(subquery, clause_list, subquery->distinctClause)) + return true; + + /* + * Note that we must also not remove the join in the subquery contains + * a FOR UDPATE. We can actually skip this check as GROUP BY or DISTINCT + * cannot be used at the same time as FOR UPDATE + */ + } + /* XXX is this comment still needed?? * Some day it would be nice to check for other methods of establishing * distinctness. */ return false; } +/* + * sortclause_is_unique_on_restrictinfo + * + * Checks to see if all items in sortclause also exist in clause_list. + * The function will return true if clause_list is the same as or a superset + * of the sortclause. If the sortclause has columns that don't exist in the + * clause_list then the query can't be guaranteed unique on the clause_list + * columns. Also if the targetList expression contains any volatile functions + * then we return false as something like: + * SELECT DISTINCT a+random() FROM (VALUES(1),(1)) a(a); + * will almost always return more than 1 row. + * + * Note: The calling function must ensure that sortclause is not NIL. + */ +static bool +sortclause_is_unique_on_restrictinfo(Query *query, List *clause_list, List *sortclause) +{ + ListCell *l; + + Assert(sortclause != NIL); + + /* + * Loop over each sort clause to ensure that we have + * an item in the join conditions that matches it. + * It does not matter if we have more items in the join + * condition than we have in the sort clause. + */ + foreach(l, sortclause) + { + ListCell *ri; + SortGroupClause *scl = (SortGroupClause *) lfirst(l); + TargetEntry *sortTarget; + bool matched = false; + + /* lookup the target list entry for the current sort sort group ref */ + sortTarget = get_sortgroupref_tle(scl->tleSortGroupRef, query->targetList); + + /* + * Since a constant only has 1 value the existence of one here will + * not cause any duplication of the results. We'll simply ignore it! + */ + if (IsA(sortTarget->expr, Const)) + continue; + + foreach(ri, clause_list) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(ri); + Node *rexpr; + + if (rinfo->outer_is_left) + rexpr = get_rightop(rinfo->clause); + else + rexpr = get_leftop(rinfo->clause); + + if (IsA(rexpr, Var)) + { + Var *var = (Var *)rexpr; + + if (var->varattno == sortTarget->resno) + { + matched = true; + break; /* match found */ + } + } + else /* XXX what else could it be? */ + return false; + } + + if (!matched) + return false; + } + return true; +} /* * Remove the target relid from the planner's data structures, having diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 934488a..ff13c76 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -3060,9 +3060,11 @@ begin; CREATE TEMP TABLE a (id int PRIMARY KEY, b_id int); CREATE TEMP TABLE b (id int PRIMARY KEY, c_id int); CREATE TEMP TABLE c (id int PRIMARY KEY); +CREATE TEMP TABLE d (a INT, b INT); INSERT INTO a VALUES (0, 0), (1, NULL); INSERT INTO b VALUES (0, 0), (1, NULL); INSERT INTO c VALUES (0), (1); +INSERT INTO d VALUES (1,3),(2,2),(3,1); -- all three cases should be optimizable into a simple seqscan explain (costs off) SELECT a.* FROM a LEFT JOIN b ON a.b_id = b.id; QUERY PLAN @@ -3098,6 +3100,151 @@ select id from a where id in ( -> Seq Scan on b (5 rows) +-- check optimization of outer join when joining a unique sub query using group by +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b GROUP BY b.id) b ON a.id = b.id; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- check optimization of outer join when joining a unique sub query using distinct +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.c_id FROM b) b ON a.id = b.c_id; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- check optimization of outer join when joining a unique sub query using distinct +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT a+b AS ab FROM d) d ON a.id = d.ab; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- optimization is not possible when distinct contains a volatile function +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT a+b+random() AS abr FROM d) d ON a.id = d.abr; + QUERY PLAN +------------------------------------------------------------------------------------------ + Hash Left Join + Hash Cond: ((a.id)::double precision = ((((d.a + d.b))::double precision + random()))) + -> Seq Scan on a + -> Hash + -> HashAggregate + Group Key: (((d.a + d.b))::double precision + random()) + -> Seq Scan on d +(7 rows) + +-- check optimization of outer join when joining a unique sub query using distinct +-- and a constant expression. +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.c_id,1 AS dummy FROM b) b ON a.id = b.c_id; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- check optimization of outer join when joining a unique sub query which contains 2 tables +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id = b.id; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- check optimization of outer join when joining a unique sub query which contains 2 tables +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id + 10 = b.id; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- check optimization of outer join when joining a unique sub query which contains 2 tables +-- and contains a redundant join clause +EXPLAIN (COSTS OFF) +SELECT a.id FROM a +LEFT JOIN (SELECT b.id,1 as dummy FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id = b.id AND b.dummy = 1; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- optimization is not possible when the group by contains a column which is not in the join condition. +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b GROUP BY b.id,b.c_id) b ON a.id = b.id; + QUERY PLAN +--------------------------------- + Hash Right Join + Hash Cond: (b.id = a.id) + -> HashAggregate + Group Key: b.id, b.c_id + -> Seq Scan on b + -> Hash + -> Seq Scan on a +(7 rows) + +-- optimization is not possible when distinct clause contains an item that is not in the join clause +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.id,c_id FROM b) b ON a.id = b.id; + QUERY PLAN +--------------------------------- + Hash Right Join + Hash Cond: (b.id = a.id) + -> HashAggregate + Group Key: b.id, b.c_id + -> Seq Scan on b + -> Hash + -> Seq Scan on a +(7 rows) + +-- optimization is not possible when distinct contains a volatile function +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.id,random() AS r FROM b) b ON a.id = b.id AND r = random(); + QUERY PLAN +------------------------------------------------- + Hash Left Join + Hash Cond: (a.id = b.id) + -> Seq Scan on a + -> Hash + -> Subquery Scan on b + Filter: (b.r = random()) + -> HashAggregate + Group Key: b_1.id, random() + -> Seq Scan on b b_1 +(9 rows) + +-- optimization is not possible when there are any volatile functions in the target list. +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT id,AVG(c_id),SUM(random()) FROM b GROUP BY id) b ON a.id = b.id; + QUERY PLAN +---------------------------- + Hash Right Join + Hash Cond: (b.id = a.id) + -> HashAggregate + Group Key: b.id + -> Seq Scan on b + -> Hash + -> Seq Scan on a +(7 rows) + +-- optimization is not possible when there are set returning functions in the target list. +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT id,generate_series(1,2) FROM b GROUP BY id) b ON a.id = b.id; + QUERY PLAN +---------------------------- + Hash Right Join + Hash Cond: (b.id = a.id) + -> HashAggregate + Group Key: b.id + -> Seq Scan on b + -> Hash + -> Seq Scan on a +(7 rows) + rollback; create temp table parent (k int primary key, pd int); create temp table child (k int unique, cd int); diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 275cb11..d00e7fe 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -861,9 +861,11 @@ begin; CREATE TEMP TABLE a (id int PRIMARY KEY, b_id int); CREATE TEMP TABLE b (id int PRIMARY KEY, c_id int); CREATE TEMP TABLE c (id int PRIMARY KEY); +CREATE TEMP TABLE d (a INT, b INT); INSERT INTO a VALUES (0, 0), (1, NULL); INSERT INTO b VALUES (0, 0), (1, NULL); INSERT INTO c VALUES (0), (1); +INSERT INTO d VALUES (1,3),(2,2),(3,1); -- all three cases should be optimizable into a simple seqscan explain (costs off) SELECT a.* FROM a LEFT JOIN b ON a.b_id = b.id; @@ -878,6 +880,61 @@ select id from a where id in ( select b.id from b left join c on b.id = c.id ); +-- check optimization of outer join when joining a unique sub query using group by +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b GROUP BY b.id) b ON a.id = b.id; + +-- check optimization of outer join when joining a unique sub query using distinct +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.c_id FROM b) b ON a.id = b.c_id; + +-- check optimization of outer join when joining a unique sub query using distinct +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT a+b AS ab FROM d) d ON a.id = d.ab; + +-- optimization is not possible when distinct contains a volatile function +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT a+b+random() AS abr FROM d) d ON a.id = d.abr; + +-- check optimization of outer join when joining a unique sub query using distinct +-- and a constant expression. +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.c_id,1 AS dummy FROM b) b ON a.id = b.c_id; + +-- check optimization of outer join when joining a unique sub query which contains 2 tables +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id = b.id; + +-- check optimization of outer join when joining a unique sub query which contains 2 tables +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id + 10 = b.id; + +-- check optimization of outer join when joining a unique sub query which contains 2 tables +-- and contains a redundant join clause +EXPLAIN (COSTS OFF) +SELECT a.id FROM a +LEFT JOIN (SELECT b.id,1 as dummy FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id = b.id AND b.dummy = 1; + +-- optimization is not possible when the group by contains a column which is not in the join condition. +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b GROUP BY b.id,b.c_id) b ON a.id = b.id; + +-- optimization is not possible when distinct clause contains an item that is not in the join clause +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.id,c_id FROM b) b ON a.id = b.id; + +-- optimization is not possible when distinct contains a volatile function +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.id,random() AS r FROM b) b ON a.id = b.id AND r = random(); + +-- optimization is not possible when there are any volatile functions in the target list. +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT id,AVG(c_id),SUM(random()) FROM b GROUP BY id) b ON a.id = b.id; + +-- optimization is not possible when there are set returning functions in the target list. +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT id,generate_series(1,2) FROM b GROUP BY id) b ON a.id = b.id; + rollback; create temp table parent (k int primary key, pd int);