diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index be92049..1d8614e 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -27,6 +27,7 @@ #include "optimizer/prep.h" #include "optimizer/subselect.h" #include "optimizer/var.h" +#include "parser/parse_clause.h" #include "parser/parse_relation.h" #include "rewrite/rewriteManip.h" #include "utils/builtins.h" @@ -1176,7 +1177,7 @@ SS_process_ctes(PlannerInfo *root) */ JoinExpr * convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, - Relids available_rels) + bool under_not, Relids available_rels) { JoinExpr *result; Query *parse = root->parse; @@ -1191,6 +1192,15 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, Assert(sublink->subLinkType == ANY_SUBLINK); /* + * For the case of NOT IN, we can only perform this optimization if we are + * guaranteed that we'll never get NULL values from the subquery results. + * For example "SELECT 1 WHERE 3 NOT IN(1,2,NULL)", the query will produce + * 0 rows due to the fact that 3 <> NULL is unknown. + */ + if (under_not && queryTargetListCanHaveNulls(root, subselect)) + return NULL; + + /* * The sub-select must not refer to any Vars of the parent query. (Vars of * higher levels should be okay, though.) */ @@ -1256,7 +1266,7 @@ convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, * And finally, build the JoinExpr node. */ result = makeNode(JoinExpr); - result->jointype = JOIN_SEMI; + result->jointype = under_not ? JOIN_ANTI : JOIN_SEMI; result->isNatural = false; result->larg = NULL; /* caller must fill this in */ result->rarg = (Node *) rtr; diff --git a/src/backend/optimizer/prep/prepjointree.c b/src/backend/optimizer/prep/prepjointree.c index 776fe42..7ea165d 100644 --- a/src/backend/optimizer/prep/prepjointree.c +++ b/src/backend/optimizer/prep/prepjointree.c @@ -334,7 +334,7 @@ pull_up_sublinks_qual_recurse(PlannerInfo *root, Node *node, /* Is it a convertible ANY or EXISTS clause? */ if (sublink->subLinkType == ANY_SUBLINK) { - if ((j = convert_ANY_sublink_to_join(root, sublink, + if ((j = convert_ANY_sublink_to_join(root, sublink, false, available_rels1)) != NULL) { /* Yes; insert the new join node into the join tree */ @@ -360,7 +360,7 @@ pull_up_sublinks_qual_recurse(PlannerInfo *root, Node *node, return NULL; } if (available_rels2 != NULL && - (j = convert_ANY_sublink_to_join(root, sublink, + (j = convert_ANY_sublink_to_join(root, sublink, false, available_rels2)) != NULL) { /* Yes; insert the new join node into the join tree */ @@ -452,7 +452,61 @@ pull_up_sublinks_qual_recurse(PlannerInfo *root, Node *node, if (sublink && IsA(sublink, SubLink)) { - if (sublink->subLinkType == EXISTS_SUBLINK) + if (sublink->subLinkType == ANY_SUBLINK) + { + if ((j = convert_ANY_sublink_to_join(root, sublink, true, + available_rels1)) != NULL) + { + /* Yes; insert the new join node into the join tree */ + j->larg = *jtlink1; + *jtlink1 = (Node *) j; + /* Recursively process pulled-up jointree nodes */ + j->rarg = pull_up_sublinks_jointree_recurse(root, + j->rarg, + &child_rels); + + /* + * Now recursively process the pulled-up quals. Because + * we are underneath a NOT, we can't pull up sublinks that + * reference the left-hand stuff, but it's still okay to + * pull up sublinks referencing j->rarg. + */ + j->quals = pull_up_sublinks_qual_recurse(root, + j->quals, + &j->rarg, + child_rels, + NULL, NULL); + /* Return NULL representing constant TRUE */ + return NULL; + } + if (available_rels2 != NULL && + (j = convert_ANY_sublink_to_join(root, sublink, true, + available_rels2)) != NULL) + { + /* Yes; insert the new join node into the join tree */ + j->larg = *jtlink2; + *jtlink2 = (Node *) j; + /* Recursively process pulled-up jointree nodes */ + j->rarg = pull_up_sublinks_jointree_recurse(root, + j->rarg, + &child_rels); + + /* + * Now recursively process the pulled-up quals. Because + * we are underneath a NOT, we can't pull up sublinks that + * reference the left-hand stuff, but it's still okay to + * pull up sublinks referencing j->rarg. + */ + j->quals = pull_up_sublinks_qual_recurse(root, + j->quals, + &j->rarg, + child_rels, + NULL, NULL); + /* Return NULL representing constant TRUE */ + return NULL; + } + } + else if (sublink->subLinkType == EXISTS_SUBLINK) { if ((j = convert_EXISTS_sublink_to_join(root, sublink, true, available_rels1)) != NULL) @@ -506,6 +560,7 @@ pull_up_sublinks_qual_recurse(PlannerInfo *root, Node *node, return NULL; } } + } /* Else return it unmodified */ return node; diff --git a/src/backend/parser/parse_clause.c b/src/backend/parser/parse_clause.c index fcee137..7de2a52 100644 --- a/src/backend/parser/parse_clause.c +++ b/src/backend/parser/parse_clause.c @@ -21,6 +21,7 @@ #include "commands/defrem.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" +#include "optimizer/clauses.h" #include "optimizer/tlist.h" #include "parser/analyze.h" #include "parser/parsetree.h" @@ -2406,6 +2407,115 @@ assignSortGroupRef(TargetEntry *tle, List *tlist) return tle->ressortgroupref; } + +/* + * queryTargetListListCanHaveNulls + * Checks if a query's targetList could produce NULL values. We only + * return false if we can prove without question that no NULLs can exist + * in all target list entries in the query. + * + * Note: resjunk targets are ignored here. + */ +bool +queryTargetListCanHaveNulls(PlannerInfo *root, Query *query) +{ + List *local_nonnullable_vars; + bool computed_nonnullable_vars = false; + ListCell *tl; + Node *node; + + /* + * It should also be possible to determine if no NULLs can exist in the + * results even when set operators are present in the query, but for now + * we'll just report that NULLs are possible. It may be worth fixing this + * up in the future, but at the time of writing this function, no call + * sites existed which would call the function if the query contained set + * operators. + */ + if (query->setOperations) + return true; + + + foreach(tl, query->targetList) + { + TargetEntry *tle = (TargetEntry *) lfirst(tl); + + /* ignore columns which won't be in the final results */ + if (tle->resjunk) + continue; + + node = (Node *) tle->expr; + + /* for Constant expressions we can tell directly if they're NULL or not. */ + if (IsA(node, Const)) + { + if (((Const *) node)->constisnull) + return true; + } + + else if (IsA(node, Var)) + { + ListCell *lc; + RangeTblEntry *rte; + bool matched; + Var *tlevar = (Var *) node; + //Relids joinrelids; + /* + * Now let's look at the WHERE clause items to see if there's anything in + * there that will disallow NULLs from the results. + */ + if (!computed_nonnullable_vars) + { + local_nonnullable_vars = find_nonnullable_vars(query->jointree->quals); + computed_nonnullable_vars = true; + } + + matched = false; + foreach(lc, local_nonnullable_vars) + { + Var *var = (Var *) lfirst(lc); + + if (var->varno == tlevar->varno && var->varattno == tlevar->varattno) + { + matched = true; + break; + } + } + + /* Did we find the var non-nullable? */ + if (matched) + continue; /* ok onto the next one */ + + /* + * If we make it down here then we've been unable to find any quals + * then ensure that the TargetEntry won't be NULL. All we have left + * to go on is a NOT NULL constraint on the column, though this is + * only going to prove that no nulls can be present if the relation + * that the column belongs to is NOT from an OUTER JOIN. + * + * First we'll ensure we're dealing with a Var that's directly from + * a table, then we'll ensure it's from an INNER JOIN type, finally + * we'll look for a NOT NULL constraint on the column. + */ + + if (!OidIsValid(tle->resorigtbl)) + return true; /* Var is not from a table */ + + /* WRONG RangeTblEntry!!! */ + rte = rt_fetch(tlevar->varno, query->rtable); + + if (IS_OUTER_JOIN(rte->jointype)) + return true; /* Var from an outer join */ + + if (!get_attnotnull(tle->resorigtbl, tle->resorigcol)) + return true; /* No NOT NULL constraint */ + } + else + return true; /* not a Const or a Var */ + } + return false; /* Cannot have NULLs */ +} + /* * targetIsInSortList * Is the given target item already in the sortlist? diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c index 4b5ef99..c98adc7 100644 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@ -816,6 +816,37 @@ get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum) /* ---------- ATTRIBUTE CACHES ---------- */ /* + * get_attnotnull + * Returns true if pg_attribute.attnotnull is true, otherwise returns + * false. An error is raised if no record is found for the relid/attnum. + * + * Note: Calling functions should be careful and test relid for InvalidOid + * before calling this function. + */ +bool +get_attnotnull(Oid relid, AttrNumber attnum) +{ + HeapTuple tp; + + tp = SearchSysCache2(ATTNUM, + ObjectIdGetDatum(relid), + Int16GetDatum(attnum)); + if (HeapTupleIsValid(tp)) + { + Form_pg_attribute att_tup = (Form_pg_attribute) GETSTRUCT(tp); + bool result = att_tup->attnotnull; + ReleaseSysCache(tp); + return result; + } + else + { + elog(ERROR, "cache lookup failed for attribute %d of relation %u", + attnum, relid); + return false; /* keep compiler quiet */ + } +} + +/* * get_attname * Given the relation id and the attribute number, * return the "attname" field from the attribute relation. diff --git a/src/include/optimizer/subselect.h b/src/include/optimizer/subselect.h index 5607e98..3e8bfe7 100644 --- a/src/include/optimizer/subselect.h +++ b/src/include/optimizer/subselect.h @@ -18,6 +18,7 @@ extern void SS_process_ctes(PlannerInfo *root); extern JoinExpr *convert_ANY_sublink_to_join(PlannerInfo *root, SubLink *sublink, + bool under_not, Relids available_rels); extern JoinExpr *convert_EXISTS_sublink_to_join(PlannerInfo *root, SubLink *sublink, diff --git a/src/include/parser/parse_clause.h b/src/include/parser/parse_clause.h index e9e7cdc..8201574 100644 --- a/src/include/parser/parse_clause.h +++ b/src/include/parser/parse_clause.h @@ -15,6 +15,7 @@ #define PARSE_CLAUSE_H #include "parser/parse_node.h" +#include "nodes/relation.h" extern void transformFromClause(ParseState *pstate, List *frmList); extern int setTargetTable(ParseState *pstate, RangeVar *relation, @@ -47,5 +48,6 @@ extern List *addTargetToSortList(ParseState *pstate, TargetEntry *tle, bool resolveUnknown); extern Index assignSortGroupRef(TargetEntry *tle, List *tlist); extern bool targetIsInSortList(TargetEntry *tle, Oid sortop, List *sortList); +extern bool queryTargetListCanHaveNulls(PlannerInfo *root, Query *query); #endif /* PARSE_CLAUSE_H */ diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h index f46460a..5e7d946 100644 --- a/src/include/utils/lsyscache.h +++ b/src/include/utils/lsyscache.h @@ -63,6 +63,7 @@ extern List *get_op_btree_interpretation(Oid opno); extern bool equality_ops_are_compatible(Oid opno1, Oid opno2); extern Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum); +extern bool get_attnotnull(Oid relid, AttrNumber attnum); extern char *get_attname(Oid relid, AttrNumber attnum); extern char *get_relid_attribute_name(Oid relid, AttrNumber attnum); extern AttrNumber get_attnum(Oid relid, const char *attname); diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index ce15af7..970a224 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -739,3 +739,207 @@ select * from int4_tbl where 0 (1 row) +-- +-- Check NOT IN performs ANTI JOIN when subquery columns are NOT NULL +-- and does not when subquery columns can contain NULLs. +-- +BEGIN; +CREATE TEMP TABLE a (id INT PRIMARY KEY); +CREATE TEMP TABLE b (x INT NOT NULL, y INT); +CREATE TEMP TABLE c (z INT NOT NULL); +-- ANTI JOIN. x is defined as NOT NULL +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT x FROM b); + QUERY PLAN +----------------------------------------- + Merge Anti Join + Merge Cond: (a.id = b.x) + -> Index Only Scan using a_pkey on a + -> Sort + Sort Key: b.x + -> Seq Scan on b +(6 rows) + +-- No ANTI JOIN, y can be NULL +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT y FROM b); + QUERY PLAN +------------------------------------ + Seq Scan on a + Filter: (NOT (hashed SubPlan 1)) + SubPlan 1 + -> Seq Scan on b +(4 rows) + +-- No ANTI JOIN, x is NOT NULL, but we don't know if + 1 will change that. +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT x+1 FROM b); + QUERY PLAN +------------------------------------ + Seq Scan on a + Filter: (NOT (hashed SubPlan 1)) + SubPlan 1 + -> Seq Scan on b +(4 rows) + +-- ANTI JOIN 1 is a Const that is not null. +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT 1 FROM b); + QUERY PLAN +--------------------------- + Nested Loop Anti Join + Join Filter: (a.id = 1) + -> Seq Scan on a + -> Materialize + -> Seq Scan on b +(5 rows) + +-- No ANTI JOIN, results contain a NULL Const +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT NULL::int FROM b); + QUERY PLAN +------------------------------------ + Seq Scan on a + Filter: (NOT (hashed SubPlan 1)) + SubPlan 1 + -> Seq Scan on b +(4 rows) + +-- ANTI JOIN y = 1 means y can't be NULL +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT y FROM b WHERE y = 1); + QUERY PLAN +------------------------------- + Hash Anti Join + Hash Cond: (a.id = b.y) + -> Seq Scan on a + -> Hash + -> Seq Scan on b + Filter: (y = 1) +(6 rows) + +-- No ANTI JOIN, OR condition does not ensure y = 1 +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT y FROM b WHERE y = 1 OR x = 1); + QUERY PLAN +---------------------------------------- + Seq Scan on a + Filter: (NOT (hashed SubPlan 1)) + SubPlan 1 + -> Seq Scan on b + Filter: ((y = 1) OR (x = 1)) +(5 rows) + +-- No ANTI JOIN, OR condition does not ensure y = 1 or y = 2 +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT y FROM b WHERE (y = 1 OR x = 1) AND (y = 2 OR x = 2)); + QUERY PLAN +------------------------------------------------------------------- + Seq Scan on a + Filter: (NOT (hashed SubPlan 1)) + SubPlan 1 + -> Seq Scan on b + Filter: (((y = 1) OR (x = 1)) AND ((y = 2) OR (x = 2))) +(5 rows) + +-- ANTI JOIN y must be 2, so can't be NULL +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT y FROM b WHERE (y = 1 OR x = 1) AND y = 2); + QUERY PLAN +---------------------------------------------------------- + Hash Anti Join + Hash Cond: (a.id = b.y) + -> Seq Scan on a + -> Hash + -> Seq Scan on b + Filter: ((y = 2) AND ((y = 1) OR (x = 1))) +(6 rows) + +-- ANTI JOIN y can be 1 or 2, but can't be null. +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT y FROM b WHERE (y = 1 OR y = 2)); + QUERY PLAN +-------------------------------------------- + Hash Anti Join + Hash Cond: (a.id = b.y) + -> Seq Scan on a + -> Hash + -> Seq Scan on b + Filter: ((y = 1) OR (y = 2)) +(6 rows) + +-- No ANTI JOIN c.z is from an outer join so it can have nulls. +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT c.z FROM b LEFT JOIN c ON b.x = c.z); + QUERY PLAN +------------------------------------ + Seq Scan on a + Filter: (NOT (hashed SubPlan 1)) + SubPlan 1 + -> Merge Left Join + Merge Cond: (b.x = c.z) + -> Sort + Sort Key: b.x + -> Seq Scan on b + -> Sort + Sort Key: c.z + -> Seq Scan on c +(11 rows) + +-- ANTI JOIN, c.z is from an inner join and has a NOT NULL constraint. +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT c.z FROM b INNER JOIN c ON b.x = c.z); + QUERY PLAN +----------------------------------------- + Merge Anti Join + Merge Cond: (a.id = c.z) + -> Index Only Scan using a_pkey on a + -> Materialize + -> Merge Join + Merge Cond: (b.x = c.z) + -> Sort + Sort Key: b.x + -> Seq Scan on b + -> Sort + Sort Key: c.z + -> Seq Scan on c +(12 rows) + +-- ANTI JOIN, c.z must be 1 +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT c.z FROM b LEFT JOIN c ON b.x = c.z WHERE c.z = 1); + QUERY PLAN +------------------------------------------- + Hash Anti Join + Hash Cond: (a.id = c.z) + -> Seq Scan on a + -> Hash + -> Nested Loop + -> Seq Scan on c + Filter: (z = 1) + -> Materialize + -> Seq Scan on b + Filter: (x = 1) +(10 rows) + +-- ANTI JOIN, c.z can't be NULL +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT c.z FROM b LEFT JOIN c ON b.x = c.z WHERE c.z IS NOT NULL); + QUERY PLAN +--------------------------------------------------- + Merge Anti Join + Merge Cond: (a.id = c.z) + -> Index Only Scan using a_pkey on a + -> Materialize + -> Merge Join + Merge Cond: (b.x = c.z) + -> Sort + Sort Key: b.x + -> Seq Scan on b + -> Sort + Sort Key: c.z + -> Seq Scan on c + Filter: (z IS NOT NULL) +(13 rows) + +ROLLBACK; diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index 326fd70..53ba798 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -422,3 +422,72 @@ select * from int4_tbl where select * from int4_tbl where (case when f1 in (select unique1 from tenk1 a) then f1 else null end) in (select ten from tenk1 b); + +-- +-- Check NOT IN performs ANTI JOIN when subquery columns are NOT NULL +-- and does not when subquery columns can contain NULLs. +-- + +BEGIN; + +CREATE TEMP TABLE a (id INT PRIMARY KEY); +CREATE TEMP TABLE b (x INT NOT NULL, y INT); +CREATE TEMP TABLE c (z INT NOT NULL); + +-- ANTI JOIN. x is defined as NOT NULL +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT x FROM b); + +-- No ANTI JOIN, y can be NULL +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT y FROM b); + +-- No ANTI JOIN, x is NOT NULL, but we don't know if + 1 will change that. +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT x+1 FROM b); + +-- ANTI JOIN 1 is a Const that is not null. +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT 1 FROM b); + +-- No ANTI JOIN, results contain a NULL Const +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT NULL::int FROM b); + +-- ANTI JOIN y = 1 means y can't be NULL +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT y FROM b WHERE y = 1); + +-- No ANTI JOIN, OR condition does not ensure y = 1 +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT y FROM b WHERE y = 1 OR x = 1); + +-- No ANTI JOIN, OR condition does not ensure y = 1 or y = 2 +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT y FROM b WHERE (y = 1 OR x = 1) AND (y = 2 OR x = 2)); + +-- ANTI JOIN y must be 2, so can't be NULL +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT y FROM b WHERE (y = 1 OR x = 1) AND y = 2); + +-- ANTI JOIN y can be 1 or 2, but can't be null. +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT y FROM b WHERE (y = 1 OR y = 2)); + +-- No ANTI JOIN c.z is from an outer join so it can have nulls. +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT c.z FROM b LEFT JOIN c ON b.x = c.z); + +-- ANTI JOIN, c.z is from an inner join and has a NOT NULL constraint. +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT c.z FROM b INNER JOIN c ON b.x = c.z); + +-- ANTI JOIN, c.z must be 1 +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT c.z FROM b LEFT JOIN c ON b.x = c.z WHERE c.z = 1); + +-- ANTI JOIN, c.z can't be NULL +EXPLAIN (COSTS OFF) +SELECT * FROM a WHERE id NOT IN (SELECT c.z FROM b LEFT JOIN c ON b.x = c.z WHERE c.z IS NOT NULL); + +ROLLBACK;