diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 42dcb11..93a8750 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -21,6 +21,7 @@ #include "access/sysattr.h" #include "catalog/pg_am.h" #include "catalog/pg_collation.h" +#include "catalog/pg_constraint.h" #include "catalog/pg_operator.h" #include "catalog/pg_opfamily.h" #include "catalog/pg_type.h" diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c index 129fc3d..d2e8e7a 100644 --- a/src/backend/optimizer/plan/analyzejoins.c +++ b/src/backend/optimizer/plan/analyzejoins.c @@ -27,9 +27,23 @@ #include "optimizer/paths.h" #include "optimizer/planmain.h" #include "optimizer/var.h" +#include "optimizer/clauses.h" +#include "parser/parsetree.h" +#include "optimizer/tlist.h" +#include "nodes/nodeFuncs.h" +#include "nodes/pg_list.h" +#include "utils/lsyscache.h" /* local functions */ -static bool join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo); +static bool leftjoin_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo); +static bool semijoin_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo); +static bool sortclause_is_unique_on_restrictinfo(Query *query, + List *clause_list, List *sortclause); +static bool relation_has_foreign_key_for(PlannerInfo *root, RelOptInfo *rel, + RelOptInfo *referencedrel, List *referencing_exprs, + List *index_exprs, List *operator_list); +static bool expressions_match_foreign_key(ForeignKeyInfo *fk, IndexOptInfo *ind, + List *exprlist, List *index_exprs, List *operator_list); static void remove_rel_from_query(PlannerInfo *root, int relid, Relids joinrelids); static List *remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved); @@ -59,10 +73,20 @@ restart: int innerrelid; int nremoved; - /* Skip if not removable */ - if (!join_is_removable(root, sjinfo)) - continue; - + if (sjinfo->jointype == JOIN_LEFT) + { + /* Skip if not removable */ + if (!leftjoin_is_removable(root, sjinfo)) + continue; + } + else if (sjinfo->jointype == JOIN_SEMI) + { + /* Skip if not removable */ + if (!semijoin_is_removable(root, sjinfo)) + continue; + } + else + continue; /* we don't support this join type */ /* * Currently, join_is_removable can only succeed when the sjinfo's * righthand is a single baserel. Remove that rel from the query and @@ -132,47 +156,75 @@ clause_sides_match_join(RestrictInfo *rinfo, Relids outerrelids, } /* - * join_is_removable - * Check whether we need not perform this special join at all, because + * leftjoin_is_removable + * Check whether we need not perform this left join at all, because * it will just duplicate its left input. * * This is true for a left join for which the join condition cannot match - * more than one inner-side row. (There are other possibly interesting - * cases, but we don't have the infrastructure to prove them.) We also - * have to check that the inner side doesn't generate any variables needed - * above the join. + * more than one inner-side row. To prove the join will be unique on the + * join condition we must analyze the unique indexes on the right side of + * the join to ensure that no more than 1 record can exist for the join + * condition. + * + * We can also remove sub queries if we can prove the query will not produce + * more than 1 record for the join condition, to do this we currently look at + * the GROUP BY and DISTINCT clause of the query. */ static bool -join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo) +leftjoin_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo) { int innerrelid; RelOptInfo *innerrel; + Query *subquery; Relids joinrelids; List *clause_list = NIL; ListCell *l; int attroff; + List *fklist = NIL; + + Assert(sjinfo->jointype == JOIN_LEFT); /* * Currently, we only know how to remove left joins to a baserel with - * unique indexes. We can check most of these criteria pretty trivially - * to avoid doing useless extra work. But checking whether any of the - * indexes are unique would require iterating over the indexlist, so for - * now we just make sure there are indexes of some sort or other. If none - * of them are unique, join removal will still fail, just slightly later. + * unique indexes and left joins to a subquery where the subquery is + * unique on the join condition. We can check most of these criteria + * pretty trivially to avoid doing useless extra work. But checking + * whether any of the indexes are unique would require iterating over + * the indexlist, so for now, if we're joining to a relation, we'll just + * ensure that we have at least 1 index, it won't matter if that index + * is unique at this stage, we'll check those details later. */ - if (sjinfo->jointype != JOIN_LEFT || - sjinfo->delay_upper_joins || + if (sjinfo->delay_upper_joins || bms_membership(sjinfo->min_righthand) != BMS_SINGLETON) return false; innerrelid = bms_singleton_member(sjinfo->min_righthand); innerrel = find_base_rel(root, innerrelid); - if (innerrel->reloptkind != RELOPT_BASEREL || - innerrel->rtekind != RTE_RELATION || - innerrel->indexlist == NIL) + if (innerrel->reloptkind != RELOPT_BASEREL) return false; + if (innerrel->rtekind == RTE_RELATION) + { + if (innerrel->indexlist == NIL) + return false; /* no possibility of a unique index */ + } + else if (innerrel->rtekind == RTE_SUBQUERY) + { + subquery = root->simple_rte_array[innerrelid]->subquery; + + /* + * The only means we currently use to check if the subquery is unique + * are the GROUP BY and DISTINCT clause. If both of these are empty + * then there's no point in going any further. + */ + if (subquery->groupClause == NIL && + subquery->distinctClause == NIL) + return false; + } + else + return false; /* unsupported rtekind */ + /* Compute the relid set for the join we are considering */ joinrelids = bms_union(sjinfo->min_lefthand, sjinfo->min_righthand); @@ -275,17 +327,437 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo) * clauses for the innerrel, so we needn't do that here. */ - /* Now examine the indexes to see if we have a matching unique index */ - if (relation_has_unique_index_for(root, innerrel, clause_list, NIL, NIL)) - return true; + /* + * Now examine the indexes to see if we have a matching unique index.*/ + if (innerrel->rtekind == RTE_RELATION) + return relation_has_unique_index_for(root, innerrel, clause_list, NIL, NIL); /* + * We can be certain that the sub query contains no duplicate values for + * the join clause if item in the sub query's GROUP BY clause is also used + * in the join clause using equality. This works the same way for the + * DISTINCT clause. We need not pay any attention to WHERE or HAVING + * clauses as these just restrict the results more and could not be the + * cause of duplication in the result set. However there are a number of + * pre-checks we must perform which could cause duplicate values even if + * all the required columns are in the GROUP BY or DISTINCT clause. + * + * NB: We must also not remove the join in the subquery contains a + * FOR UDPATE clause, but we can actually skip this check as GROUP BY and + * DISTINCT cannot be used at the same time as FOR UPDATE. + */ + if (innerrel->rtekind == RTE_SUBQUERY) + { + Assert(subquery == root->simple_rte_array[innerrelid]->subquery); + + /* + * We cannot remove the subquery if the target list contains any set + * returning functions as these may cause the query not to be unique + * on the grouping columns, as per the following example: + * "SELECT a.a,generate_series(1,2) FROM (VALUES(1)) a(a) GROUP BY a" + */ + if (expression_returns_set((Node *) subquery->targetList)) + return false; + + /* + * Don't remove the join if the target list contains any volatile + * functions. Doing so may remove desired side affects that calls + * to the function may cause. + */ + if (contain_volatile_functions((Node *) subquery->targetList)) + return false; + + /* + * It should be safe to remove the join if all the GROUP BY expressions + * have matching items in the join condition. + */ + if (subquery->groupClause != NIL && + sortclause_is_unique_on_restrictinfo(subquery, clause_list, subquery->groupClause)) + return true; + + /* + * It should be safe to remove the join if all the DISTINCT column list have matching + * items in the join condition. + */ + if (subquery->distinctClause != NIL && + sortclause_is_unique_on_restrictinfo(subquery, clause_list, subquery->distinctClause)) + return true; + } + /* XXX is this comment still needed?? * Some day it would be nice to check for other methods of establishing * distinctness. */ return false; } +/* + * semijoin_is_removable + * Check if we can remove this semi join. + * + * To prove that a semi join is redundant we have to ensure that a foreign key + * exists on the left side of the join which references the table at the right + * side of the join. This means that we can only support a single table on + * either side of the join. We must also ensure that the join condition matches + * all the foreign key columns to each index column on the referenced table. If + * any columns are missing then we cannot be sure we'll get exactly 1 record back, + * and if there are any extra conditions that are not in the foreign key then we + * cannot be sure that the join condition will produce at least 1 matching row. + */ +static bool +semijoin_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo) +{ + int innerrelid; + int outerrelid; + RelOptInfo *innerrel; + RelOptInfo *outerrel; + ListCell *lc; + List *referencing_exprs; + List *index_exprs; + List *operator_list; + + Assert(sjinfo->jointype == JOIN_SEMI); + + if (sjinfo->delay_upper_joins || + bms_membership(sjinfo->min_righthand) != BMS_SINGLETON) + return false; + + innerrelid = bms_singleton_member(sjinfo->min_righthand); + innerrel = find_base_rel(root, innerrelid); + + if (innerrel->reloptkind != RELOPT_BASEREL || + innerrel->rtekind != RTE_RELATION || + innerrel->indexlist == NIL || + bms_membership(sjinfo->min_lefthand) != BMS_SINGLETON) + return false; + + /* + * To allow the removal of a SEMI JOIN we must analyze the foreign + * keys of the relation on the left side of the join, for this to work + * we'll need to ensure that there is only 1 relation on the left side + * of the joins, otherwise there's no possibility of foreign keys. + * If the relation on the left side has no foreign keys then there's + * no possibility that the join can be removed. + */ + + outerrelid = bms_singleton_member(sjinfo->min_lefthand); + outerrel = find_base_rel(root, outerrelid); + + /* No possibility to remove the join if there's no foreign keys */ + if (outerrel->fklist == NIL) + return false; + + referencing_exprs = NIL; + index_exprs = NIL; + operator_list = NIL; + + foreach(lc, sjinfo->join_quals) + { + OpExpr *op = (OpExpr *) lfirst(lc); + Oid opno; + Node *left_expr; + Node *right_expr; + Relids left_varnos; + Relids right_varnos; + Relids all_varnos; + Oid opinputtype; + + /* Is it a binary opclause? */ + if (!IsA(op, OpExpr) || + list_length(op->args) != 2) + { + /* No, but does it reference both sides? */ + all_varnos = pull_varnos((Node *) op); + if (!bms_overlap(all_varnos, sjinfo->syn_righthand) || + bms_is_subset(all_varnos, sjinfo->syn_righthand)) + { + /* + * Clause refers to only one rel, so ignore it --- unless it + * contains volatile functions, in which case we'd better + * punt. + */ + if (contain_volatile_functions((Node *) op)) + return false; + continue; + } + /* Non-operator clause referencing both sides, must punt */ + return false; + } + + /* Extract data from binary opclause */ + opno = op->opno; + left_expr = linitial(op->args); + right_expr = lsecond(op->args); + left_varnos = pull_varnos(left_expr); + right_varnos = pull_varnos(right_expr); + all_varnos = bms_union(left_varnos, right_varnos); + opinputtype = exprType(left_expr); + + /* Does it reference both sides? */ + if (!bms_overlap(all_varnos, sjinfo->syn_righthand) || + bms_is_subset(all_varnos, sjinfo->syn_righthand)) + { + /* + * Clause refers to only one rel, so ignore it --- unless it + * contains volatile functions, in which case we'd better punt. + */ + if (contain_volatile_functions((Node *) op)) + return false; + continue; + } + + /* check rel membership of arguments */ + if (!bms_is_empty(right_varnos) && + bms_is_subset(right_varnos, sjinfo->syn_righthand) && + !bms_overlap(left_varnos, sjinfo->syn_righthand)) + { + /* typical case, right_expr is RHS variable */ + } + else if (!bms_is_empty(left_varnos) && + bms_is_subset(left_varnos, sjinfo->syn_righthand) && + !bms_overlap(right_varnos, sjinfo->syn_righthand)) + { + Node *tmp; + /* flipped case, left_expr is RHS variable */ + opno = get_commutator(opno); + if (!OidIsValid(opno)) + return false; + + /* swap the operands */ + tmp = left_expr; + left_expr = right_expr; + right_expr = tmp; + } + else + return false; + + /* so far so good, keep building lists */ + referencing_exprs = lappend(referencing_exprs, copyObject(left_expr)); + operator_list = lappend_oid(operator_list, opno); + index_exprs = lappend(index_exprs, copyObject(right_expr)); + } + + if (referencing_exprs == NIL) + return false; + + /* The expressions mustn't be volatile. */ + if (contain_volatile_functions((Node *) referencing_exprs)) + return false; + + if (contain_volatile_functions((Node *) index_exprs)) + return false; + + return relation_has_foreign_key_for(root, outerrel, innerrel, + referencing_exprs, index_exprs, operator_list); +} + +/* + * relation_has_foreign_key_for + * Checks if rel has a foreign key which references referencedrel with the + * given list of expressions. + * + * For the match to succeed: + * referencing_exprs must match the columns defined in the foreign key + * index_exprs must match the columns defined in the index for the foreign key. + */ +static bool +relation_has_foreign_key_for(PlannerInfo *root, RelOptInfo *rel, + RelOptInfo *referencedrel, List *referencing_exprs, + List *index_exprs, List *operator_list) +{ + ListCell *lc; + + Assert(list_length(referencing_exprs) == list_length(index_exprs)); + Assert(list_length(referencing_exprs) == list_length(operator_list)); + + /* + * Short-circuit if no foreign keys exist on the relation or + * there are no indexes on the referenced relation. Remember that + * it is possible for the fklist to not be empty and the indexlist + * to be empty as the foreign keys may be for some completely other + * relation. + */ + if (rel->fklist == NIL || referencedrel->indexlist == NIL) + return false; + + /* + * Here we must look at each foreign key which is defined and see if we + * can find that foreign key's index in the index list of the referenced + * table. When we find a match we do some quick pre-checks on the index + * then we try to see if all of the expressions can be matched to that + * foreign key and index. If we don't match then we'll keep trying to + * find another matching foreign key and index list. + */ + foreach(lc, rel->fklist) + { + ForeignKeyInfo *fk = (ForeignKeyInfo *) lfirst(lc); + ListCell *ic; + + /* + * We need to ensure that if the number of columns in the key is above zero + * that the foreign key is of type MATCH FULL. XXX is this overly strict?? + */ + if (fk->conncols > 1 && fk->confmatchtype != FKCONSTR_MATCH_FULL) + continue; + + foreach(ic, referencedrel->indexlist) + { + IndexOptInfo *ind = (IndexOptInfo *) lfirst(ic); + if (fk->conindid == ind->indexoid) + { + /* Sanity check? XXX Should we complain or just skip this one? */ + if (fk->conncols != ind->ncolumns) + elog(ERROR, "Number of columns in foreign key does not match number of indexed columns"); + + /* Index not ready? XXX Perhaps this should be an error as we + * should only have fks that have been validated. + */ + if (!ind->unique || !ind->immediate || + (ind->indpred != NIL && !ind->predOK)) + continue; + + if (expressions_match_foreign_key(fk, ind, referencing_exprs, index_exprs, operator_list)) + return true; + } + } + } + + return false; +} + +static bool +expressions_match_foreign_key(ForeignKeyInfo *fk, IndexOptInfo *ind, + List *exprlist, List *index_exprs, List *operator_list) +{ + ListCell *lc; + ListCell *lc2; + ListCell *lc3; + int col; + + Assert(list_length(exprlist) == list_length(index_exprs)); + Assert(list_length(exprlist) == list_length(operator_list)); + + /* + * For each column defined in the foreign key we must ensure that we find + * a qual in the expression list which matches the foreign key on one side + * of the expression and the index on the other side of the expression. It + * does not matter if the same expression appears many times, we just need + * to ensure all exist at least one and no extra non matching expressions + * exist. + */ + + /* + * Fast path out if there's not enough conditions to match + * each column in the foreign key. Note that we cannot check + * that the number of expressions is equal here since it would + * cause duplicate expressions to not match. + */ + if (list_length(exprlist) < fk->conncols) + return false; + + forthree(lc, exprlist, lc2, index_exprs, lc3, operator_list) + { + Node *expr = (Node *) lfirst(lc); + Node *idxexpr = (Node *) lfirst(lc2); + Oid opr = lfirst_oid(lc3); + bool matched = false; + + /* if anything is NULL or not a var then we can it's not a match */ + if (!expr || !IsA(expr, Var) || !idxexpr || !IsA(idxexpr, Var)) + return false; + + for (col = 0; col < fk->conncols; col++) + { + if (fk->conkey[col] == ((Var *) expr)->varattno && + fk->confkey[col] == ((Var *) idxexpr)->varattno && + opr == fk->conpfeqop[col]) + { + matched = true; + break; + } + } + + /* + * Did we find anything matching the fk col? If not then we'll + * return a no match. + */ + if (!matched) + return false; + } + + return true; +} + + +/* + * sortclause_is_unique_on_restrictinfo + * + * Checks to see if all items in sortclause also exist in clause_list. + * The function will return true if clause_list is the same as or a superset + * of the sortclause. If the sortclause has columns that don't exist in the + * clause_list then the query can't be guaranteed unique on the clause_list + * columns. + * + * Note: The calling function must ensure that sortclause is not NIL. + */ +static bool +sortclause_is_unique_on_restrictinfo(Query *query, List *clause_list, List *sortclause) +{ + ListCell *l; + + Assert(sortclause != NIL); + + /* + * Loop over each sort clause to ensure that we have + * an item in the join conditions that matches it. + * It does not matter if we have more items in the join + * condition than we have in the sort clause. + */ + foreach(l, sortclause) + { + ListCell *ri; + SortGroupClause *scl = (SortGroupClause *) lfirst(l); + TargetEntry *sortTarget; + bool matched = false; + + /* lookup the target list entry for the current sort sort group ref */ + sortTarget = get_sortgroupref_tle(scl->tleSortGroupRef, query->targetList); + + /* + * Since a constant only has 1 value the existence of one here will + * not cause any duplication of the results. We'll simply ignore it! + */ + if (IsA(sortTarget->expr, Const)) + continue; + + foreach(ri, clause_list) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(ri); + Node *rexpr; + + if (rinfo->outer_is_left) + rexpr = get_rightop(rinfo->clause); + else + rexpr = get_leftop(rinfo->clause); + + if (IsA(rexpr, Var)) + { + Var *var = (Var *)rexpr; + + if (var->varattno == sortTarget->resno) + { + matched = true; + break; /* match found */ + } + } + else /* XXX what else could it be? */ + return false; + } + + if (!matched) + return false; + } + return true; +} /* * Remove the target relid from the planner's data structures, having diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index b2becfa..ac7b38b 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -26,6 +26,8 @@ #include "access/xlog.h" #include "catalog/catalog.h" #include "catalog/heap.h" +#include "catalog/pg_constraint.h" +#include "catalog/pg_type.h" #include "foreign/fdwapi.h" #include "miscadmin.h" #include "nodes/makefuncs.h" @@ -38,6 +40,7 @@ #include "parser/parsetree.h" #include "rewrite/rewriteManip.h" #include "storage/bufmgr.h" +#include "utils/fmgroids.h" #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/snapmgr.h" @@ -384,6 +387,123 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, heap_close(relation, NoLock); + + { + List *result = NIL; + Relation fkeyRel; + Relation fkeyRelIdx; + ScanKeyData fkeyScankey; + SysScanDesc fkeyScan; + HeapTuple tuple; + + /* ConstraintRelidIndexId + * Must scan pg_constraint. Right now, it is a seqscan because there is + * no available index on conrelid. + */ + ScanKeyInit(&fkeyScankey, + Anum_pg_constraint_conrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relationObjectId)); + + fkeyRel = heap_open(ConstraintRelationId, AccessShareLock); + fkeyRelIdx = index_open(ConstraintRelidIndexId, AccessShareLock); + fkeyScan = systable_beginscan_ordered(fkeyRel, fkeyRelIdx, NULL, 1, &fkeyScankey); + + while ((tuple = systable_getnext_ordered(fkeyScan, ForwardScanDirection)) != NULL) + { + Form_pg_constraint con = (Form_pg_constraint) GETSTRUCT(tuple); + ForeignKeyInfo *fkinfo; + Datum adatum; + bool isNull; + ArrayType *arr; + int numkeys; + + /* Not a foreign key */ + if (con->contype != CONSTRAINT_FOREIGN) + continue; + + /* we're not interested unless the fk has been validated */ + if (!con->convalidated) + continue; + + fkinfo = (ForeignKeyInfo *) palloc(sizeof(ForeignKeyInfo)); + fkinfo->conindid = con->conindid; + fkinfo->confrelid = con->confrelid; + fkinfo->convalidated = con->convalidated; + fkinfo->conrelid = con->conrelid; + fkinfo->confupdtype = con->confupdtype; + fkinfo->confdeltype = con->confdeltype; + fkinfo->confmatchtype = con->confmatchtype; + + adatum = heap_getattr(tuple, Anum_pg_constraint_conkey, + RelationGetDescr(fkeyRel), &isNull); + if (isNull) + elog(ERROR, "null conkey for constraint %u", + HeapTupleGetOid(tuple)); + arr = DatumGetArrayTypeP(adatum); /* ensure not toasted */ + numkeys = ARR_DIMS(arr)[0]; + if (ARR_NDIM(arr) != 1 || + numkeys < 0 || + ARR_HASNULL(arr) || + ARR_ELEMTYPE(arr) != INT2OID) + elog(ERROR, "conkey is not a 1-D smallint array"); + + fkinfo->conkey = (int16 *) ARR_DATA_PTR(arr); + + fkinfo->conncols = numkeys; + + adatum = heap_getattr(tuple, Anum_pg_constraint_confkey, + RelationGetDescr(fkeyRel), &isNull); + if (isNull) + elog(ERROR, "null confkey for constraint %u", + HeapTupleGetOid(tuple)); + arr = DatumGetArrayTypeP(adatum); /* ensure not toasted */ + numkeys = ARR_DIMS(arr)[0]; + + /* sanity check */ + if (numkeys != fkinfo->conncols) + elog(ERROR, "number of confkey elements does not equal conkey elements"); + + if (ARR_NDIM(arr) != 1 || + numkeys < 0 || + ARR_HASNULL(arr) || + ARR_ELEMTYPE(arr) != INT2OID) + elog(ERROR, "confkey is not a 1-D smallint array"); + + fkinfo->confkey = (int16 *) ARR_DATA_PTR(arr); + + adatum = heap_getattr(tuple, Anum_pg_constraint_conpfeqop, + RelationGetDescr(fkeyRel), &isNull); + if (isNull) + elog(ERROR, "null conpfeqop for constraint %u", + HeapTupleGetOid(tuple)); + arr = DatumGetArrayTypeP(adatum); /* ensure not toasted */ + numkeys = ARR_DIMS(arr)[0]; + + /* sanity check */ + if (numkeys != fkinfo->conncols) + elog(ERROR, "number of conpfeqop elements does not equal conkey elements"); + + if (ARR_NDIM(arr) != 1 || + numkeys < 0 || + ARR_HASNULL(arr) || + ARR_ELEMTYPE(arr) != OIDOID) + elog(ERROR, "conpfeqop is not a 1-D smallint array"); + + fkinfo->conpfeqop = (Oid *) ARR_DATA_PTR(arr); + + result = lappend(result, fkinfo); + } + + rel->fklist = result; + + systable_endscan_ordered(fkeyScan); + index_close(fkeyRelIdx, AccessShareLock); + heap_close(fkeyRel, AccessShareLock); + } + + + /* * Allow a plugin to editorialize on the info we obtained from the * catalogs. Actions might include altering the assumed relation size, diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index c938c27..a0fb8eb 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -115,6 +115,7 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptKind reloptkind) rel->lateral_relids = NULL; rel->lateral_referencers = NULL; rel->indexlist = NIL; + rel->fklist = NIL; rel->pages = 0; rel->tuples = 0; rel->allvisfrac = 0; @@ -377,6 +378,7 @@ build_join_rel(PlannerInfo *root, joinrel->lateral_relids = NULL; joinrel->lateral_referencers = NULL; joinrel->indexlist = NIL; + joinrel->fklist = NIL; joinrel->pages = 0; joinrel->tuples = 0; joinrel->allvisfrac = 0; diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index 300136e..3deb59b 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -445,6 +445,7 @@ typedef struct RelOptInfo Relids lateral_relids; /* minimum parameterization of rel */ Relids lateral_referencers; /* rels that reference me laterally */ List *indexlist; /* list of IndexOptInfo */ + List *fklist; /* list of ForeignKeyInfo */ BlockNumber pages; /* size estimates derived from pg_class */ double tuples; double allvisfrac; @@ -1643,4 +1644,20 @@ typedef struct JoinCostWorkspace int numbatches; } JoinCostWorkspace; +typedef struct ForeignKeyInfo +{ + Oid conindid; /* index supporting this constraint */ + Oid confrelid; /* relation referenced by foreign key */ + bool convalidated; /* constraint has been validated? */ + Oid conrelid; /* relation this constraint constrains */ + char confupdtype; /* foreign key's ON UPDATE action */ + char confdeltype; /* foreign key's ON DELETE action */ + char confmatchtype; /* foreign key's match type */ + int conncols; /* number of columns references */ + int16 *conkey; /* Columns of conrelid that the constraint applies to */ + int16 *confkey; /* columns of confrelid that foreign key references */ + Oid *conpfeqop; /* Operator list for comparing PK to FK */ +} ForeignKeyInfo; + + #endif /* RELATION_H */ diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 934488a..fd8e048 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -3060,9 +3060,11 @@ begin; CREATE TEMP TABLE a (id int PRIMARY KEY, b_id int); CREATE TEMP TABLE b (id int PRIMARY KEY, c_id int); CREATE TEMP TABLE c (id int PRIMARY KEY); +CREATE TEMP TABLE d (a INT, b INT); INSERT INTO a VALUES (0, 0), (1, NULL); INSERT INTO b VALUES (0, 0), (1, NULL); INSERT INTO c VALUES (0), (1); +INSERT INTO d VALUES (1,3),(2,2),(3,1); -- all three cases should be optimizable into a simple seqscan explain (costs off) SELECT a.* FROM a LEFT JOIN b ON a.b_id = b.id; QUERY PLAN @@ -3098,7 +3100,331 @@ select id from a where id in ( -> Seq Scan on b (5 rows) +-- check optimization of outer join when joining a unique sub query using group by +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b GROUP BY b.id) b ON a.id = b.id; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- check optimization of outer join when joining a unique sub query using distinct +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.c_id FROM b) b ON a.id = b.c_id; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- check optimization of outer join when joining a unique sub query using distinct +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT a+b AS ab FROM d) d ON a.id = d.ab; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- optimization is not possible when distinct contains a volatile function +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT a+b+random() AS abr FROM d) d ON a.id = d.abr; + QUERY PLAN +------------------------------------------------------------------------------------------ + Hash Left Join + Hash Cond: ((a.id)::double precision = ((((d.a + d.b))::double precision + random()))) + -> Seq Scan on a + -> Hash + -> HashAggregate + Group Key: (((d.a + d.b))::double precision + random()) + -> Seq Scan on d +(7 rows) + +-- check optimization of outer join when joining a unique sub query using distinct +-- and a constant expression. +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.c_id,1 AS dummy FROM b) b ON a.id = b.c_id; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- check optimization of outer join when joining a unique sub query which contains 2 tables +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id = b.id; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- check optimization of outer join when joining a unique sub query which contains 2 tables +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id + 10 = b.id; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- check optimization of outer join when joining a unique sub query which contains 2 tables +-- and contains a redundant join clause +EXPLAIN (COSTS OFF) +SELECT a.id FROM a +LEFT JOIN (SELECT b.id,1 as dummy FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id = b.id AND b.dummy = 1; + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- optimization is not possible when the group by contains a column which is not in the join condition. +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b GROUP BY b.id,b.c_id) b ON a.id = b.id; + QUERY PLAN +--------------------------------- + Hash Right Join + Hash Cond: (b.id = a.id) + -> HashAggregate + Group Key: b.id, b.c_id + -> Seq Scan on b + -> Hash + -> Seq Scan on a +(7 rows) + +-- optimization is not possible when distinct clause contains an item that is not in the join clause +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.id,c_id FROM b) b ON a.id = b.id; + QUERY PLAN +--------------------------------- + Hash Right Join + Hash Cond: (b.id = a.id) + -> HashAggregate + Group Key: b.id, b.c_id + -> Seq Scan on b + -> Hash + -> Seq Scan on a +(7 rows) + +-- optimization is not possible when distinct contains a volatile function +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.id,random() AS r FROM b) b ON a.id = b.id AND r = random(); + QUERY PLAN +------------------------------------------------- + Hash Left Join + Hash Cond: (a.id = b.id) + -> Seq Scan on a + -> Hash + -> Subquery Scan on b + Filter: (b.r = random()) + -> HashAggregate + Group Key: b_1.id, random() + -> Seq Scan on b b_1 +(9 rows) + +-- optimization is not possible when there are any volatile functions in the target list. +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT id,AVG(c_id),SUM(random()) FROM b GROUP BY id) b ON a.id = b.id; + QUERY PLAN +---------------------------- + Hash Right Join + Hash Cond: (b.id = a.id) + -> HashAggregate + Group Key: b.id + -> Seq Scan on b + -> Hash + -> Seq Scan on a +(7 rows) + +-- optimization is not possible when there are set returning functions in the target list. +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT id,generate_series(1,2) FROM b GROUP BY id) b ON a.id = b.id; + QUERY PLAN +---------------------------- + Hash Right Join + Hash Cond: (b.id = a.id) + -> HashAggregate + Group Key: b.id + -> Seq Scan on b + -> Hash + -> Seq Scan on a +(7 rows) + rollback; +BEGIN; +-- Test join removals for semi joins +CREATE TEMP TABLE b (id INT NOT NULL PRIMARY KEY); +CREATE TEMP TABLE a (id INT NOT NULL PRIMARY KEY, b_id INT NOT NULL REFERENCES b(id)); +-- should remove semi join to b +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE b_id IN(SELECT id FROM b); + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- should remove semi join to b +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE a.b_id = id); + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- should remove semi join to b (swapped condition order) +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE id = a.b_id); + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- should not remove semi join (since not using equals) +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE id >= a.b_id); + QUERY PLAN +----------------------------------------- + Nested Loop Semi Join + -> Seq Scan on a + -> Index Only Scan using b_pkey on b + Index Cond: (id >= a.b_id) +(4 rows) + +-- should not remove semi join +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE b_id+0 IN(SELECT id FROM b); + QUERY PLAN +------------------------------------ + Hash Semi Join + Hash Cond: ((a.b_id + 0) = b.id) + -> Seq Scan on a + -> Hash + -> Seq Scan on b +(5 rows) + +-- should not remove semi join +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE b_id IN(SELECT id+0 FROM b); + QUERY PLAN +------------------------------------ + Hash Semi Join + Hash Cond: (a.b_id = (b.id + 0)) + -> Seq Scan on a + -> Hash + -> Seq Scan on b +(5 rows) + +-- should not remove semi join (wrong column) +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE id IN(SELECT id FROM b); + QUERY PLAN +---------------------------- + Hash Join + Hash Cond: (b.id = a.id) + -> Seq Scan on b + -> Hash + -> Seq Scan on a +(5 rows) + +ROLLBACK; +BEGIN; +-- Semi join removal code with 2 column foreign keys +CREATE TEMP TABLE b (id1 INT NOT NULL, id2 INT NOT NULL, PRIMARY KEY(id1,id2)); +CREATE TEMP TABLE a (id INT NOT NULL PRIMARY KEY, b_id1 INT NOT NULL, b_id2 INT NOT NULL); +ALTER TABLE a ADD CONSTRAINT a_b_id1_b_id2_fkey FOREIGN KEY (b_id1,b_id2) REFERENCES b(id1,id2) MATCH FULL; +-- should remove semi join to b +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE a.b_id1 = id1 AND a.b_id2 = id2); + QUERY PLAN +--------------- + Seq Scan on a +(1 row) + +-- should not remove semi join to b (extra condition) +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE a.b_id1 = id1 AND a.b_id2 = id2 AND a.b_id2 >= id2); + QUERY PLAN +-------------------------------------------------------- + Hash Semi Join + Hash Cond: ((a.b_id1 = b.id1) AND (a.b_id2 = b.id2)) + Join Filter: (a.b_id2 >= b.id2) + -> Seq Scan on a + -> Hash + -> Seq Scan on b +(6 rows) + +-- should not remove semi join to b (wrong operator) +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE a.b_id1 > id1 AND a.b_id2 < id2); + QUERY PLAN +----------------------------------------------------------- + Nested Loop Semi Join + -> Seq Scan on a + -> Index Only Scan using b_pkey on b + Index Cond: ((id1 < a.b_id1) AND (id2 > a.b_id2)) +(4 rows) + +-- should not remove semi join (only checking id1) +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE a.b_id1 = id1); + QUERY PLAN +--------------------------------- + Hash Join + Hash Cond: (a.b_id1 = b.id1) + -> Seq Scan on a + -> Hash + -> HashAggregate + Group Key: b.id1 + -> Seq Scan on b +(7 rows) + +-- should not remove semi join (only checking id2) +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE a.b_id2 = id2); + QUERY PLAN +--------------------------------- + Hash Join + Hash Cond: (a.b_id2 = b.id2) + -> Seq Scan on a + -> Hash + -> HashAggregate + Group Key: b.id2 + -> Seq Scan on b +(7 rows) + +-- should not remove semi join (checking wrong columns) +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE a.b_id2 = id1 AND a.b_id1 = id2); + QUERY PLAN +-------------------------------------------------------- + Hash Join + Hash Cond: ((a.b_id2 = b.id1) AND (a.b_id1 = b.id2)) + -> Seq Scan on a + -> Hash + -> Seq Scan on b +(5 rows) + +-- should not remove semi join (no check for id2) +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE a.b_id2 = id1 AND a.b_id1 = id1); + QUERY PLAN +----------------------------------------- + Nested Loop Semi Join + -> Seq Scan on a + Filter: (b_id2 = b_id1) + -> Index Only Scan using b_pkey on b + Index Cond: (id1 = a.b_id2) +(5 rows) + +-- should not remove semi join (no check for b_id2) +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE a.b_id1 = id1 AND a.b_id1 = id2); + QUERY PLAN +----------------------------------- + Hash Join + Hash Cond: (a.b_id1 = b.id1) + -> Seq Scan on a + -> Hash + -> Seq Scan on b + Filter: (id1 = id2) +(6 rows) + +ROLLBACK; create temp table parent (k int primary key, pd int); create temp table child (k int unique, cd int); insert into parent values (1, 10), (2, 20), (3, 30); diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 275cb11..984a24f 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -861,9 +861,11 @@ begin; CREATE TEMP TABLE a (id int PRIMARY KEY, b_id int); CREATE TEMP TABLE b (id int PRIMARY KEY, c_id int); CREATE TEMP TABLE c (id int PRIMARY KEY); +CREATE TEMP TABLE d (a INT, b INT); INSERT INTO a VALUES (0, 0), (1, NULL); INSERT INTO b VALUES (0, 0), (1, NULL); INSERT INTO c VALUES (0), (1); +INSERT INTO d VALUES (1,3),(2,2),(3,1); -- all three cases should be optimizable into a simple seqscan explain (costs off) SELECT a.* FROM a LEFT JOIN b ON a.b_id = b.id; @@ -878,8 +880,142 @@ select id from a where id in ( select b.id from b left join c on b.id = c.id ); +-- check optimization of outer join when joining a unique sub query using group by +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b GROUP BY b.id) b ON a.id = b.id; + +-- check optimization of outer join when joining a unique sub query using distinct +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.c_id FROM b) b ON a.id = b.c_id; + +-- check optimization of outer join when joining a unique sub query using distinct +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT a+b AS ab FROM d) d ON a.id = d.ab; + +-- optimization is not possible when distinct contains a volatile function +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT a+b+random() AS abr FROM d) d ON a.id = d.abr; + +-- check optimization of outer join when joining a unique sub query using distinct +-- and a constant expression. +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.c_id,1 AS dummy FROM b) b ON a.id = b.c_id; + +-- check optimization of outer join when joining a unique sub query which contains 2 tables +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id = b.id; + +-- check optimization of outer join when joining a unique sub query which contains 2 tables +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id + 10 = b.id; + +-- check optimization of outer join when joining a unique sub query which contains 2 tables +-- and contains a redundant join clause +EXPLAIN (COSTS OFF) +SELECT a.id FROM a +LEFT JOIN (SELECT b.id,1 as dummy FROM b INNER JOIN c ON b.id = c.id GROUP BY b.id) b ON a.id = b.id AND b.dummy = 1; + +-- optimization is not possible when the group by contains a column which is not in the join condition. +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT b.id FROM b GROUP BY b.id,b.c_id) b ON a.id = b.id; + +-- optimization is not possible when distinct clause contains an item that is not in the join clause +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.id,c_id FROM b) b ON a.id = b.id; + +-- optimization is not possible when distinct contains a volatile function +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT DISTINCT b.id,random() AS r FROM b) b ON a.id = b.id AND r = random(); + +-- optimization is not possible when there are any volatile functions in the target list. +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT id,AVG(c_id),SUM(random()) FROM b GROUP BY id) b ON a.id = b.id; + +-- optimization is not possible when there are set returning functions in the target list. +EXPLAIN (COSTS OFF) +SELECT a.id FROM a LEFT JOIN (SELECT id,generate_series(1,2) FROM b GROUP BY id) b ON a.id = b.id; + rollback; +BEGIN; + +-- Test join removals for semi joins +CREATE TEMP TABLE b (id INT NOT NULL PRIMARY KEY); +CREATE TEMP TABLE a (id INT NOT NULL PRIMARY KEY, b_id INT NOT NULL REFERENCES b(id)); + +-- should remove semi join to b +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE b_id IN(SELECT id FROM b); + +-- should remove semi join to b +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE a.b_id = id); + +-- should remove semi join to b (swapped condition order) +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE id = a.b_id); + +-- should not remove semi join (since not using equals) +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE id >= a.b_id); + +-- should not remove semi join +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE b_id+0 IN(SELECT id FROM b); + +-- should not remove semi join +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE b_id IN(SELECT id+0 FROM b); + +-- should not remove semi join (wrong column) +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE id IN(SELECT id FROM b); + +ROLLBACK; + +BEGIN; + +-- Semi join removal code with 2 column foreign keys + +CREATE TEMP TABLE b (id1 INT NOT NULL, id2 INT NOT NULL, PRIMARY KEY(id1,id2)); +CREATE TEMP TABLE a (id INT NOT NULL PRIMARY KEY, b_id1 INT NOT NULL, b_id2 INT NOT NULL); + +ALTER TABLE a ADD CONSTRAINT a_b_id1_b_id2_fkey FOREIGN KEY (b_id1,b_id2) REFERENCES b(id1,id2) MATCH FULL; + +-- should remove semi join to b +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE a.b_id1 = id1 AND a.b_id2 = id2); + +-- should not remove semi join to b (extra condition) +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE a.b_id1 = id1 AND a.b_id2 = id2 AND a.b_id2 >= id2); + +-- should not remove semi join to b (wrong operator) +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE a.b_id1 > id1 AND a.b_id2 < id2); + +-- should not remove semi join (only checking id1) +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE a.b_id1 = id1); + +-- should not remove semi join (only checking id2) +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE a.b_id2 = id2); + +-- should not remove semi join (checking wrong columns) +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE a.b_id2 = id1 AND a.b_id1 = id2); + +-- should not remove semi join (no check for id2) +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE a.b_id2 = id1 AND a.b_id1 = id1); + +-- should not remove semi join (no check for b_id2) +EXPLAIN (COSTS OFF) +SELECT id FROM a WHERE EXISTS(SELECT 1 FROM b WHERE a.b_id1 = id1 AND a.b_id1 = id2); + +ROLLBACK; + create temp table parent (k int primary key, pd int); create temp table child (k int unique, cd int); insert into parent values (1, 10), (2, 20), (3, 30);