diff --git a/src/backend/catalog/pg_constraint.c b/src/backend/catalog/pg_constraint.c index 28e3aed..274eaf8 100644 --- a/src/backend/catalog/pg_constraint.c +++ b/src/backend/catalog/pg_constraint.c @@ -871,6 +871,143 @@ get_domain_constraint_oid(Oid typid, const char *conname, bool missing_ok) } /* + * identify_key_vars + * Returns a Bitmapset with bits set to mark which 0-based varlist items + * belong to 'relid's PRIMARY KEY. + * + * On successful identification of primary key columns 'constraintOid' is set + * to the OID of the primary key constraint. On failure to match to *all* + * primary key columns, NULL is returned and 'constraintOid' is left + * unchanged. + */ +Bitmapset * +identify_key_vars(Oid relid, Index varno, Index varlevelsup, List *varlist, + Oid *constraintOid) +{ + Relation pg_constraint; + HeapTuple tuple; + SysScanDesc scan; + ScanKeyData skey[1]; + Bitmapset *varlistmatches; + + /* Scan pg_constraint for constraints of the target rel */ + pg_constraint = heap_open(ConstraintRelationId, AccessShareLock); + + ScanKeyInit(&skey[0], + Anum_pg_constraint_conrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relid)); + + scan = systable_beginscan(pg_constraint, ConstraintRelidIndexId, true, + NULL, 1, skey); + + varlistmatches = NULL; + + while (HeapTupleIsValid(tuple = systable_getnext(scan))) + { + Form_pg_constraint con = (Form_pg_constraint) GETSTRUCT(tuple); + Datum adatum; + bool isNull; + ArrayType *arr; + int16 *attnums; + int numkeys; + int i; + + /* + * For now we must ignore anything apart from PRIMARY KEY constraints. + * Technically we could look at UNIQUE indexes too, however we'd also + * have to ensure that each column of the unique index has a NOT NULL + * constraint, however since NOT NULL constraints currently don't have + * a pg_constraint entry this means that we're unable to track the + * dependency on the NOT NULL in order to invalidate any cached query + * plans in the event that someone drops the NOT NULL constraint. In + * any case it seems strange to go to more lengths here than what is + * done by check_functional_grouping(). + */ + if (con->contype != CONSTRAINT_PRIMARY) + continue; + + /* + * Constraint must be non-deferrable, but no point in looking for + * another primary key, as there can only be one, so break; + */ + if (con->condeferrable) + break; + + /* Extract the conkey array, ie, attnums of PK's columns */ + adatum = heap_getattr(tuple, Anum_pg_constraint_conkey, + RelationGetDescr(pg_constraint), &isNull); + if (isNull) + elog(ERROR, "null conkey for constraint %u", + HeapTupleGetOid(tuple)); + arr = DatumGetArrayTypeP(adatum); /* ensure not toasted */ + numkeys = ARR_DIMS(arr)[0]; + if (ARR_NDIM(arr) != 1 || + numkeys < 0 || + ARR_HASNULL(arr) || + ARR_ELEMTYPE(arr) != INT2OID) + elog(ERROR, "conkey is not a 1-D smallint array"); + attnums = (int16 *) ARR_DATA_PTR(arr); + + for (i = 0; i < numkeys; i++) + { + AttrNumber attnum = attnums[i]; + ListCell *lc; + bool found_col = false; + int varlistidx = 0; + + foreach(lc, varlist) + { + Var *gvar = (Var *) lfirst(lc); + + if (IsA(gvar, Var) && + gvar->varno == varno && + gvar->varlevelsup == varlevelsup && + gvar->varattno == attnum) + { + varlistmatches = bms_add_member(varlistmatches, varlistidx); + found_col = true; + /* + * We can break out of the loop here, as no duplicate Vars + * should exist as these will have been removed during + * parse analysis. + */ + break; + } + varlistidx++; + } + + if (!found_col) + { + /* + * No match found for this index key. Since the relation has + * only one PRIMARY KEY we can skip looking through any other + * constraints. + */ + varlistmatches = NULL; + break; + } + } + + /* + * Did we match all PRIMARY KEY Vars? If so we can return this match. + * There's no need to look for a better match as a relation can only + * have one PRIMARY KEY constraint. + */ + if (i == numkeys) + *constraintOid = HeapTupleGetOid(tuple); + + break; + } + + systable_endscan(scan); + + heap_close(pg_constraint, AccessShareLock); + + return varlistmatches; +} + +/* * Determine whether a relation can be proven functionally dependent on * a set of grouping columns. If so, return TRUE and add the pg_constraint * OIDs of the constraints needed for the proof to the *constraintDeps list. diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index fed1acc..4684ba2 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -21,6 +21,7 @@ #include "access/htup_details.h" #include "access/parallel.h" #include "access/xact.h" +#include "catalog/pg_constraint.h" #include "executor/executor.h" #include "executor/nodeAgg.h" #include "foreign/fdwapi.h" @@ -3160,6 +3161,162 @@ limit_needed(Query *parse) return false; /* don't need a Limit plan node */ } +/* + * remove_useless_groupby_columns + * Removes columns from the GROUP BY clause which are redundant due to + * being functionally dependant on one or more other column which are + * present. + * + * We do support non-aggregated columns in the query targetlist which are + * functionally dependant on the primary key, but not all relational databases + * allow this, so it can be fairly common that queries are written to have the + * GROUP BY clause include all of the columns which are not aggregated in the + * query's targetlist. These columns have no need to be there as long as the + * columns which make up the table's PRIMARY KEY are present in the GROUP BY + * clause. Here we test to see if the primary key columns are present for each + * relation which make up the columns of the GROUP BY clause, and if so we + * remove any which are not part of the primary key. + * + * In reality we could do more than just the primary key columns here as UNIQUE + * indexes with NOT NULL constraints on the columns allow us to also determine + * functional dependencies. However we mustn't go any further than what's done + * by check_functional_grouping(). + */ +static void +remove_useless_groupby_columns(PlannerInfo *root) +{ + List **relvarlist; + Bitmapset **surplusvars; + ListCell *lc; + int relid; + bool anysurplus = false; + Query *parse = root->parse; + + if (parse->groupingSets) + return; + + /* Nothing do to when there's only 1 group by expression */ + if (list_length(parse->groupClause) < 2) + return; + + relvarlist = (List **) palloc0(sizeof(List *) * (list_length(parse->rtable) + 1)); + surplusvars = (Bitmapset **) palloc0(sizeof(Bitmapset *) * (list_length(parse->rtable) + 1)); + + /* + * Pre-process the GROUP BY clause Vars from each relation into + * 'relvarlist' indexed by the relid + */ + foreach (lc, parse->groupClause) + { + SortGroupClause *sgc = (SortGroupClause *) lfirst(lc); + TargetEntry *tle; + Var *var; + + tle = get_sortgroupclause_tle(sgc, parse->targetList); + var = (Var *) tle->expr; + + /* + * XXX it may be useful to also look at non-Vars here. If the clause + * is an Expr containing Vars from a single relation then we may also + * be able to remove it, providing that we find all PRIMARY KEY columns + * elsewhere in the GROUP BY clause. Perhaps we can improve on this + * later, but for now it seems better to keep it simple. + */ + if (!IsA(var, Var)) + continue; + + Assert(var->varno <= list_length(parse->rtable)); + relvarlist[var->varno] = lappend(relvarlist[var->varno], var); + } + + /* + * Look at each rtable relation and check if any Vars were found to belong + * to this relation by the logic above. Note that since identify_key_vars() + * only concerns itself with PRIMARY KEY constraints to prove uniqueness, + * and that since PRIMARY KEYs cannot be based on expressions, then we need + * only concern ourselves here with plain Vars. + */ + relid = 1; + foreach(lc, parse->rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); + List *varlist = relvarlist[relid]; + Bitmapset *varlistmatches; + Oid constraintOid; + ListCell *lc2; + + /* + * Clearly there's no room for removing any items from the GROUP BY + * clause there are fewer than two Vars belonging to this rel. + */ + if (varlist == NULL || list_length(varlist) < 2) + continue; + + varlistmatches = identify_key_vars(rte->relid, relid, 0, varlist, + &constraintOid); + + /* + * If we have some varlist matches and varlist has additional columns, + * then each of these additional column may be removed from the + * GROUP BY clause. We'll mark the varattno of each variable that's not + * needed. + */ + if (varlistmatches != NULL && + bms_num_members(varlistmatches) < list_length(varlist)) + { + int varidx = 0; + foreach(lc2, varlist) + { + Var *var = (Var *) lfirst(lc2); + + if (!bms_is_member(varidx, varlistmatches)) + { + surplusvars[relid] = bms_add_member(surplusvars[relid], + var->varattno); + } + + varidx++; + } + + /* + * Mark that the GROUP BY clause needs to be rebuilt. We'll do this + * just once when we've completely processing any other relation's + * columns which are present in the GROUP BY clause. + */ + anysurplus = true; + parse->constraintDeps = lappend_oid(parse->constraintDeps, + constraintOid); + } + relid++; + } + + /* + * If we found any surplus Vars in the GROUP BY clause, then we'll build + * a new GROUP BY clause without these surplus Vars. + */ + if (anysurplus) + { + List *new_groupby = NIL; + + foreach (lc, root->parse->groupClause) + { + SortGroupClause *sgc = (SortGroupClause *) lfirst(lc); + TargetEntry *tle; + Var *var; + + tle = get_sortgroupclause_tle(sgc, root->parse->targetList); + var = (Var *) tle->expr; + + if (!IsA(var, Var) || + !bms_is_member(var->varattno, surplusvars[var->varno])) + new_groupby = lappend(new_groupby, sgc); + + /* XXX do we to alter tleSortGroupRef to remove gaps? */ + } + root->parse->groupClause = new_groupby; + } +} + /* * preprocess_groupclause - do preparatory work on GROUP BY clause @@ -3205,6 +3362,12 @@ preprocess_groupclause(PlannerInfo *root, List *force) return new_groupclause; } + /* + * Remove any GROUP BY clause items which are functionally dependent on + * other clause items + */ + remove_useless_groupby_columns(root); + /* If no ORDER BY, nothing useful to do here */ if (parse->sortClause == NIL) return parse->groupClause; diff --git a/src/include/catalog/pg_constraint.h b/src/include/catalog/pg_constraint.h index 07dbcea..934e8db 100644 --- a/src/include/catalog/pg_constraint.h +++ b/src/include/catalog/pg_constraint.h @@ -250,6 +250,9 @@ extern void AlterConstraintNamespaces(Oid ownerId, Oid oldNspId, extern Oid get_relation_constraint_oid(Oid relid, const char *conname, bool missing_ok); extern Oid get_domain_constraint_oid(Oid typid, const char *conname, bool missing_ok); +extern Bitmapset *identify_key_vars(Oid relid, Index varno, Index varlevelsup, + List *varlist, Oid *constraintOid); + extern bool check_functional_grouping(Oid relid, Index varno, Index varlevelsup, List *grouping_columns, diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 64f046e..1c6c8e6 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -3918,19 +3918,19 @@ select d.* from d left join (select distinct * from b) s -- not in the join condition explain (costs off) select d.* from d left join (select * from b group by b.id, b.c_id) s - on d.a = s.id; - QUERY PLAN ---------------------------------------------- + on d.a = s.c_id; + QUERY PLAN +--------------------------------------- Merge Left Join - Merge Cond: (d.a = s.id) + Merge Cond: (d.a = s.c_id) -> Sort Sort Key: d.a -> Seq Scan on d -> Sort - Sort Key: s.id + Sort Key: s.c_id -> Subquery Scan on s -> HashAggregate - Group Key: b.id, b.c_id + Group Key: b.id -> Seq Scan on b (11 rows) diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 0358d00..e11e0a5 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -1266,7 +1266,7 @@ select d.* from d left join (select distinct * from b) s -- not in the join condition explain (costs off) select d.* from d left join (select * from b group by b.id, b.c_id) s - on d.a = s.id; + on d.a = s.c_id; -- similarly, but keying off a DISTINCT clause explain (costs off)