diff --git a/src/backend/catalog/pg_constraint.c b/src/backend/catalog/pg_constraint.c index 28e3aed..3e70d07 100644 --- a/src/backend/catalog/pg_constraint.c +++ b/src/backend/catalog/pg_constraint.c @@ -17,6 +17,7 @@ #include "access/genam.h" #include "access/heapam.h" #include "access/htup_details.h" +#include "access/sysattr.h" #include "catalog/dependency.h" #include "catalog/indexing.h" #include "catalog/objectaccess.h" @@ -871,6 +872,90 @@ get_domain_constraint_oid(Oid typid, const char *conname, bool missing_ok) } /* + * get_primary_key_attnos + * Returns a Bitmapset of each Var's varattno offset by + * FirstLowInvalidHeapAttributeNumber which makes up the PRIMARY KEY of + * 'relid'. If no PRIMARY KEY exists on 'relid' then NULL is returned. + * + * When 'deferrableOk' is false we will return NULL if the PRIMARY KEY + * constraint is set to deferrable. + * 'constraintOid' will be set to the OID of the PRIMARY KEY constraint, + * although this is only set when the function does not return NULL. + */ +Bitmapset * +get_primary_key_attnos(Oid relid, bool deferrableOk, Oid *constraintOid) +{ + Relation pg_constraint; + HeapTuple tuple; + SysScanDesc scan; + ScanKeyData skey[1]; + Bitmapset *pkattnos = NULL; + + /* Scan pg_constraint for constraints of the target rel */ + pg_constraint = heap_open(ConstraintRelationId, AccessShareLock); + + ScanKeyInit(&skey[0], + Anum_pg_constraint_conrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relid)); + + scan = systable_beginscan(pg_constraint, ConstraintRelidIndexId, true, + NULL, 1, skey); + + while (HeapTupleIsValid(tuple = systable_getnext(scan))) + { + Form_pg_constraint con = (Form_pg_constraint) GETSTRUCT(tuple); + Datum adatum; + bool isNull; + ArrayType *arr; + int16 *attnums; + int numkeys; + int i; + + /* Skip constraints which are not PRIMARY KEYs */ + if (con->contype != CONSTRAINT_PRIMARY) + continue; + + /* + * If the primary key is deferrable, but we've been instructed to + * ignore deferrable constraints, then we'll simply break and exit + * returning NULL. There can only be a single primary key on a table, + * so it's senseless to look for another. + */ + if (con->condeferrable && !deferrableOk) + break; + + /* Extract the conkey array, ie, attnums of PK's columns */ + adatum = heap_getattr(tuple, Anum_pg_constraint_conkey, + RelationGetDescr(pg_constraint), &isNull); + if (isNull) + elog(ERROR, "null conkey for constraint %u", + HeapTupleGetOid(tuple)); + arr = DatumGetArrayTypeP(adatum); /* ensure not toasted */ + numkeys = ARR_DIMS(arr)[0]; + if (ARR_NDIM(arr) != 1 || + numkeys < 0 || + ARR_HASNULL(arr) || + ARR_ELEMTYPE(arr) != INT2OID) + elog(ERROR, "conkey is not a 1-D smallint array"); + attnums = (int16 *) ARR_DATA_PTR(arr); + + for (i = 0; i < numkeys; i++) + pkattnos = bms_add_member(pkattnos, + attnums[i] - FirstLowInvalidHeapAttributeNumber); + + *constraintOid = HeapTupleGetOid(tuple); + break; + } + + systable_endscan(scan); + + heap_close(pg_constraint, AccessShareLock); + + return pkattnos; +} + +/* * Determine whether a relation can be proven functionally dependent on * a set of grouping columns. If so, return TRUE and add the pg_constraint * OIDs of the constraints needed for the proof to the *constraintDeps list. diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index c0ec905..b32a369 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -20,7 +20,9 @@ #include "access/htup_details.h" #include "access/parallel.h" +#include "access/sysattr.h" #include "access/xact.h" +#include "catalog/pg_constraint.h" #include "executor/executor.h" #include "executor/nodeAgg.h" #include "foreign/fdwapi.h" @@ -87,6 +89,7 @@ static double preprocess_limit(PlannerInfo *root, double tuple_fraction, int64 *offset_est, int64 *count_est); static bool limit_needed(Query *parse); +static void remove_useless_groupby_columns(PlannerInfo *root); static List *preprocess_groupclause(PlannerInfo *root, List *force); static List *extract_rollup_sets(List *groupingSets); static List *reorder_grouping_sets(List *groupingSets, List *sortclause); @@ -664,6 +667,9 @@ subquery_planner(PlannerGlobal *glob, Query *parse, } parse->havingQual = (Node *) newHaving; + /* remove any dedundant GROUP BY columns */ + remove_useless_groupby_columns(root); + /* * If we have any outer joins, try to reduce them to plain inner joins. * This step is most easily done after we've done expression @@ -3160,6 +3166,176 @@ limit_needed(Query *parse) return false; /* don't need a Limit plan node */ } +/* + * remove_useless_groupby_columns + * Removes columns from the GROUP BY clause which are redundant due to + * being functionally dependant on one or more other column which are + * present. + * + * We do support non-aggregated column in the query's targetlist which are not + * present in the GROUP BY clause, providing these columns can be determined to + * be functionally dependant on the relation's primary key constraint. There's + * quite a few relational databases which don't allow this, so this means it + * can be quite common that queries are written to have the GROUP BY clause + * include all of the columns which are not aggregated in the query's + * targetlist. These columns have no need to be there as long as the columns + * which make up the table's PRIMARY KEY are present in the GROUP BY clause. + * Here we test to see if the primary key columns are present for each relation + * which make up the columns of the GROUP BY clause, and if so we remove any + * that belong to the same relation which are not part of the primary key. + * + * In reality we could do more than just the primary key columns here as UNIQUE + * indexes with NOT NULL constraints on the columns allow us to also determine + * functional dependencies. However we mustn't go any further than what's done + * by check_functional_grouping(). + */ +static void +remove_useless_groupby_columns(PlannerInfo *root) +{ + Bitmapset **groupbyattnos; + Bitmapset **surplusvars; + ListCell *lc; + int relid; + Query *parse = root->parse; + + /* Nothing do to when there are fewer than two group by expression */ + if (list_length(parse->groupClause) < 2) + return; + + /* Don't fiddle with the GROUP BY clause if the query has grouping sets */ + if (parse->groupingSets) + return; + + groupbyattnos = (Bitmapset **) palloc0(sizeof(Bitmapset *) * + (list_length(parse->rtable) + 1)); + surplusvars = NULL; /* don't allocate unless required */ + + /* + * Pre-process the GROUP BY clause and populate groupbyattnos with + * varattnos of Vars which belong to the clause. + */ + foreach (lc, parse->groupClause) + { + SortGroupClause *sgc = (SortGroupClause *) lfirst(lc); + TargetEntry *tle; + Var *var; + + tle = get_sortgroupclause_tle(sgc, parse->targetList); + var = (Var *) tle->expr; + + /* + * Ignore non-Vars or Vars from a higher level. + * XXX it may be useful to put more effort into removing non-Vars here. + * If the clause is an Expr containing Vars from a single relation then + * we may also be able to remove it, providing that we find all PRIMARY + * KEY columns elsewhere in the GROUP BY clause. Perhaps we can improve + * on this later, but for now it seems better to keep it simple. + */ + if (!IsA(var, Var) || var->varlevelsup > 0) + continue; + + relid = var->varno; + + Assert(relid <= list_length(parse->rtable)); + groupbyattnos[relid] = bms_add_member(groupbyattnos[relid], + var->varattno - FirstLowInvalidHeapAttributeNumber); + } + + /* + * Look at each relation and check if any of the Vars which were found in + * the GROUP BY clause are not needed due to all the PK constraint Vars + * being present in the clause. We'll take note of any surplus Vars which + * we find rather than removing them from the GROUP BY clause right away. + * This allows us to make a single pass over the GROUP BY clause, rather + * than one pass per relation. + */ + relid = 0; + foreach(lc, parse->rtable) + { + RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc); + Bitmapset *relattnos; + Bitmapset *pkattnos; + Oid constraintOid; + + relid++; + + if (rte->rtekind != RTE_RELATION) + continue; + + relattnos = groupbyattnos[relid]; + + /* + * Clearly there's no room for removing Vars from this rel from the + * GROUP BY clause if there were fewer than two Vars found to belong to + * this rel. + */ + if (bms_num_members(relattnos) < 2) + continue; + + pkattnos = get_primary_key_attnos(rte->relid, false, &constraintOid); + + /* + * Can't remove any columns for this rel if there is no suitable + * primary key constraint. + */ + if (pkattnos == NULL) + continue; + + /* + * If the pkattnos is a proper subset of relattnos then we have some + * items in the GROUP BY which can be removed. + */ + if (bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1) + { + parse->constraintDeps = lappend_oid(parse->constraintDeps, + constraintOid); + + /* + * Now record which Vars are not required in the GROUP BY. We must + * first allocate space to do this, which we've delayed until here + * to save on unnecessarily allocations where no removals are + * possible. + */ + if (surplusvars == NULL) + surplusvars = (Bitmapset **) palloc0(sizeof(Bitmapset *) * + (list_length(parse->rtable) + 1)); + + /* + * Mark all Vars which we originally collected above minus the + * primary key Vars as surplus. We'll remove all of these from the + * GROUP BY in one pass, once we've checked all other relations + * which are involved in the GROUP BY clause. + */ + surplusvars[relid] = bms_difference(relattnos, pkattnos); + } + } + + /* build a new GROUP BY clause with all of the surplus Vars removed */ + if (surplusvars != NULL) + { + List *new_groupby = NIL; + + foreach (lc, root->parse->groupClause) + { + SortGroupClause *sgc = (SortGroupClause *) lfirst(lc); + TargetEntry *tle; + Var *var; + + tle = get_sortgroupclause_tle(sgc, root->parse->targetList); + var = (Var *) tle->expr; + + /* re-add non-Vars and anything not marked as surplus */ + if (!IsA(var, Var) || + !bms_is_member(var->varattno - + FirstLowInvalidHeapAttributeNumber, surplusvars[var->varno])) + new_groupby = lappend(new_groupby, sgc); + + /* XXX do we to alter tleSortGroupRef to remove gaps? */ + } + root->parse->groupClause = new_groupby; + } +} + /* * preprocess_groupclause - do preparatory work on GROUP BY clause diff --git a/src/include/catalog/pg_constraint.h b/src/include/catalog/pg_constraint.h index 07dbcea..bb8983c 100644 --- a/src/include/catalog/pg_constraint.h +++ b/src/include/catalog/pg_constraint.h @@ -250,6 +250,9 @@ extern void AlterConstraintNamespaces(Oid ownerId, Oid oldNspId, extern Oid get_relation_constraint_oid(Oid relid, const char *conname, bool missing_ok); extern Oid get_domain_constraint_oid(Oid typid, const char *conname, bool missing_ok); +extern Bitmapset *get_primary_key_attnos(Oid relid, bool deferrableOk, + Oid *constraintOid); + extern bool check_functional_grouping(Oid relid, Index varno, Index varlevelsup, List *grouping_columns, diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index de826b5..65f6d4f 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -751,6 +751,69 @@ select max(unique2), generate_series(1,3) as g from tenk1 order by g desc; 9999 | 1 (3 rows) +-- check group by clause is processed correctly to remove surplus columns +create temp table t1 (a int, b int, c int, d int, primary key (a, b)); +create temp table t2 (x int, y int, z int, primary key (x, y)); +create temp table t3 (a int, b int, c int, primary key(a, b) deferrable); +-- ensure only primary key columns remain in group by +explain (costs off) select * from t1 group by a,b,c,d; + QUERY PLAN +---------------------- + HashAggregate + Group Key: a, b + -> Seq Scan on t1 +(3 rows) + +-- ensure c and d are not removed since the complete PK is not present in the +-- group by clause +explain (costs off) select a,c from t1 group by a,c,d; + QUERY PLAN +---------------------- + HashAggregate + Group Key: a, c, d + -> Seq Scan on t1 +(3 rows) + +-- ensure non-pk columns are removed from group by for all relations. +explain (costs off) select * from t1 +inner join t2 on t1.a = t2.x and t1.b = t2.y +group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.y,t2.z; + QUERY PLAN +------------------------------------------------------- + Group + Group Key: t1.a, t1.b, t2.x, t2.y + -> Merge Join + Merge Cond: ((t1.a = t2.x) AND (t1.b = t2.y)) + -> Index Scan using t1_pkey on t1 + -> Index Scan using t2_pkey on t2 +(6 rows) + +-- ensure non-pk columns are only removed from t1. +explain (costs off) select t1.*,t2.x,t2.z from t1 +inner join t2 on t1.a = t2.x and t1.b = t2.y +group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.z; + QUERY PLAN +------------------------------------------------------- + HashAggregate + Group Key: t1.a, t1.b, t2.x, t2.z + -> Merge Join + Merge Cond: ((t1.a = t2.x) AND (t1.b = t2.y)) + -> Index Scan using t1_pkey on t1 + -> Index Scan using t2_pkey on t2 +(6 rows) + +-- ensure group by columns are not removed when pk is deferrable +explain (costs off) select * from t3 group by a,b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: a, b, c + -> Seq Scan on t3 +(3 rows) + +drop table t1; +drop table t2; +drop table t3; -- try it on an inheritance tree create table minmaxtest(f1 int); create table minmaxtest1() inherits (minmaxtest); diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 64f046e..5bb406c 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -3919,8 +3919,8 @@ select d.* from d left join (select distinct * from b) s explain (costs off) select d.* from d left join (select * from b group by b.id, b.c_id) s on d.a = s.id; - QUERY PLAN ---------------------------------------------- + QUERY PLAN +--------------------------------------- Merge Left Join Merge Cond: (d.a = s.id) -> Sort @@ -3930,7 +3930,7 @@ select d.* from d left join (select * from b group by b.id, b.c_id) s Sort Key: s.id -> Subquery Scan on s -> HashAggregate - Group Key: b.id, b.c_id + Group Key: b.id -> Seq Scan on b (11 rows) diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 8d501dc..d09ff08 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -270,6 +270,35 @@ explain (costs off) select max(unique2), generate_series(1,3) as g from tenk1 order by g desc; select max(unique2), generate_series(1,3) as g from tenk1 order by g desc; +-- check group by clause is processed correctly to remove surplus columns +create temp table t1 (a int, b int, c int, d int, primary key (a, b)); +create temp table t2 (x int, y int, z int, primary key (x, y)); +create temp table t3 (a int, b int, c int, primary key(a, b) deferrable); + +-- ensure only primary key columns remain in group by +explain (costs off) select * from t1 group by a,b,c,d; + +-- ensure c and d are not removed since the complete PK is not present in the +-- group by clause +explain (costs off) select a,c from t1 group by a,c,d; + +-- ensure non-pk columns are removed from group by for all relations. +explain (costs off) select * from t1 +inner join t2 on t1.a = t2.x and t1.b = t2.y +group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.y,t2.z; + +-- ensure non-pk columns are only removed from t1. +explain (costs off) select t1.*,t2.x,t2.z from t1 +inner join t2 on t1.a = t2.x and t1.b = t2.y +group by t1.a,t1.b,t1.c,t1.d,t2.x,t2.z; + +-- ensure group by columns are not removed when pk is deferrable +explain (costs off) select * from t3 group by a,b,c; + +drop table t1; +drop table t2; +drop table t3; + -- try it on an inheritance tree create table minmaxtest(f1 int); create table minmaxtest1() inherits (minmaxtest);