From 99d9072b2fb16e526a1478bee575943045bab786 Mon Sep 17 00:00:00 2001 From: jian he Date: Thu, 28 Nov 2024 12:37:22 +0800 Subject: [PATCH v5] remove useless group by columns via unique-not-null In the `remove_useless_groupby_columns` function, use a unique index and a not-null constraint to eliminate unnecessary columns. Here, the primary key is treated as a "unique index and not-null". If there are multiple candidates, use the unique index with the fewest columns to remove the other columns in groupby expression. discussion: https://postgr.es/m/327990c8-b9b2-4b0c-bffb-462249f82de0@Spar --- src/backend/catalog/index.c | 103 ++++++++++++++++++++++ src/backend/optimizer/plan/planner.c | 55 +++++++++--- src/include/catalog/index.h | 1 + src/test/regress/expected/aggregates.out | 106 +++++++++++++++++++++++ src/test/regress/sql/aggregates.sql | 51 +++++++++++ 5 files changed, 305 insertions(+), 11 deletions(-) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index f9bb721c5f..80a1e44167 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -4257,3 +4257,106 @@ RestoreReindexState(const void *reindexstate) /* Note the worker has its own transaction nesting level */ reindexingNestLevel = GetCurrentTransactionNestLevel(); } + +/* + * get_unique_not_null_attnos + * Returns a List of Bitmapsets with the attribute numbers (offset by + * FirstLowInvalidHeapAttributeNumber) of all valid non-partial unique + * indexes which contain only non-nullable columns. + */ +List * +get_unique_not_null_attnos(Oid relid) +{ + Bitmapset *not_null_attnos = NULL; + Relation pg_index; + HeapTuple indexTuple; + SysScanDesc scan; + ScanKeyData skey; + List *not_null_cs; + List *indlist = NIL; + + /* Use CookedConstraint to fetch not-null constraint. */ + not_null_cs = RelationGetNotNullConstraints(relid, true, false); + if (not_null_cs == NIL) + return NULL; + + foreach_ptr(CookedConstraint, cc, not_null_cs) + not_null_attnos = bms_add_member(not_null_attnos, cc->attnum - FirstLowInvalidHeapAttributeNumber); + + /* Scan pg_index for unique index of the target rel */ + pg_index = table_open(IndexRelationId, AccessShareLock); + + ScanKeyInit(&skey, + Anum_pg_index_indrelid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relid)); + scan = systable_beginscan(pg_index, IndexIndrelidIndexId, true, + NULL, 1, &skey); + + while (HeapTupleIsValid(indexTuple = systable_getnext(scan))) + { + Bitmapset *unique_notnull_attnos = NULL; + Form_pg_index index = (Form_pg_index) GETSTRUCT(indexTuple); + Datum adatum; + bool isNull; + bool all_notnull = true; + int numkeys; + Datum *keys; + int nelems; + + /* we're only interested in unique index */ + if (!index->indisunique) + continue; + + /* Skip invalid or deferred index */ + if (!index->indisvalid || !index->indimmediate) + continue; + + /* Skip expression index or predicate index */ + if (!heap_attisnull(indexTuple, Anum_pg_index_indpred, NULL) || + !heap_attisnull(indexTuple, Anum_pg_index_indexprs, NULL)) + continue; + + /* Extract the pg_index->indkey int2vector */ + adatum = heap_getattr(indexTuple, Anum_pg_index_indkey, + RelationGetDescr(pg_index), &isNull); + if (isNull) + elog(ERROR, "null int2vector for index %u", index->indexrelid); + + deconstruct_array_builtin(DatumGetArrayTypeP(adatum), INT2OID, + &keys, + NULL, + &nelems); + + if (nelems != index->indnatts) + elog(ERROR, "corrupted int2vector for index %u", index->indexrelid); + + Assert(nelems >= index->indnkeyatts); + numkeys = index->indnkeyatts; + + for (int i = 0; i < numkeys; i++) + { + int attno = DatumGetInt16(keys[i]) - FirstLowInvalidHeapAttributeNumber; + + /* ignore this index if any columns are NULLable */ + if (!bms_is_member(attno, not_null_attnos)) + { + all_notnull = false; + break; + } + + unique_notnull_attnos = bms_add_member(unique_notnull_attnos, attno); + } + + /* if the index has no NULLable columns, add it to the list */ + if (all_notnull) + indlist = lappend(indlist, unique_notnull_attnos); + } + + systable_endscan(scan); + + /* TODO sort the indlist by index OID */ + + table_close(pg_index, AccessShareLock); + return indlist; +} \ No newline at end of file diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index b665a7762e..1803e5556a 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -27,6 +27,7 @@ #include "catalog/pg_inherits.h" #include "catalog/pg_proc.h" #include "catalog/pg_type.h" +#include "catalog/index.h" #include "executor/executor.h" #include "foreign/fdwapi.h" #include "jit/jit.h" @@ -2805,8 +2806,10 @@ remove_useless_groupby_columns(PlannerInfo *root) { RangeTblEntry *rte = lfirst_node(RangeTblEntry, lc); Bitmapset *relattnos; - Bitmapset *pkattnos; - Oid constraintOid; + Bitmapset *best_attnos = NULL; + List *unique_notnull_attnos; + int32 best_num_attnos = PG_INT32_MAX; + ListCell *lc2; relid++; @@ -2828,18 +2831,48 @@ remove_useless_groupby_columns(PlannerInfo *root) continue; /* - * Can't remove any columns for this rel if there is no suitable - * (i.e., nondeferrable) primary key constraint. + * Fetch details about all suitable unique indexes with NOT NULL + * constraints on all columns. */ - pkattnos = get_primary_key_attnos(rte->relid, false, &constraintOid); - if (pkattnos == NULL) - continue; + unique_notnull_attnos = get_unique_not_null_attnos(rte->relid); /* - * If the primary key is a proper subset of relattnos then we have - * some items in the GROUP BY that can be removed. + * Now search each index to check if there are any indexes where the + * indexed columns are a subset of the GROUP BY columns for this + * relation. */ - if (bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1) + foreach(lc2, unique_notnull_attnos) + { + Bitmapset *ind_attnos = lfirst_node(Bitmapset, lc2); + int members; + + /* + * Skip any indexes where the indexed columns aren't a subset of + * the GROUP BY. + */ + if (bms_subset_compare(ind_attnos, relattnos) != BMS_SUBSET1) + continue; + + /* + * Record the attribute numbers from the index with the fewest + * columns. This allows the largest number of columns to be + * removed from the GROUP BY clause. In the future, we may wish + * to consider using the narrowest set of columns and looking at + * pg_statistic.stawidth as it might be better to use an index + * with, say two INT4s, rather than, say, one long varlena column. + */ + members = bms_num_members(ind_attnos); + + if (members < best_num_attnos) + { + best_attnos = ind_attnos; + best_num_attnos = members; + } + } + + list_free(unique_notnull_attnos); + + if (best_attnos != NULL) { /* * To easily remember whether we've found anything to do, we don't @@ -2850,7 +2883,7 @@ remove_useless_groupby_columns(PlannerInfo *root) (list_length(parse->rtable) + 1)); /* Remember the attnos of the removable columns */ - surplusvars[relid] = bms_difference(relattnos, pkattnos); + surplusvars[relid] = bms_difference(relattnos, best_attnos); } } diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 2dea96f47c..dacc0f6ebd 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -175,6 +175,7 @@ extern void RestoreReindexState(const void *reindexstate); extern void IndexSetParentIndex(Relation partitionIdx, Oid parentOid); +extern List *get_unique_not_null_attnos(Oid relid); /* * itemptr_encode - Encode ItemPointer as int64/int8 diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index 1e682565d1..c60615cfbd 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -1454,6 +1454,112 @@ drop table t2; drop table t3; drop table p_t1; -- +-- Test removal of redundant GROUP BY columns using unique not null index. +-- +create temp table t1 (a int, b int, c int, unique(c)); +create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c)); +create temp table t3 (a int, b int, c int not null, d int not null, primary key (a, b), unique(c, d)); +create temp table t4 (a int not null, b int not null, + c int not null, d int not null, + unique (a, b), unique(b, c), unique(a, c, d)); +create table t5 (a int not null, b int not null, + c int not null, d int not null, + unique (a, b)) partition by range(a, b); +create table t5_0 (like t5 including all); +create table t5_1 (like t5 including all); +alter table t5 attach partition t5_0 for values from (0,0) to (10, 10); +alter table t5 attach partition t5_1 for values from (10,10) to (21, 21); +insert into t5 select g,g+1, g+2, g+3 from generate_series(1, 20) g; +create temp table t6 (a int, b int not null, c int, d int, unique(b, c)); +-- Test unique index without not null constraint should not be used. +explain (costs off) select * from t1 group by a,b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: a, b, c + -> Seq Scan on t1 +(3 rows) + +-- using unique index remove useless group by columns +-- since it has less columns compare with primary key +explain (costs off) select * from t2 group by a,b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: c + -> Seq Scan on t2 +(3 rows) + +--can remove column b, since c has unique index and is not-null +explain (costs off) select count(*) from t2 group by b,c; + QUERY PLAN +---------------------- + HashAggregate + Group Key: c + -> Seq Scan on t2 +(3 rows) + +--more than one candidate can be used to remove other column. +--optimizer will use primary key or unique index. +explain (costs off) select * from t3 group by a,b,c,d; + QUERY PLAN +---------------------- + HashAggregate + Group Key: a, b + -> Seq Scan on t3 +(3 rows) + +-- Test unique not null indices have overlap. +explain (costs off) select * from t4 group by a,b,c,d; + QUERY PLAN +---------------------- + HashAggregate + Group Key: a, b + -> Seq Scan on t4 +(3 rows) + +--can remove column d from groupby expression, since we have unique index(b,c) +explain (costs off) select count(*) from t4 group by c, d, b; + QUERY PLAN +---------------------- + HashAggregate + Group Key: c, b + -> Seq Scan on t4 +(3 rows) + +--partition case, can reduce to a,b. +explain (costs off) select count(*) from t5 group by a, d, c,b; + QUERY PLAN +----------------------------------- + HashAggregate + Group Key: t5.a, t5.b + -> Append + -> Seq Scan on t5_0 t5_1 + -> Seq Scan on t5_1 t5_2 +(5 rows) + +--can reduce to a,b, groupby order or having duplicates does not matter +explain (costs off) select count(*) from t5 group by b,a,a, d, c; + QUERY PLAN +----------------------------------- + HashAggregate + Group Key: t5.b, t5.a + -> Append + -> Seq Scan on t5_0 t5_1 + -> Seq Scan on t5_1 t5_2 +(5 rows) + +---only part of unique key columns is not-null, cannot use it to remove columns. +explain(costs off) select count(*) from t6 group by b,c,a,d; + QUERY PLAN +------------------------- + HashAggregate + Group Key: b, c, a, d + -> Seq Scan on t6 +(3 rows) + +drop table t1, t2,t3,t4,t5,t6; +-- -- Test GROUP BY matching of join columns that are type-coerced due to USING -- create temp table t1(f1 int, f2 int); diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 4885daffe6..b4b16b29a8 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -512,6 +512,57 @@ drop table t2; drop table t3; drop table p_t1; +-- +-- Test removal of redundant GROUP BY columns using unique not null index. +-- +create temp table t1 (a int, b int, c int, unique(c)); +create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c)); +create temp table t3 (a int, b int, c int not null, d int not null, primary key (a, b), unique(c, d)); +create temp table t4 (a int not null, b int not null, + c int not null, d int not null, + unique (a, b), unique(b, c), unique(a, c, d)); + +create table t5 (a int not null, b int not null, + c int not null, d int not null, + unique (a, b)) partition by range(a, b); +create table t5_0 (like t5 including all); +create table t5_1 (like t5 including all); +alter table t5 attach partition t5_0 for values from (0,0) to (10, 10); +alter table t5 attach partition t5_1 for values from (10,10) to (21, 21); +insert into t5 select g,g+1, g+2, g+3 from generate_series(1, 20) g; + +create temp table t6 (a int, b int not null, c int, d int, unique(b, c)); + +-- Test unique index without not null constraint should not be used. +explain (costs off) select * from t1 group by a,b,c; + +-- using unique index remove useless group by columns +-- since it has less columns compare with primary key +explain (costs off) select * from t2 group by a,b,c; + +--can remove column b, since c has unique index and is not-null +explain (costs off) select count(*) from t2 group by b,c; + +--more than one candidate can be used to remove other column. +--optimizer will use primary key or unique index. +explain (costs off) select * from t3 group by a,b,c,d; + +-- Test unique not null indices have overlap. +explain (costs off) select * from t4 group by a,b,c,d; + +--can remove column d from groupby expression, since we have unique index(b,c) +explain (costs off) select count(*) from t4 group by c, d, b; + +--partition case, can reduce to a,b. +explain (costs off) select count(*) from t5 group by a, d, c,b; + +--can reduce to a,b, groupby order or having duplicates does not matter +explain (costs off) select count(*) from t5 group by b,a,a, d, c; + +---only part of unique key columns is not-null, cannot use it to remove columns. +explain(costs off) select count(*) from t6 group by b,c,a,d; + +drop table t1, t2,t3,t4,t5,t6; -- -- Test GROUP BY matching of join columns that are type-coerced due to USING -- -- 2.34.1