From d585cf0193a700501a153f75f653aa953e8a9d02 Mon Sep 17 00:00:00 2001
From: jian he <jian.universality@gmail.com>
Date: Thu, 28 Nov 2024 12:37:22 +0800
Subject: [PATCH v4 1/1] remove useless group by columns via unique-not-null

In the `remove_useless_groupby_columns` function,
use a unique index and a not-null constraint to eliminate unnecessary columns.
Here, the primary key is treated as a "unique index and not-null".
If there are multiple candidates,
use the unique index with the fewest columns to remove the other columns
in groupby expression.

discussion: https://postgr.es/m/327990c8-b9b2-4b0c-bffb-462249f82de0@Spar
---
 src/backend/catalog/index.c              | 101 +++++++++++++++++++++
 src/backend/optimizer/plan/planner.c     |  66 +++++++++-----
 src/include/catalog/index.h              |   1 +
 src/test/regress/expected/aggregates.out | 106 +++++++++++++++++++++++
 src/test/regress/sql/aggregates.sql      |  51 +++++++++++
 5 files changed, 304 insertions(+), 21 deletions(-)

diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index f9bb721c5f..841522ab4a 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -4257,3 +4257,104 @@ RestoreReindexState(const void *reindexstate)
 	/* Note the worker has its own transaction nesting level */
 	reindexingNestLevel = GetCurrentTransactionNestLevel();
 }
+
+/*
+ * given an input relation oid, return a list of Bitmapset(source:
+ * pg_index.indkey) that has a unique index *and* not-null constraint associated
+ * with it. note this includes the primary key. returned pk_pos is primary key's
+ * attnums index within the returned List
+*/
+List *
+get_unique_not_null_attnos(Oid relid, int *pk_pos)
+{
+	Bitmapset	*not_null_attnos = NULL;
+	Relation	pg_index;
+	HeapTuple	indexTuple;
+	SysScanDesc scan;
+	ScanKeyData skey;
+	List 		*not_null_cs;
+	List	   *tlist = NIL;
+	int			attno;
+
+	/* Use CookedConstraint to fetch not-null constraint. */
+	not_null_cs = RelationGetNotNullConstraints(relid, true, false);
+	if (not_null_cs == NIL)
+		return NULL;
+
+	foreach_ptr(CookedConstraint, cc, not_null_cs)
+		not_null_attnos = bms_add_member(not_null_attnos, cc->attnum - FirstLowInvalidHeapAttributeNumber);
+
+	/* Scan pg_index for unique index of the target rel */
+	pg_index = table_open(IndexRelationId, AccessShareLock);
+
+	ScanKeyInit(&skey,
+				Anum_pg_index_indrelid,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(relid));
+	scan = systable_beginscan(pg_index, IndexIndrelidIndexId, true,
+							  NULL, 1, &skey);
+
+	while (HeapTupleIsValid(indexTuple = systable_getnext(scan)))
+	{
+		Bitmapset  	*unique_notnull_attnos = NULL;
+		Form_pg_index index = (Form_pg_index) GETSTRUCT(indexTuple);
+		Datum		adatum;
+		bool		isNull;
+		bool		all_notnull = true;
+		int			numkeys;
+		Datum	   *keys;
+		int			nKeys,
+					pos = 0;
+
+		/* we're only interested in unique index */
+		if (!index->indisunique)
+			continue;
+
+		/* Skip invalid or deferred index */
+		if (!index->indisvalid || !index->indimmediate)
+			continue;
+
+		/* Skip expression index or predicate index */
+		if (!heap_attisnull(indexTuple, Anum_pg_index_indpred, NULL) ||
+			!heap_attisnull(indexTuple, Anum_pg_index_indexprs, NULL))
+			continue;
+
+		/* Extract the pg_index->indkey int2vector */
+		adatum = heap_getattr(indexTuple, Anum_pg_index_indkey,
+							  RelationGetDescr(pg_index), &isNull);
+		if (isNull)
+			elog(ERROR, "null int2vector for index %u", index->indexrelid);
+
+		deconstruct_array_builtin(DatumGetArrayTypeP(adatum), INT2OID,
+									&keys, NULL, &nKeys);
+
+		if(nKeys != index->indnatts)
+			elog(ERROR, "corrupted int2vector for index %u", index->indexrelid);
+
+		Assert(nKeys >= index->indnkeyatts);
+		numkeys = index->indnkeyatts;
+
+		for (int i = 0; i < numkeys; i++)
+		{
+			/* Skip if any of unique key's attnum don't have not-null */
+			attno = DatumGetInt16(keys[i]) - FirstLowInvalidHeapAttributeNumber;
+			if (!bms_is_member(attno, not_null_attnos))
+			{
+				all_notnull = false;
+				break;
+			}
+			unique_notnull_attnos = bms_add_member(unique_notnull_attnos, attno);
+		}
+		if (all_notnull)
+		{
+			if (index->indisprimary)
+				*pk_pos = pos;
+			pos++;
+			tlist = lappend(tlist, unique_notnull_attnos);
+		}
+	}
+	systable_endscan(scan);
+
+	table_close(pg_index, AccessShareLock);
+	return tlist;
+}
\ No newline at end of file
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index b665a7762e..2990e0412f 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -27,6 +27,7 @@
 #include "catalog/pg_inherits.h"
 #include "catalog/pg_proc.h"
 #include "catalog/pg_type.h"
+#include "catalog/index.h"
 #include "executor/executor.h"
 #include "foreign/fdwapi.h"
 #include "jit/jit.h"
@@ -2805,8 +2806,10 @@ remove_useless_groupby_columns(PlannerInfo *root)
 	{
 		RangeTblEntry *rte = lfirst_node(RangeTblEntry, lc);
 		Bitmapset  *relattnos;
-		Bitmapset  *pkattnos;
-		Oid			constraintOid;
+		List  		*unique_notnull_attnos = NIL;
+		List	   	*matched = NIL;
+		Bitmapset 	*best_attnos = NULL;
+		int			pk_pos = -1;
 
 		relid++;
 
@@ -2828,30 +2831,51 @@ remove_useless_groupby_columns(PlannerInfo *root)
 			continue;
 
 		/*
-		 * Can't remove any columns for this rel if there is no suitable
-		 * (i.e., nondeferrable) primary key constraint.
+		 * Can't remove any columns for this rel if there is no suitable (i.e.,
+		 * nondeferrable) primary key constraint. we can remove some columns if:
+		 * at least one unique index and not-null constraint asscoiated attnums
+		 * is subsset of groupby columns.  here,primary key is treated as unique
+		 * index and not-null constraint.
 		 */
-		pkattnos = get_primary_key_attnos(rte->relid, false, &constraintOid);
-		if (pkattnos == NULL)
-			continue;
+		unique_notnull_attnos = get_unique_not_null_attnos(rte->relid, &pk_pos);
 
-		/*
-		 * If the primary key is a proper subset of relattnos then we have
-		 * some items in the GROUP BY that can be removed.
-		 */
-		if (bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1)
+		if (list_length(unique_notnull_attnos) > 0)
 		{
-			/*
-			 * To easily remember whether we've found anything to do, we don't
-			 * allocate the surplusvars[] array until we find something.
-			 */
-			if (surplusvars == NULL)
-				surplusvars = (Bitmapset **) palloc0(sizeof(Bitmapset *) *
-													 (list_length(parse->rtable) + 1));
+			foreach_node(Bitmapset, attnos, unique_notnull_attnos)
+			{
+				Assert(attnos != NULL);
+				if (bms_subset_compare(attnos, relattnos) != BMS_SUBSET1)
+					continue;
+				else
+					matched = lappend(matched, attnos);
+			}
 
-			/* Remember the attnos of the removable columns */
-			surplusvars[relid] = bms_difference(relattnos, pkattnos);
+			if (list_length(matched) == 0)
+				continue;
+
+			best_attnos = list_nth_node(Bitmapset, matched, 0);
+			for (int i = 0; i < list_length(matched); i++)
+			{
+				Bitmapset *bt = list_nth_node(Bitmapset, matched, i);
+
+				if (bms_num_members(bt) < bms_num_members(best_attnos))
+					best_attnos = bt;
+			}
 		}
+		else
+			continue;
+
+		Assert(best_attnos);
+		/*
+		 * To easily remember whether we've found anything to do, we don't
+		 * allocate the surplusvars[] array until we find something.
+		*/
+		if (surplusvars == NULL)
+			surplusvars = (Bitmapset **) palloc0(sizeof(Bitmapset *) *
+													(list_length(parse->rtable) + 1));
+
+		/* Remember the attnos of the removable columns */
+		surplusvars[relid] = bms_difference(relattnos, best_attnos);
 	}
 
 	/*
diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h
index 2dea96f47c..b171102c76 100644
--- a/src/include/catalog/index.h
+++ b/src/include/catalog/index.h
@@ -175,6 +175,7 @@ extern void RestoreReindexState(const void *reindexstate);
 
 extern void IndexSetParentIndex(Relation partitionIdx, Oid parentOid);
 
+extern List *get_unique_not_null_attnos(Oid relid, int *pk_pos);
 
 /*
  * itemptr_encode - Encode ItemPointer as int64/int8
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 1e682565d1..c60615cfbd 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -1454,6 +1454,112 @@ drop table t2;
 drop table t3;
 drop table p_t1;
 --
+-- Test removal of redundant GROUP BY columns using unique not null index.
+--
+create temp table t1 (a int, b int, c int, unique(c));
+create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c));
+create temp table t3 (a int, b int, c int not null, d int not null, primary key (a, b), unique(c, d));
+create temp table t4 (a int not null, b int not null,
+                      c int not null, d int not null,
+                      unique (a, b), unique(b, c), unique(a, c, d));
+create table t5 (a int not null, b int not null,
+                      c int not null, d int not null,
+                      unique (a, b)) partition by range(a, b);
+create table t5_0 (like t5 including all);
+create table t5_1 (like t5 including all);
+alter table  t5 attach partition t5_0 for values from (0,0) to (10, 10);
+alter table  t5 attach partition t5_1 for values from (10,10) to (21, 21);
+insert into t5 select g,g+1, g+2, g+3 from generate_series(1, 20) g;
+create temp table t6 (a int, b int not null, c int, d int, unique(b, c));
+-- Test unique index without not null constraint should not be used.
+explain (costs off) select * from t1 group by a,b,c;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: a, b, c
+   ->  Seq Scan on t1
+(3 rows)
+
+-- using unique index remove useless group by columns
+-- since it has less columns compare with primary key
+explain (costs off) select * from t2 group by a,b,c;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: c
+   ->  Seq Scan on t2
+(3 rows)
+
+--can remove column b, since c has unique index and is not-null
+explain (costs off) select count(*) from t2 group by b,c;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: c
+   ->  Seq Scan on t2
+(3 rows)
+
+--more than one candidate can be used to remove other column.
+--optimizer will use primary key or unique index.
+explain (costs off) select * from t3 group by a,b,c,d;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: a, b
+   ->  Seq Scan on t3
+(3 rows)
+
+-- Test unique not null indices have overlap.
+explain (costs off) select * from t4 group by a,b,c,d;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: a, b
+   ->  Seq Scan on t4
+(3 rows)
+
+--can remove column d from groupby expression, since we have unique index(b,c)
+explain (costs off) select count(*) from t4 group by c, d, b;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: c, b
+   ->  Seq Scan on t4
+(3 rows)
+
+--partition case, can reduce to a,b.
+explain (costs off) select count(*) from t5 group by a, d, c,b;
+            QUERY PLAN             
+-----------------------------------
+ HashAggregate
+   Group Key: t5.a, t5.b
+   ->  Append
+         ->  Seq Scan on t5_0 t5_1
+         ->  Seq Scan on t5_1 t5_2
+(5 rows)
+
+--can reduce to a,b, groupby order or having duplicates does not matter
+explain (costs off) select count(*) from t5 group by b,a,a, d, c;
+            QUERY PLAN             
+-----------------------------------
+ HashAggregate
+   Group Key: t5.b, t5.a
+   ->  Append
+         ->  Seq Scan on t5_0 t5_1
+         ->  Seq Scan on t5_1 t5_2
+(5 rows)
+
+---only part of unique key columns is not-null, cannot use it to remove columns.
+explain(costs off) select count(*) from t6 group by b,c,a,d;
+       QUERY PLAN        
+-------------------------
+ HashAggregate
+   Group Key: b, c, a, d
+   ->  Seq Scan on t6
+(3 rows)
+
+drop table t1, t2,t3,t4,t5,t6;
+--
 -- Test GROUP BY matching of join columns that are type-coerced due to USING
 --
 create temp table t1(f1 int, f2 int);
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 4885daffe6..b4b16b29a8 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -512,6 +512,57 @@ drop table t2;
 drop table t3;
 drop table p_t1;
 
+--
+-- Test removal of redundant GROUP BY columns using unique not null index.
+--
+create temp table t1 (a int, b int, c int, unique(c));
+create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c));
+create temp table t3 (a int, b int, c int not null, d int not null, primary key (a, b), unique(c, d));
+create temp table t4 (a int not null, b int not null,
+                      c int not null, d int not null,
+                      unique (a, b), unique(b, c), unique(a, c, d));
+
+create table t5 (a int not null, b int not null,
+                      c int not null, d int not null,
+                      unique (a, b)) partition by range(a, b);
+create table t5_0 (like t5 including all);
+create table t5_1 (like t5 including all);
+alter table  t5 attach partition t5_0 for values from (0,0) to (10, 10);
+alter table  t5 attach partition t5_1 for values from (10,10) to (21, 21);
+insert into t5 select g,g+1, g+2, g+3 from generate_series(1, 20) g;
+
+create temp table t6 (a int, b int not null, c int, d int, unique(b, c));
+
+-- Test unique index without not null constraint should not be used.
+explain (costs off) select * from t1 group by a,b,c;
+
+-- using unique index remove useless group by columns
+-- since it has less columns compare with primary key
+explain (costs off) select * from t2 group by a,b,c;
+
+--can remove column b, since c has unique index and is not-null
+explain (costs off) select count(*) from t2 group by b,c;
+
+--more than one candidate can be used to remove other column.
+--optimizer will use primary key or unique index.
+explain (costs off) select * from t3 group by a,b,c,d;
+
+-- Test unique not null indices have overlap.
+explain (costs off) select * from t4 group by a,b,c,d;
+
+--can remove column d from groupby expression, since we have unique index(b,c)
+explain (costs off) select count(*) from t4 group by c, d, b;
+
+--partition case, can reduce to a,b.
+explain (costs off) select count(*) from t5 group by a, d, c,b;
+
+--can reduce to a,b, groupby order or having duplicates does not matter
+explain (costs off) select count(*) from t5 group by b,a,a, d, c;
+
+---only part of unique key columns is not-null, cannot use it to remove columns.
+explain(costs off) select count(*) from t6 group by b,c,a,d;
+
+drop table t1, t2,t3,t4,t5,t6;
 --
 -- Test GROUP BY matching of join columns that are type-coerced due to USING
 --
-- 
2.34.1