From 6b6689038de51fc7c56f710f9b613c1998ea2bd1 Mon Sep 17 00:00:00 2001
From: jian he <jian.universality@gmail.com>
Date: Fri, 22 Nov 2024 21:08:59 +0800
Subject: [PATCH v2 1/1] remove useless group by columns via unique index

In remove_useless_groupby_columns, if a primary key exists and is a subset of
the GROUP BY columns, we remove the non-primary-key columns from the GROUP BY
clause.  If no primary key is available, we instead try to use columns that are
covered by a unique index and are not-null to remove other useless columns.

discussion: https://postgr.es/m/327990c8-b9b2-4b0c-bffb-462249f82de0@Spark
---
 src/backend/catalog/index.c              | 99 ++++++++++++++++++++++++
 src/backend/optimizer/plan/planner.c     | 66 +++++++++++++++-
 src/include/catalog/index.h              |  1 +
 src/test/regress/expected/aggregates.out | 75 ++++++++++++++++++
 src/test/regress/sql/aggregates.sql      | 39 ++++++++++
 5 files changed, 278 insertions(+), 2 deletions(-)

diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index f9bb721c5f..ad97925278 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -4257,3 +4257,102 @@ RestoreReindexState(const void *reindexstate)
 	/* Note the worker has its own transaction nesting level */
 	reindexingNestLevel = GetCurrentTransactionNestLevel();
 }
+
+/*
+ * get_unique_not_null_attnos
+ *		Return a List of Bitmapsets, one per usable unique index on 'relid'.
+ *
+ * Each Bitmapset holds the index's key attnums, offset by
+ * FirstLowInvalidHeapAttributeNumber.  An index is usable only if it is
+ * unique, valid, immediate, not the primary key, has no predicate or
+ * expression columns, and all its key columns are marked not-null.
+ */
+List *
+get_unique_not_null_attnos(Oid relid)
+{
+	List	   *not_null_attnos = NIL;
+	List	   *not_null_cs;
+	List	   *result = NIL;
+	Relation	pg_index;
+	HeapTuple	indexTuple;
+	SysScanDesc scan;
+	ScanKeyData skey;
+
+	/* Use cooked constraints to get attnums; none means nothing qualifies */
+	not_null_cs = RelationGetNotNullConstraints(relid, true, false);
+	if (not_null_cs == NIL)
+		return NIL;
+
+	foreach_ptr(CookedConstraint, cc, not_null_cs)
+		not_null_attnos = lappend_int(not_null_attnos, cc->attnum);
+
+	/* Scan pg_index for unique indexes of the target rel */
+	pg_index = table_open(IndexRelationId, AccessShareLock);
+
+	ScanKeyInit(&skey,
+				Anum_pg_index_indrelid,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(relid));
+	scan = systable_beginscan(pg_index, IndexIndrelidIndexId, true,
+							  NULL, 1, &skey);
+
+	while (HeapTupleIsValid(indexTuple = systable_getnext(scan)))
+	{
+		Form_pg_index index = (Form_pg_index) GETSTRUCT(indexTuple);
+		Bitmapset  *keyattnos = NULL;
+		bool		all_notnull = true;
+		Datum		adatum;
+		bool		isNull;
+		Datum	   *keys;
+		int			nKeys;
+
+		/* We're only interested in unique indexes other than the primary key */
+		if (!index->indisunique || index->indisprimary)
+			continue;
+
+		/* Skip invalid, exclusion or deferred indexes */
+		if (!index->indisvalid || index->indisexclusion || !index->indimmediate)
+			continue;
+
+		/* Skip partial (predicate) indexes and expression indexes */
+		if (!heap_attisnull(indexTuple, Anum_pg_index_indpred, NULL) ||
+			!heap_attisnull(indexTuple, Anum_pg_index_indexprs, NULL))
+			continue;
+
+		/* Extract the pg_index->indkey int2vector */
+		adatum = heap_getattr(indexTuple, Anum_pg_index_indkey,
+							  RelationGetDescr(pg_index), &isNull);
+		if (isNull)
+			elog(ERROR, "null int2vector for index %u", index->indexrelid);
+
+		deconstruct_array_builtin(DatumGetArrayTypeP(adatum), INT2OID,
+								  &keys, NULL, &nKeys);
+		if (nKeys != index->indnatts)
+			elog(ERROR, "corrupted int2vector for index %u", index->indexrelid);
+		Assert(nKeys >= index->indnkeyatts);
+
+		/* Only the first indnkeyatts entries are key columns */
+		for (int i = 0; i < index->indnkeyatts; i++)
+		{
+			AttrNumber	attno = DatumGetInt16(keys[i]);
+
+			/* A key column without not-null disqualifies the whole index */
+			if (!list_member_int(not_null_attnos, attno))
+			{
+				all_notnull = false;
+				break;
+			}
+			keyattnos = bms_add_member(keyattnos,
+									   attno - FirstLowInvalidHeapAttributeNumber);
+		}
+
+		/* Never report a partially-collected key set as a unique key */
+		if (all_notnull && keyattnos != NULL)
+			result = lappend(result, keyattnos);
+		else
+			bms_free(keyattnos);
+	}
+	systable_endscan(scan);
+	table_close(pg_index, AccessShareLock);
+	return result;
+}
\ No newline at end of file
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 1f78dc3d53..4341ac9aae 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -27,6 +27,7 @@
 #include "catalog/pg_inherits.h"
 #include "catalog/pg_proc.h"
 #include "catalog/pg_type.h"
+#include "catalog/index.h"
 #include "executor/executor.h"
 #include "foreign/fdwapi.h"
 #include "jit/jit.h"
@@ -2797,6 +2798,9 @@ remove_useless_groupby_columns(PlannerInfo *root)
 		Bitmapset  *relattnos;
 		Bitmapset  *pkattnos;
 		Oid			constraintOid;
+		Bitmapset  	*attnums = NULL;
+		List  		*unique_notnull_attnos = NIL;
+		List	   	*matched = NIL;
 
 		relid++;
 
@@ -2819,17 +2823,67 @@ remove_useless_groupby_columns(PlannerInfo *root)
 
 		/*
 		 * Can't remove any columns for this rel if there is no suitable
-		 * (i.e., nondeferrable) primary key constraint.
+		 * (i.e., nondeferrable) primary key constraint.  However, if some
+		 * columns have both a unique index and not-null constraints, we can
+		 * still remove other columns.  If a primary key exists, we don't look
+		 * for unique not-null columns.
 		 */
 		pkattnos = get_primary_key_attnos(rte->relid, false, &constraintOid);
 		if (pkattnos == NULL)
+		{
+			/* get unique not null index */
+			unique_notnull_attnos = get_unique_not_null_attnos(rte->relid);
+			if (list_length(unique_notnull_attnos) > 0)
+			{
+				int		out_num,
+						inner_num;
+
+				foreach_node(Bitmapset, attnos, unique_notnull_attnos)
+				{
+					Assert(attnos != NULL);
+
+					if (bms_subset_compare(attnos, relattnos) != BMS_SUBSET1)
+						continue;
+					else
+						matched = lappend(matched, attnos);
+				}
+
+				/*
+				 * Several unique not-null key sets may be subsets of the GROUP
+				 * BY columns.  Prefer the one with the fewest members; on a tie,
+				 * keep the first.  We don't use foreach here because we delete
+				 * list cells while iterating.
+				 */
+				for (int outerpos = 0; outerpos < list_length(matched); outerpos++)
+				{
+					Bitmapset *bt;
+					bt = (Bitmapset *) list_nth(matched, outerpos);
+
+					out_num = bms_num_members(bt);
+					for (int restpos = outerpos + 1; restpos < list_length(matched);)
+					{
+						Bitmapset *other;
+						other = (Bitmapset *) list_nth(matched, restpos);
+						inner_num =	bms_num_members(other);
+						if (inner_num > out_num)
+							matched = list_delete_nth_cell(matched, restpos);
+						else if (inner_num < out_num)
+							matched = list_delete_nth_cell(matched, outerpos);
+						restpos++;
+					}
+				}
+				if (list_length(matched) > 0)
+					attnums = (Bitmapset *) list_nth(matched, 0);
+			}
+		}
+		if ((pkattnos == NULL && attnums == NULL))
 			continue;
 
 		/*
 		 * If the primary key is a proper subset of relattnos then we have
 		 * some items in the GROUP BY that can be removed.
 		 */
-		if (bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1)
+		if (pkattnos != NULL && bms_subset_compare(pkattnos, relattnos) == BMS_SUBSET1)
 		{
 			/*
 			 * To easily remember whether we've found anything to do, we don't
@@ -2842,6 +2896,14 @@ remove_useless_groupby_columns(PlannerInfo *root)
 			/* Remember the attnos of the removable columns */
 			surplusvars[relid] = bms_difference(relattnos, pkattnos);
 		}
+		else if(attnums != NULL)
+		{
+			if (surplusvars == NULL)
+				surplusvars = (Bitmapset **) palloc0(sizeof(Bitmapset *) *
+													 (list_length(parse->rtable) + 1));
+
+			surplusvars[relid] = bms_difference(relattnos, attnums);
+		}
 	}
 
 	/*
diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h
index 2dea96f47c..dacc0f6ebd 100644
--- a/src/include/catalog/index.h
+++ b/src/include/catalog/index.h
@@ -175,6 +175,7 @@ extern void RestoreReindexState(const void *reindexstate);
 
 extern void IndexSetParentIndex(Relation partitionIdx, Oid parentOid);
 
+extern List *get_unique_not_null_attnos(Oid relid);
 
 /*
  * itemptr_encode - Encode ItemPointer as int64/int8
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
index 1e682565d1..9bd91c4f38 100644
--- a/src/test/regress/expected/aggregates.out
+++ b/src/test/regress/expected/aggregates.out
@@ -1454,6 +1454,81 @@ drop table t2;
 drop table t3;
 drop table p_t1;
 --
+-- Test removal of redundant GROUP BY columns using unique not null index.
+--
+create temp table t1 (a int, b int, c int, unique(c));
+create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c));
+create temp table t3 (a int, b int, c int not null, d int not null, primary key (a, b), unique(c, d));
+create temp table t4 (a int not null, b int not null,
+                      c int not null, d int not null,
+                      unique (a, b), unique(b, c), unique(a, c, d));
+create table t5 (a int not null, b int not null,
+                      c int not null, d int not null,
+                      unique (a, b)) partition by range(a, b);
+create table t5_0 (like t5 including all);
+create table t5_1 (like t5 including all);
+ALTER TABLE t5 ATTACH PARTITION t5_0 FOR VALUES FROM (0,0) TO (10, 10);
+ALTER TABLE t5 ATTACH PARTITION t5_1 FOR VALUES FROM (10,10) TO (21, 21);
+insert into t5 select g,g+1, g+2, g+3 from generate_series(1, 20) g;
+-- Test unique index without not null constraint should not be used.
+explain (costs off) select * from t1 group by a,b,c;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: a, b, c
+   ->  Seq Scan on t1
+(3 rows)
+
+-- both unique not null index and primary key is there,
+-- using primary key to remove useless group by columns
+explain (costs off) select * from t2 group by a,b,c;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: a, b
+   ->  Seq Scan on t2
+(3 rows)
+
+-- Test primary key beats unique not null index.
+explain (costs off) select * from t3 group by a,b,c,d;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: a, b
+   ->  Seq Scan on t3
+(3 rows)
+
+-- Test unique not null indices have overlap.
+explain (costs off) select * from t4 group by a,b,c,d;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: a, b
+   ->  Seq Scan on t4
+(3 rows)
+
+--can remove column d from groupby expression, since we have unique index(b,c)
+explain (costs off) select count(*) from t4 group by c, d, b;
+      QUERY PLAN      
+----------------------
+ HashAggregate
+   Group Key: c, b
+   ->  Seq Scan on t4
+(3 rows)
+
+--partition case, can reduce to a,b.
+explain (costs off) select count(*) from t5 group by a, d, c,b;
+            QUERY PLAN             
+-----------------------------------
+ HashAggregate
+   Group Key: t5.a, t5.b
+   ->  Append
+         ->  Seq Scan on t5_0 t5_1
+         ->  Seq Scan on t5_1 t5_2
+(5 rows)
+
+drop table t1, t2,t3,t4,t5;
+--
 -- Test GROUP BY matching of join columns that are type-coerced due to USING
 --
 create temp table t1(f1 int, f2 int);
diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql
index 4885daffe6..d07c599a04 100644
--- a/src/test/regress/sql/aggregates.sql
+++ b/src/test/regress/sql/aggregates.sql
@@ -512,6 +512,45 @@ drop table t2;
 drop table t3;
 drop table p_t1;
 
+--
+-- Test removal of redundant GROUP BY columns using unique not null index.
+--
+create temp table t1 (a int, b int, c int, unique(c));
+create temp table t2 (a int, b int, c int not null, primary key (a, b), unique(c));
+create temp table t3 (a int, b int, c int not null, d int not null, primary key (a, b), unique(c, d));
+create temp table t4 (a int not null, b int not null,
+                      c int not null, d int not null,
+                      unique (a, b), unique(b, c), unique(a, c, d));
+
+create table t5 (a int not null, b int not null,
+                      c int not null, d int not null,
+                      unique (a, b)) partition by range(a, b);
+create table t5_0 (like t5 including all);
+create table t5_1 (like t5 including all);
+ALTER TABLE t5 ATTACH PARTITION t5_0 FOR VALUES FROM (0,0) TO (10, 10);
+ALTER TABLE t5 ATTACH PARTITION t5_1 FOR VALUES FROM (10,10) TO (21, 21);
+insert into t5 select g,g+1, g+2, g+3 from generate_series(1, 20) g;
+
+-- Test unique index without not null constraint should not be used.
+explain (costs off) select * from t1 group by a,b,c;
+
+-- both unique not null index and primary key is there,
+-- using primary key to remove useless group by columns
+explain (costs off) select * from t2 group by a,b,c;
+
+-- Test primary key beats unique not null index.
+explain (costs off) select * from t3 group by a,b,c,d;
+
+-- Test unique not null indices have overlap.
+explain (costs off) select * from t4 group by a,b,c,d;
+
+--can remove column d from groupby expression, since we have unique index(b,c)
+explain (costs off) select count(*) from t4 group by c, d, b;
+
+--partition case, can reduce to a,b.
+explain (costs off) select count(*) from t5 group by a, d, c,b;
+
+drop table t1, t2,t3,t4,t5;
 --
 -- Test GROUP BY matching of join columns that are type-coerced due to USING
 --
-- 
2.34.1