From 7a2ac9df4717e95475e10e660264dba550d5bee6 Mon Sep 17 00:00:00 2001
From: Ants Aasma <ants@cybertec.at>
Date: Sat, 28 Feb 2026 15:08:50 +0200
Subject: [PATCH v2] Decorrelate nested hash tables

Because hash tables are iterated in hash order, using the same hash
function in nested hash tables can lead to excessive collisions. If the
parent hash table can be the same size as the child hash tables it is
not a problem as the parent will quickly grow to the same size as the
child, eliminating further collisions. But if there is more than one
child, the collisions can cause the table to quickly grow until its
fillfactor drops below 10%.

The problem is made worse by nodeAgg devolving into building
single-entry batches once the hash table size exceeds working memory.

To hit the problem, an aggregate node without a final function above
multiple other aggregate nodes is needed. Because the hash IV is already
initialized using the parallel worker number when there is no final
function, the typical cases do not hit this problem. A hash aggregate
implementing DISTINCT above multiple parallel workers, with more groups
than fit into memory, is needed.

Initializing the hash function based on plan node id decorrelates hash
tables within a plan while still keeping behavior deterministic.

Author: Ants Aasma <ants.aasma@cybertec.at>
Discussion: https://postgr.es/m/CANwKhkPOZupu3PYQVdkMmYjquYVqG2v8XmCAuuVM9Eu13-Zw3g%40mail.gmail.com
---
 src/backend/executor/execGrouping.c | 16 +++++++++++-----
 src/backend/executor/nodeAgg.c      |  2 +-
 src/test/regress/expected/union.out |  2 +-
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/backend/executor/execGrouping.c b/src/backend/executor/execGrouping.c
index c107514a85d..bfdcaca4e7d 100644
--- a/src/backend/executor/execGrouping.c
+++ b/src/backend/executor/execGrouping.c
@@ -199,6 +199,10 @@ BuildTupleHashTable(PlanState *parent,
 	TupleHashTable hashtable;
 	uint32		nbuckets;
 	MemoryContext oldcontext;
+	/*
+	 * The value here must match what ExecInitSubPlan passes to
+	 * ExecBuildHash32FromAttrs.
+	 */
 	uint32		hash_iv = 0;
 
 	/*
@@ -240,13 +244,15 @@ BuildTupleHashTable(PlanState *parent,
 	/*
 	 * If parallelism is in use, even if the leader backend is performing the
 	 * scan itself, we don't want to create the hashtable exactly the same way
-	 * in all workers. As hashtables are iterated over in keyspace-order,
-	 * doing so in all processes in the same way is likely to lead to
-	 * "unbalanced" hashtables when the table size initially is
-	 * underestimated.
+	 * in all workers. As hashtables are iterated over in hash value order,
+	 * inserting into a smaller table or from multiple tables is going to
+	 * cause collisions which triggers hash table growth. In hash aggregates
+	 * a too large hash table causes excessive spilling as there is no memory
+	 * left over for tuples.
 	 */
 	if (use_variable_hash_iv)
-		hash_iv = murmurhash32(ParallelWorkerNumber);
+		hash_iv = hash_combine(murmurhash32(ParallelWorkerNumber),
+							   murmurhash32(parent->plan->plan_node_id));
 
 	hashtable->hashtab = tuplehash_create(metacxt, nbuckets, hashtable);
 
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 7d487a165fa..5c947fd151f 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -1535,7 +1535,7 @@ build_hash_table(AggState *aggstate, int setno, double nbuckets)
 											 metacxt,
 											 tuplescxt,
 											 tmpcxt,
-											 DO_AGGSPLIT_SKIPFINAL(aggstate->aggsplit));
+											 true);
 }
 
 /*
diff --git a/src/test/regress/expected/union.out b/src/test/regress/expected/union.out
index 709c85f2294..fad22c0ed2d 100644
--- a/src/test/regress/expected/union.out
+++ b/src/test/regress/expected/union.out
@@ -551,8 +551,8 @@ select x from (values (array[1, 2]), (array[1, 3])) _(x) union select x from (va
    x   
 -------
  {1,4}
- {1,2}
  {1,3}
+ {1,2}
 (3 rows)
 
 explain (costs off)
-- 
2.51.0

