From c824159493b5caf27db9fe0ecb0f69a5e25f7713 Mon Sep 17 00:00:00 2001
From: "Andrei V. Lepikhov" <lepihov@gmail.com>
Date: Thu, 10 Apr 2025 14:16:44 +0200
Subject: [PATCH] Be more conventional estimating hash join bucket size.

When implementing the estimation of hash join bucket size via extended
statistics, we wanted to avoid using add_unique_group_var because its
logic is too specific to the group-count estimation routine. But we
forgot that this routine also does other work, such as removing
duplicates from the list of estimated clauses.

With this commit, we make add_unique_group_var a little more general,
which allows it to be used during bucket-size estimation as well.
---
 src/backend/utils/adt/selfuncs.c        | 35 +++++++++++++------------
 src/test/regress/expected/stats_ext.out | 16 +++++++++++
 src/test/regress/sql/stats_ext.sql      |  8 ++++++
 3 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 588d991fa57..8016c4dd3b0 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -3313,16 +3313,12 @@ typedef struct
 } GroupVarInfo;
 
 static List *
-add_unique_group_var(PlannerInfo *root, List *varinfos,
-					 Node *var, VariableStatData *vardata)
+add_unique_group_var(PlannerInfo *root, List *varinfos, Node *var,
+					 RelOptInfo *rel, double ndistinct, bool isdefault)
 {
 	GroupVarInfo *varinfo;
-	double		ndistinct;
-	bool		isdefault;
 	ListCell   *lc;
 
-	ndistinct = get_variable_numdistinct(vardata, &isdefault);
-
 	/*
 	 * The nullingrels bits within the var could cause the same var to be
 	 * counted multiple times if it's marked with different nullingrels.  They
@@ -3345,7 +3341,7 @@ add_unique_group_var(PlannerInfo *root, List *varinfos,
 		 * relations (see comments for estimate_num_groups).  We aren't too
 		 * fussy about the semantics of "equal" here.
 		 */
-		if (vardata->rel != varinfo->rel &&
+		if (rel != varinfo->rel &&
 			exprs_known_equal(root, var, varinfo->var, InvalidOid))
 		{
 			if (varinfo->ndistinct <= ndistinct)
@@ -3364,7 +3360,7 @@ add_unique_group_var(PlannerInfo *root, List *varinfos,
 	varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));
 
 	varinfo->var = var;
-	varinfo->rel = vardata->rel;
+	varinfo->rel = rel;
 	varinfo->ndistinct = ndistinct;
 	varinfo->isdefault = isdefault;
 	varinfos = lappend(varinfos, varinfo);
@@ -3532,8 +3528,12 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 		examine_variable(root, groupexpr, 0, &vardata);
 		if (HeapTupleIsValid(vardata.statsTuple) || vardata.isunique)
 		{
-			varinfos = add_unique_group_var(root, varinfos,
-											groupexpr, &vardata);
+			double	ndistinct;
+			bool	isdefault;
+
+			ndistinct = get_variable_numdistinct(&vardata, &isdefault);
+			varinfos = add_unique_group_var(root, varinfos, groupexpr,
+											vardata.rel, ndistinct, isdefault);
 			ReleaseVariableStats(vardata);
 			continue;
 		}
@@ -3569,9 +3569,13 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 		foreach(l2, varshere)
 		{
 			Node	   *var = (Node *) lfirst(l2);
+			double		ndistinct;
+			bool		isdefault;
 
 			examine_variable(root, var, 0, &vardata);
-			varinfos = add_unique_group_var(root, varinfos, var, &vardata);
+			ndistinct = get_variable_numdistinct(&vardata, &isdefault);
+			varinfos = add_unique_group_var(root, varinfos, var, vardata.rel,
+											ndistinct, isdefault);
 			ReleaseVariableStats(vardata);
 		}
 	}
@@ -3880,12 +3884,9 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
 					 */
 					continue;
 
-				varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));
-				varinfo->var = expr;
-				varinfo->rel = root->simple_rel_array[relid];
-				varinfo->ndistinct = 0.0;
-				varinfo->isdefault = false;
-				varinfos = lappend(varinfos, varinfo);
+				varinfos = add_unique_group_var(root, varinfos, expr,
+												root->simple_rel_array[relid],
+												0.0, false);
 
 				/*
 				 * Remember the link to RestrictInfo for the case the clause
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index 686d8c93aa8..472ef0e44ae 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -3427,4 +3427,20 @@ SELECT * FROM sb_1 a, sb_2 b WHERE a.x = b.x AND a.y = b.y AND a.z = b.z;
          ->  Seq Scan on sb_2 b
 (5 rows)
 
+-- Check that the Hash Join bucket size estimator detects equal clauses correctly.
+SET enable_nestloop = 'off';
+SET enable_mergejoin = 'off';
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2 ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x);
+                       QUERY PLAN                       
+--------------------------------------------------------
+ Hash Left Join
+   Hash Cond: ((sb_1.x = sb_2.x) AND (sb_1.x = sb_2.x))
+   ->  Seq Scan on sb_1
+   ->  Hash
+         ->  Seq Scan on sb_2
+(5 rows)
+
+RESET enable_nestloop;
+RESET enable_mergejoin;
 DROP TABLE sb_1, sb_2 CASCADE;
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index b71a6cd089f..001ec72026a 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -1747,4 +1747,12 @@ ANALYZE sb_2;
 EXPLAIN (COSTS OFF) -- Choose hash join
 SELECT * FROM sb_1 a, sb_2 b WHERE a.x = b.x AND a.y = b.y AND a.z = b.z;
 
+-- Check that the Hash Join bucket size estimator detects equal clauses correctly.
+SET enable_nestloop = 'off';
+SET enable_mergejoin = 'off';
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2 ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x);
+RESET enable_nestloop;
+RESET enable_mergejoin;
+
 DROP TABLE sb_1, sb_2 CASCADE;
-- 
2.39.5

