From 2df9c47f188ed1a8ae8b70865c6a5ddff6dbe766 Mon Sep 17 00:00:00 2001
From: "Andrei V. Lepikhov" <lepihov@gmail.com>
Date: Thu, 10 Apr 2025 14:16:44 +0200
Subject: [PATCH v0] Be more conventional in estimating hash join bucket size.

When implementing the estimation of hash join bucket size with extended
statistics, we wanted to avoid using add_unique_group_var because its logic
depends heavily on the group-number estimation code. But we forgot that this
routine also does other things, such as removing duplicates from the list of
estimated clauses.
With this commit, we make add_unique_group_var a little more general, which
allows us to use it during bucket-size estimation. First, it no longer needs a
VariableStatData argument, instead receiving the RelOptInfo reference and the
ndistinct and isdefault values directly. Also, GroupVarInfo now tracks the
original restriction — a stopgap that simplifies detecting clauses that have
not yet been estimated.
---
 src/backend/utils/adt/selfuncs.c        | 77 +++++++++++--------------
 src/test/regress/expected/stats_ext.out | 28 +++++++++
 src/test/regress/sql/stats_ext.sql      | 11 ++++
 3 files changed, 72 insertions(+), 44 deletions(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 588d991fa57..b88a6bf5822 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -3310,19 +3310,17 @@ typedef struct
 	RelOptInfo *rel;			/* relation it belongs to */
 	double		ndistinct;		/* # distinct values */
 	bool		isdefault;		/* true if DEFAULT_NUM_DISTINCT was used */
+	RestrictInfo *source;		/* Source restriction, for reliable identification */
 } GroupVarInfo;
 
 static List *
-add_unique_group_var(PlannerInfo *root, List *varinfos,
-					 Node *var, VariableStatData *vardata)
+add_unique_group_var(PlannerInfo *root, List *varinfos, Node *var,
+					 RelOptInfo *rel, double ndistinct, bool isdefault,
+					 RestrictInfo *source)
 {
 	GroupVarInfo *varinfo;
-	double		ndistinct;
-	bool		isdefault;
 	ListCell   *lc;
 
-	ndistinct = get_variable_numdistinct(vardata, &isdefault);
-
 	/*
 	 * The nullingrels bits within the var could cause the same var to be
 	 * counted multiple times if it's marked with different nullingrels.  They
@@ -3345,7 +3343,7 @@ add_unique_group_var(PlannerInfo *root, List *varinfos,
 		 * relations (see comments for estimate_num_groups).  We aren't too
 		 * fussy about the semantics of "equal" here.
 		 */
-		if (vardata->rel != varinfo->rel &&
+		if (rel != varinfo->rel &&
 			exprs_known_equal(root, var, varinfo->var, InvalidOid))
 		{
 			if (varinfo->ndistinct <= ndistinct)
@@ -3364,9 +3362,10 @@ add_unique_group_var(PlannerInfo *root, List *varinfos,
 	varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));
 
 	varinfo->var = var;
-	varinfo->rel = vardata->rel;
+	varinfo->rel = rel;
 	varinfo->ndistinct = ndistinct;
 	varinfo->isdefault = isdefault;
+	varinfo->source = source;
 	varinfos = lappend(varinfos, varinfo);
 	return varinfos;
 }
@@ -3532,8 +3531,13 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 		examine_variable(root, groupexpr, 0, &vardata);
 		if (HeapTupleIsValid(vardata.statsTuple) || vardata.isunique)
 		{
-			varinfos = add_unique_group_var(root, varinfos,
-											groupexpr, &vardata);
+			double	ndistinct;
+			bool	isdefault;
+
+			ndistinct = get_variable_numdistinct(&vardata, &isdefault);
+			varinfos = add_unique_group_var(root, varinfos, groupexpr,
+											vardata.rel, ndistinct, isdefault,
+											NULL);
 			ReleaseVariableStats(vardata);
 			continue;
 		}
@@ -3569,9 +3573,13 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 		foreach(l2, varshere)
 		{
 			Node	   *var = (Node *) lfirst(l2);
+			double		ndistinct;
+			bool		isdefault;
 
 			examine_variable(root, var, 0, &vardata);
-			varinfos = add_unique_group_var(root, varinfos, var, &vardata);
+			ndistinct = get_variable_numdistinct(&vardata, &isdefault);
+			varinfos = add_unique_group_var(root, varinfos, var, vardata.rel,
+											ndistinct, isdefault, NULL);
 			ReleaseVariableStats(vardata);
 		}
 	}
@@ -3816,13 +3824,9 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
 		ListCell   *lc;
 		int			relid = -1;
 		List	   *varinfos = NIL;
-		List	   *origin_rinfos = NIL;
 		double		mvndistinct;
-		List	   *origin_varinfos;
 		int			group_relid = -1;
 		RelOptInfo *group_rel = NULL;
-		ListCell   *lc1,
-				   *lc2;
 
 		/*
 		 * Find clauses, referencing the same single base relation and try to
@@ -3835,7 +3839,6 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
 			RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc);
 			Node	   *expr;
 			Relids		relids;
-			GroupVarInfo *varinfo;
 
 			/*
 			 * Find the inner side of the join, which we need to estimate the
@@ -3880,18 +3883,9 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
 					 */
 					continue;
 
-				varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));
-				varinfo->var = expr;
-				varinfo->rel = root->simple_rel_array[relid];
-				varinfo->ndistinct = 0.0;
-				varinfo->isdefault = false;
-				varinfos = lappend(varinfos, varinfo);
-
-				/*
-				 * Remember the link to RestrictInfo for the case the clause
-				 * is failed to be estimated.
-				 */
-				origin_rinfos = lappend(origin_rinfos, rinfo);
+				varinfos = add_unique_group_var(root, varinfos, expr,
+												root->simple_rel_array[relid],
+												0.0, false, rinfo);
 			}
 			else
 				/* This clause can't be estimated with extended statistics */
@@ -3906,22 +3900,23 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
 			 * Multivariate statistics doesn't apply to single columns except
 			 * for expressions, but it has not been implemented yet.
 			 */
-			otherclauses = list_concat(otherclauses, origin_rinfos);
+
+			if (varinfos != NULL)
+				otherclauses = lappend(otherclauses,
+								((GroupVarInfo *) linitial(varinfos))->source);
 			list_free_deep(varinfos);
-			list_free(origin_rinfos);
 			continue;
 		}
 
 		Assert(group_rel != NULL);
 
 		/* Employ the extended statistics. */
-		origin_varinfos = varinfos;
 		for (;;)
 		{
-			bool		estimated = estimate_multivariate_ndistinct(root,
-																	group_rel,
-																	&varinfos,
-																	&mvndistinct);
+			bool	estimated = estimate_multivariate_ndistinct(root,
+																group_rel,
+																&varinfos,
+																&mvndistinct);
 
 			if (!estimated)
 				break;
@@ -3936,19 +3931,13 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
 			Assert(ndistinct >= 1.0);
 		}
 
-		Assert(list_length(origin_varinfos) == list_length(origin_rinfos));
-
 		/* Collect unmatched clauses as otherclauses. */
-		forboth(lc1, origin_varinfos, lc2, origin_rinfos)
+		foreach(lc, varinfos)
 		{
-			GroupVarInfo *vinfo = lfirst(lc1);
-
-			if (!list_member_ptr(varinfos, vinfo))
-				/* Already estimated */
-				continue;
+			GroupVarInfo *varinfo = lfirst(lc);
 
 			/* Can't be estimated here - push to the returning list */
-			otherclauses = lappend(otherclauses, lfirst(lc2));
+			otherclauses = lappend(otherclauses, varinfo->source);
 		}
 	}
 
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index 686d8c93aa8..6359e5fb689 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -3427,4 +3427,32 @@ SELECT * FROM sb_1 a, sb_2 b WHERE a.x = b.x AND a.y = b.y AND a.z = b.z;
          ->  Seq Scan on sb_2 b
 (5 rows)
 
+-- Check that the Hash Join bucket size estimator detects equal clauses correctly.
+SET enable_nestloop = 'off';
+SET enable_mergejoin = 'off';
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2 ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x);
+                       QUERY PLAN                       
+--------------------------------------------------------
+ Hash Left Join
+   Hash Cond: ((sb_1.x = sb_2.x) AND (sb_1.x = sb_2.x))
+   ->  Seq Scan on sb_1
+   ->  Hash
+         ->  Seq Scan on sb_2
+(5 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2
+   ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x) AND (sb_1.y=sb_2.y);
+                                  QUERY PLAN                                  
+------------------------------------------------------------------------------
+ Hash Left Join
+   Hash Cond: ((sb_1.x = sb_2.x) AND (sb_1.y = sb_2.y) AND (sb_1.x = sb_2.x))
+   ->  Seq Scan on sb_1
+   ->  Hash
+         ->  Seq Scan on sb_2
+(5 rows)
+
+RESET enable_nestloop;
+RESET enable_mergejoin;
 DROP TABLE sb_1, sb_2 CASCADE;
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index b71a6cd089f..da4f2fe9c93 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -1747,4 +1747,15 @@ ANALYZE sb_2;
 EXPLAIN (COSTS OFF) -- Choose hash join
 SELECT * FROM sb_1 a, sb_2 b WHERE a.x = b.x AND a.y = b.y AND a.z = b.z;
 
+-- Check that the Hash Join bucket size estimator detects equal clauses correctly.
+SET enable_nestloop = 'off';
+SET enable_mergejoin = 'off';
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2 ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x);
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2
+   ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x) AND (sb_1.y=sb_2.y);
+RESET enable_nestloop;
+RESET enable_mergejoin;
+
 DROP TABLE sb_1, sb_2 CASCADE;
-- 
2.39.5

