From b55a84c5b7d6f86409acb87291986bf6b7714f0e Mon Sep 17 00:00:00 2001
From: "Andrei V. Lepikhov" <lepihov@gmail.com>
Date: Thu, 10 Apr 2025 14:16:44 +0200
Subject: [PATCH v1] Make the necessary preparations before estimating the hash
 join bucket size.

To estimate with extended statistics, it is necessary to clear
the varnullingrels field in the expression, and duplicates are not allowed
in the GroupVarInfo list.
---
 src/backend/utils/adt/selfuncs.c        | 26 +++++++++++++++++++++++
 src/test/regress/expected/stats_ext.out | 28 +++++++++++++++++++++++++
 src/test/regress/sql/stats_ext.sql      | 11 ++++++++++
 3 files changed, 65 insertions(+)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 588d991fa57..2fd1e177c20 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -3850,6 +3850,8 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
 			if (bms_get_singleton_member(relids, &relid) &&
 				root->simple_rel_array[relid]->statlist != NIL)
 			{
+				bool is_duplicate = false;
+
 				/*
 				 * This inner-side expression references only one relation.
 				 * Extended statistics on this clause can exist.
@@ -3880,6 +3882,30 @@ estimate_multivariate_bucketsize(PlannerInfo *root, RelOptInfo *inner,
 					 */
 					continue;
 
+				/* Clear nullingrels to correctly match hash keys */
+				expr = remove_nulling_relids(expr, root->outer_join_rels, NULL);
+
+				/*
+				 * Detect and exclude exact duplicates from the list of
+				 * hash keys (like add_unique_group_var does).
+				 */
+				foreach(lc1, varinfos)
+				{
+					varinfo = (GroupVarInfo *) lfirst(lc1);
+
+					if (!equal(expr, varinfo->var))
+						continue;
+
+					is_duplicate = true;
+					break;
+				}
+				if (is_duplicate)
+					/*
+					 * Skip exact duplicates. Adding them to the otherclauses
+					 * list also doesn't make sense.
+					 */
+					continue;
+
 				varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));
 				varinfo->var = expr;
 				varinfo->rel = root->simple_rel_array[relid];
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index 686d8c93aa8..6359e5fb689 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -3427,4 +3427,32 @@ SELECT * FROM sb_1 a, sb_2 b WHERE a.x = b.x AND a.y = b.y AND a.z = b.z;
          ->  Seq Scan on sb_2 b
 (5 rows)
 
+-- Check that the Hash Join bucket size estimator detects equal clauses correctly.
+SET enable_nestloop = 'off';
+SET enable_mergejoin = 'off';
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2 ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x);
+                       QUERY PLAN                       
+--------------------------------------------------------
+ Hash Left Join
+   Hash Cond: ((sb_1.x = sb_2.x) AND (sb_1.x = sb_2.x))
+   ->  Seq Scan on sb_1
+   ->  Hash
+         ->  Seq Scan on sb_2
+(5 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2
+   ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x) AND (sb_1.y=sb_2.y);
+                                  QUERY PLAN                                  
+------------------------------------------------------------------------------
+ Hash Left Join
+   Hash Cond: ((sb_1.x = sb_2.x) AND (sb_1.y = sb_2.y) AND (sb_1.x = sb_2.x))
+   ->  Seq Scan on sb_1
+   ->  Hash
+         ->  Seq Scan on sb_2
+(5 rows)
+
+RESET enable_nestloop;
+RESET enable_mergejoin;
 DROP TABLE sb_1, sb_2 CASCADE;
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index b71a6cd089f..da4f2fe9c93 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -1747,4 +1747,15 @@ ANALYZE sb_2;
 EXPLAIN (COSTS OFF) -- Choose hash join
 SELECT * FROM sb_1 a, sb_2 b WHERE a.x = b.x AND a.y = b.y AND a.z = b.z;
 
+-- Check that the Hash Join bucket size estimator detects equal clauses correctly.
+SET enable_nestloop = 'off';
+SET enable_mergejoin = 'off';
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2 ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x);
+EXPLAIN (COSTS OFF)
+SELECT FROM sb_1 LEFT JOIN sb_2
+   ON (sb_2.x=sb_1.x) AND (sb_1.x=sb_2.x) AND (sb_1.y=sb_2.y);
+RESET enable_nestloop;
+RESET enable_mergejoin;
+
 DROP TABLE sb_1, sb_2 CASCADE;
-- 
2.39.5

