From e8653477fbe7321325122a2d5032798cd6a95a8b Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas.vondra@enterprisedb.com>
Date: Mon, 26 Jun 2023 15:39:11 +0300
Subject: [PATCH] Fixed the case of calculating underestimated cardinality for
 an LEFT JOIN with the restriction "IS NULL" in the clause. This error is
 caused by an incorrect calculation of selectivity in the "IS NULL" clause,
 since it took into account only table-level statistics without zero values in
 the results of the join operation. This patch fixes this by calculating the
 fraction of zero values on the right side of the join without of matching row
 on left.

Co-authored-by: Alena Rybakina <a.rybakina@postgrespro.ru>
---
 src/backend/utils/adt/selfuncs.c | 35 +++++++++++++++++++++++++++++---
 src/include/nodes/pathnodes.h    |  3 +++
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index c4fcd0076ea..8e18aa1dd2b 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -153,7 +153,7 @@ static double eqjoinsel_inner(Oid opfuncoid, Oid collation,
 							  bool isdefault1, bool isdefault2,
 							  AttStatsSlot *sslot1, AttStatsSlot *sslot2,
 							  Form_pg_statistic stats1, Form_pg_statistic stats2,
-							  bool have_mcvs1, bool have_mcvs2);
+							  bool have_mcvs1, bool have_mcvs2, double *unmatched_frac);
 static double eqjoinsel_semi(Oid opfuncoid, Oid collation,
 							 VariableStatData *vardata1, VariableStatData *vardata2,
 							 double nd1, double nd2,
@@ -1710,6 +1710,9 @@ nulltestsel(PlannerInfo *root, NullTestType nulltesttype, Node *arg,
 		stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
 		freq_null = stats->stanullfrac;
 
+		if (sjinfo)
+			freq_null = freq_null + sjinfo->unmatched_frac - freq_null * sjinfo->unmatched_frac;
+
 		switch (nulltesttype)
 		{
 			case IS_NULL:
@@ -2313,13 +2316,24 @@ eqjoinsel(PG_FUNCTION_ARGS)
 	}
 
 	/* We need to compute the inner-join selectivity in all cases */
+	/*
+	 * calculate fraction of right without of matching row on left
+	 *
+	 * FIXME Should be restricted to JOIN_LEFT, we should have similar logic
+	 * for JOIN_FULL.
+	 *
+	 * XXX Probably should calculate unmatched as fraction of the join result,
+	 * not of the relation on the right (because the matched part can have more
+	 * matches per row and thus grow). Not sure. Depends on how it's used later.
+	 */
 	selec_inner = eqjoinsel_inner(opfuncoid, collation,
 								  &vardata1, &vardata2,
 								  nd1, nd2,
 								  isdefault1, isdefault2,
 								  &sslot1, &sslot2,
 								  stats1, stats2,
-								  have_mcvs1, have_mcvs2);
+								  have_mcvs1, have_mcvs2,
+								  &sjinfo->unmatched_frac);
 
 	switch (sjinfo->jointype)
 	{
@@ -2407,7 +2421,7 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation,
 				bool isdefault1, bool isdefault2,
 				AttStatsSlot *sslot1, AttStatsSlot *sslot2,
 				Form_pg_statistic stats1, Form_pg_statistic stats2,
-				bool have_mcvs1, bool have_mcvs2)
+				bool have_mcvs1, bool have_mcvs2, double *unmatched_frac)
 {
 	double		selec;
 
@@ -2503,7 +2517,10 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation,
 		}
 		CLAMP_PROBABILITY(matchfreq1);
 		CLAMP_PROBABILITY(unmatchfreq1);
+
+		*unmatched_frac = unmatchfreq1;
 		matchfreq2 = unmatchfreq2 = 0.0;
+
 		for (i = 0; i < sslot2->nvalues; i++)
 		{
 			if (hasmatch2[i])
@@ -2581,10 +2598,22 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation,
 		double		nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
 
 		selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
+
+		/*
+		 * XXX Should this look at nullfrac on either side? Probably depends on
+		 * if we're calculating fraction of NULLs or fraction of unmatched rows.
+		 */
+		// unmatchfreq = (1.0 - nullfrac1) * (1.0 - nullfrac2);
 		if (nd1 > nd2)
+		{
 			selec /= nd1;
+			*unmatched_frac = (nd1 - nd2) * 1.0 / nd1;
+		}
 		else
+		{
 			selec /= nd2;
+			*unmatched_frac = 0.0;
+		}
 	}
 
 	return selec;
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index c17b53f7adb..6bc63e648e6 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -2853,6 +2853,9 @@ struct SpecialJoinInfo
 	bool		semi_can_hash;	/* true if semi_operators are all hash */
 	List	   *semi_operators; /* OIDs of equality join operators */
 	List	   *semi_rhs_exprs; /* righthand-side expressions of these ops */
+
+	/* For outer join, fraction of rows without a match. */
+	Selectivity	unmatched_frac;
 };
 
 /*
-- 
2.34.1

