From 47daf6945403425f3a09a2df450a2cd2e96ae23c Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas.vondra@enterprisedb.com>
Date: Mon, 10 Jul 2023 17:29:41 +0300
Subject: [PATCH] Fix the case of calculating the underestimated cardinality
 for LEFT/RIGHT/FULL OUTER JOIN, where zero elements were not taken into
 account as a result of the connection operation. This patch fixes this by
 calculating the fraction of zero values on the one side of the join without
 of matching row on the other side.

Co-authored-by: Alena Rybakina <a.rybakina@postgrespro.ru>
---
 src/backend/utils/adt/array_selfuncs.c |   2 +-
 src/backend/utils/adt/selfuncs.c       | 314 ++++++++++++++-----------
 src/include/nodes/pathnodes.h          |   3 +
 src/include/utils/selfuncs.h           |   2 +-
 4 files changed, 178 insertions(+), 143 deletions(-)

diff --git a/src/backend/utils/adt/array_selfuncs.c b/src/backend/utils/adt/array_selfuncs.c
index 9207a5ed193..56ce460cf05 100644
--- a/src/backend/utils/adt/array_selfuncs.c
+++ b/src/backend/utils/adt/array_selfuncs.c
@@ -93,7 +93,7 @@ scalararraysel_containment(PlannerInfo *root,
 	/*
 	 * rightop must be a variable, else punt.
 	 */
-	examine_variable(root, rightop, varRelid, &vardata);
+	examine_variable(root, rightop, varRelid, NULL, &vardata);
 	if (!vardata.rel)
 	{
 		ReleaseVariableStats(vardata);
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index c4fcd0076ea..072579dbfd5 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -153,7 +153,7 @@ static double eqjoinsel_inner(Oid opfuncoid, Oid collation,
 							  bool isdefault1, bool isdefault2,
 							  AttStatsSlot *sslot1, AttStatsSlot *sslot2,
 							  Form_pg_statistic stats1, Form_pg_statistic stats2,
-							  bool have_mcvs1, bool have_mcvs2);
+							  bool have_mcvs1, bool have_mcvs2, double *unmatched_frac);
 static double eqjoinsel_semi(Oid opfuncoid, Oid collation,
 							 VariableStatData *vardata1, VariableStatData *vardata2,
 							 double nd1, double nd2,
@@ -1513,7 +1513,7 @@ boolvarsel(PlannerInfo *root, Node *arg, int varRelid)
 	VariableStatData vardata;
 	double		selec;
 
-	examine_variable(root, arg, varRelid, &vardata);
+	examine_variable(root, arg, varRelid, NULL, &vardata);
 	if (HeapTupleIsValid(vardata.statsTuple))
 	{
 		/*
@@ -1541,110 +1541,20 @@ booltestsel(PlannerInfo *root, BoolTestType booltesttype, Node *arg,
 {
 	VariableStatData vardata;
 	double		selec;
+	double		freq_null;
+	AttStatsSlot sslot;
 
-	examine_variable(root, arg, varRelid, &vardata);
+	examine_variable(root, arg, varRelid, sjinfo, &vardata);
 
-	if (HeapTupleIsValid(vardata.statsTuple))
+	if (sjinfo)
+	{
+		freq_null = sjinfo->unmatched_frac;
+	}
+	else if(HeapTupleIsValid(vardata.statsTuple))
 	{
 		Form_pg_statistic stats;
-		double		freq_null;
-		AttStatsSlot sslot;
-
 		stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
 		freq_null = stats->stanullfrac;
-
-		if (get_attstatsslot(&sslot, vardata.statsTuple,
-							 STATISTIC_KIND_MCV, InvalidOid,
-							 ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS)
-			&& sslot.nnumbers > 0)
-		{
-			double		freq_true;
-			double		freq_false;
-
-			/*
-			 * Get first MCV frequency and derive frequency for true.
-			 */
-			if (DatumGetBool(sslot.values[0]))
-				freq_true = sslot.numbers[0];
-			else
-				freq_true = 1.0 - sslot.numbers[0] - freq_null;
-
-			/*
-			 * Next derive frequency for false. Then use these as appropriate
-			 * to derive frequency for each case.
-			 */
-			freq_false = 1.0 - freq_true - freq_null;
-
-			switch (booltesttype)
-			{
-				case IS_UNKNOWN:
-					/* select only NULL values */
-					selec = freq_null;
-					break;
-				case IS_NOT_UNKNOWN:
-					/* select non-NULL values */
-					selec = 1.0 - freq_null;
-					break;
-				case IS_TRUE:
-					/* select only TRUE values */
-					selec = freq_true;
-					break;
-				case IS_NOT_TRUE:
-					/* select non-TRUE values */
-					selec = 1.0 - freq_true;
-					break;
-				case IS_FALSE:
-					/* select only FALSE values */
-					selec = freq_false;
-					break;
-				case IS_NOT_FALSE:
-					/* select non-FALSE values */
-					selec = 1.0 - freq_false;
-					break;
-				default:
-					elog(ERROR, "unrecognized booltesttype: %d",
-						 (int) booltesttype);
-					selec = 0.0;	/* Keep compiler quiet */
-					break;
-			}
-
-			free_attstatsslot(&sslot);
-		}
-		else
-		{
-			/*
-			 * No most-common-value info available. Still have null fraction
-			 * information, so use it for IS [NOT] UNKNOWN. Otherwise adjust
-			 * for null fraction and assume a 50-50 split of TRUE and FALSE.
-			 */
-			switch (booltesttype)
-			{
-				case IS_UNKNOWN:
-					/* select only NULL values */
-					selec = freq_null;
-					break;
-				case IS_NOT_UNKNOWN:
-					/* select non-NULL values */
-					selec = 1.0 - freq_null;
-					break;
-				case IS_TRUE:
-				case IS_FALSE:
-					/* Assume we select half of the non-NULL values */
-					selec = (1.0 - freq_null) / 2.0;
-					break;
-				case IS_NOT_TRUE:
-				case IS_NOT_FALSE:
-					/* Assume we select NULLs plus half of the non-NULLs */
-					/* equiv. to freq_null + (1.0 - freq_null) / 2.0 */
-					selec = (freq_null + 1.0) / 2.0;
-					break;
-				default:
-					elog(ERROR, "unrecognized booltesttype: %d",
-						 (int) booltesttype);
-					selec = 0.0;	/* Keep compiler quiet */
-					break;
-			}
-		}
 	}
 	else
 	{
@@ -1680,8 +1590,103 @@ booltestsel(PlannerInfo *root, BoolTestType booltesttype, Node *arg,
 				selec = 0.0;	/* Keep compiler quiet */
 				break;
 		}
+		goto end;
+	}
+
+	if (vardata.statsTuple && get_attstatsslot(&sslot, vardata.statsTuple,
+							STATISTIC_KIND_MCV, InvalidOid,
+							ATTSTATSSLOT_VALUES | ATTSTATSSLOT_NUMBERS)
+		&& sslot.nnumbers > 0)
+	{
+		double		freq_true;
+		double		freq_false;
+
+		/*
+			* Get first MCV frequency and derive frequency for true.
+			*/
+		if (DatumGetBool(sslot.values[0]))
+			freq_true = sslot.numbers[0];
+		else
+			freq_true = 1.0 - sslot.numbers[0] - freq_null;
+
+		/*
+			* Next derive frequency for false. Then use these as appropriate
+			* to derive frequency for each case.
+			*/
+		freq_false = 1.0 - freq_true - freq_null;
+
+		switch (booltesttype)
+		{
+			case IS_UNKNOWN:
+				/* select only NULL values */
+				selec = freq_null;
+				break;
+			case IS_NOT_UNKNOWN:
+				/* select non-NULL values */
+				selec = 1.0 - freq_null;
+				break;
+			case IS_TRUE:
+				/* select only TRUE values */
+				selec = freq_true;
+				break;
+			case IS_NOT_TRUE:
+				/* select non-TRUE values */
+				selec = 1.0 - freq_true;
+				break;
+			case IS_FALSE:
+				/* select only FALSE values */
+				selec = freq_false;
+				break;
+			case IS_NOT_FALSE:
+				/* select non-FALSE values */
+				selec = 1.0 - freq_false;
+				break;
+			default:
+				elog(ERROR, "unrecognized booltesttype: %d",
+						(int) booltesttype);
+				selec = 0.0;	/* Keep compiler quiet */
+				break;
+		}
+
+		free_attstatsslot(&sslot);
+	}
+	else
+	{
+		/*
+			* No most-common-value info available. Still have null fraction
+			* information, so use it for IS [NOT] UNKNOWN. Otherwise adjust
+			* for null fraction and assume a 50-50 split of TRUE and FALSE.
+			*/
+		switch (booltesttype)
+		{
+			case IS_UNKNOWN:
+				/* select only NULL values */
+				selec = freq_null;
+				break;
+			case IS_NOT_UNKNOWN:
+				/* select non-NULL values */
+				selec = 1.0 - freq_null;
+				break;
+			case IS_TRUE:
+			case IS_FALSE:
+				/* Assume we select half of the non-NULL values */
+				selec = (1.0 - freq_null) / 2.0;
+				break;
+			case IS_NOT_TRUE:
+			case IS_NOT_FALSE:
+				/* Assume we select NULLs plus half of the non-NULLs */
+				/* equiv. to freq_null + (1.0 - freq_null) / 2.0 */
+				selec = (freq_null + 1.0) / 2.0;
+				break;
+			default:
+				elog(ERROR, "unrecognized booltesttype: %d",
+						(int) booltesttype);
+				selec = 0.0;	/* Keep compiler quiet */
+				break;
+		}
 	}
 
+	end:
 	ReleaseVariableStats(vardata);
 
 	/* result should be in range, but make sure... */
@@ -1699,39 +1704,19 @@ nulltestsel(PlannerInfo *root, NullTestType nulltesttype, Node *arg,
 {
 	VariableStatData vardata;
 	double		selec;
+	double		freq_null;
 
-	examine_variable(root, arg, varRelid, &vardata);
+	examine_variable(root, arg, varRelid, sjinfo, &vardata);
 
-	if (HeapTupleIsValid(vardata.statsTuple))
+	if (sjinfo)
+	{
+		freq_null = sjinfo->unmatched_frac;
+	}
+	else if (HeapTupleIsValid(vardata.statsTuple))
 	{
 		Form_pg_statistic stats;
-		double		freq_null;
-
 		stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
 		freq_null = stats->stanullfrac;
-
-		switch (nulltesttype)
-		{
-			case IS_NULL:
-
-				/*
-				 * Use freq_null directly.
-				 */
-				selec = freq_null;
-				break;
-			case IS_NOT_NULL:
-
-				/*
-				 * Select not unknown (not null) values. Calculate from
-				 * freq_null.
-				 */
-				selec = 1.0 - freq_null;
-				break;
-			default:
-				elog(ERROR, "unrecognized nulltesttype: %d",
-					 (int) nulltesttype);
-				return (Selectivity) 0; /* keep compiler quiet */
-		}
 	}
 	else if (vardata.var && IsA(vardata.var, Var) &&
 			 ((Var *) vardata.var)->varattno < 0)
@@ -1741,6 +1726,7 @@ nulltestsel(PlannerInfo *root, NullTestType nulltesttype, Node *arg,
 		 * NULL.
 		 */
 		selec = (nulltesttype == IS_NULL) ? 0.0 : 1.0;
+		goto end;
 	}
 	else
 	{
@@ -1760,8 +1746,33 @@ nulltestsel(PlannerInfo *root, NullTestType nulltesttype, Node *arg,
 					 (int) nulltesttype);
 				return (Selectivity) 0; /* keep compiler quiet */
 		}
+		goto end;
+	}
+
+	switch (nulltesttype)
+	{
+		case IS_NULL:
+
+			/*
+				* Use freq_null directly.
+				*/
+			selec = freq_null;
+			break;
+		case IS_NOT_NULL:
+
+			/*
+				* Select not unknown (not null) values. Calculate from
+				* freq_null.
+				*/
+			selec = 1.0 - freq_null;
+			break;
+		default:
+			elog(ERROR, "unrecognized nulltesttype: %d",
+					(int) nulltesttype);
+			return (Selectivity) 0; /* keep compiler quiet */
 	}
 
+	end:
 	ReleaseVariableStats(vardata);
 
 	/* result should be in range, but make sure... */
@@ -2319,7 +2330,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
 								  isdefault1, isdefault2,
 								  &sslot1, &sslot2,
 								  stats1, stats2,
-								  have_mcvs1, have_mcvs2);
+								  have_mcvs1, have_mcvs2, &sjinfo->unmatched_frac);
 
 	switch (sjinfo->jointype)
 	{
@@ -2407,7 +2418,8 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation,
 				bool isdefault1, bool isdefault2,
 				AttStatsSlot *sslot1, AttStatsSlot *sslot2,
 				Form_pg_statistic stats1, Form_pg_statistic stats2,
-				bool have_mcvs1, bool have_mcvs2)
+				bool have_mcvs1, bool have_mcvs2,
+				double *unmatched_frac)
 {
 	double		selec;
 
@@ -2503,6 +2515,9 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation,
 		}
 		CLAMP_PROBABILITY(matchfreq1);
 		CLAMP_PROBABILITY(unmatchfreq1);
+
+		*unmatched_frac = unmatchfreq1;
+
 		matchfreq2 = unmatchfreq2 = 0.0;
 		for (i = 0; i < sslot2->nvalues; i++)
 		{
@@ -2581,10 +2596,21 @@ eqjoinsel_inner(Oid opfuncoid, Oid collation,
 		double		nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
 
 		selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
-		if (nd1 > nd2)
-			selec /= nd1;
+		/*
+		 * XXX Should this look at nullfrac on either side? Probably depends on
+		 * if we're calculating fraction of NULLs or fraction of unmatched rows.
+		 */
+		// unmatchfreq = (1.0 - nullfrac1) * (1.0 - nullfrac2);
+		if (nd1 != nd2)
+		{
+			selec /= Max(nd1, nd2);
+			*unmatched_frac = abs(nd1 - nd2) * 1.0 / Max(nd1, nd2);
+		}
 		else
+		{
 			selec /= nd2;
+			*unmatched_frac = 0.0;
+		}
 	}
 
 	return selec;
@@ -2964,8 +2990,8 @@ mergejoinscansel(PlannerInfo *root, Node *clause,
 		return;					/* shouldn't happen */
 
 	/* Look for stats for the inputs */
-	examine_variable(root, left, 0, &leftvar);
-	examine_variable(root, right, 0, &rightvar);
+	examine_variable(root, left, 0, NULL, &leftvar);
+	examine_variable(root, right, 0, NULL, &rightvar);
 
 	/* Extract the operator's declared left/right datatypes */
 	get_op_opfamily_properties(opno, opfamily, false,
@@ -3469,7 +3495,7 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 		 * independent, than to combine estimates for the extracted variables
 		 * when we don't know how that relates to the expressions.
 		 */
-		examine_variable(root, groupexpr, 0, &vardata);
+		examine_variable(root, groupexpr, 0, NULL, &vardata);
 		if (HeapTupleIsValid(vardata.statsTuple) || vardata.isunique)
 		{
 			varinfos = add_unique_group_var(root, varinfos,
@@ -3510,7 +3536,7 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 		{
 			Node	   *var = (Node *) lfirst(l2);
 
-			examine_variable(root, var, 0, &vardata);
+			examine_variable(root, var, 0, NULL, &vardata);
 			varinfos = add_unique_group_var(root, varinfos, var, &vardata);
 			ReleaseVariableStats(vardata);
 		}
@@ -3777,7 +3803,7 @@ estimate_hash_bucket_stats(PlannerInfo *root, Node *hashkey, double nbuckets,
 	bool		isdefault;
 	AttStatsSlot sslot;
 
-	examine_variable(root, hashkey, 0, &vardata);
+	examine_variable(root, hashkey, 0, NULL, &vardata);
 
 	/* Look up the frequency of the most common value, if available */
 	*mcv_freq = 0.0;
@@ -4865,8 +4891,8 @@ get_restriction_variable(PlannerInfo *root, List *args, int varRelid,
 	 * Examine both sides.  Note that when varRelid is nonzero, Vars of other
 	 * relations will be treated as pseudoconstants.
 	 */
-	examine_variable(root, left, varRelid, vardata);
-	examine_variable(root, right, varRelid, &rdata);
+	examine_variable(root, left, varRelid, NULL, vardata);
+	examine_variable(root, right, varRelid, NULL, &rdata);
 
 	/*
 	 * If one side is a variable and the other not, we win.
@@ -4919,8 +4945,8 @@ get_join_variables(PlannerInfo *root, List *args, SpecialJoinInfo *sjinfo,
 	left = (Node *) linitial(args);
 	right = (Node *) lsecond(args);
 
-	examine_variable(root, left, 0, vardata1);
-	examine_variable(root, right, 0, vardata2);
+	examine_variable(root, left, 0, NULL, vardata1);
+	examine_variable(root, right, 0, NULL, vardata2);
 
 	if (vardata1->rel &&
 		bms_is_subset(vardata1->rel->relids, sjinfo->syn_righthand))
@@ -4975,12 +5001,13 @@ ReleaseDummy(HeapTuple tuple)
  * Caller is responsible for doing ReleaseVariableStats() before exiting.
  */
 void
-examine_variable(PlannerInfo *root, Node *node, int varRelid,
+examine_variable(PlannerInfo *root, Node *node, int varRelid, SpecialJoinInfo *sjinfo,
 				 VariableStatData *vardata)
 {
 	Node	   *basenode;
 	Relids		varnos;
 	RelOptInfo *onerel;
+	Form_pg_statistic stats;
 
 	/* Make sure we don't return dangling pointers in vardata */
 	MemSet(vardata, 0, sizeof(VariableStatData));
@@ -5240,6 +5267,11 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
 				break;
 		}
 
+		if (HeapTupleIsValid(vardata->statsTuple) && sjinfo)
+		{
+			stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple);
+			sjinfo->unmatched_frac = stats->stanullfrac + sjinfo->unmatched_frac - stats->stanullfrac * sjinfo->unmatched_frac;
+		}
 		/*
 		 * Search extended statistics for one with a matching expression.
 		 * There might be multiple ones, so just grab the first one. In the
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index c17b53f7adb..6bc63e648e6 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -2853,6 +2853,9 @@ struct SpecialJoinInfo
 	bool		semi_can_hash;	/* true if semi_operators are all hash */
 	List	   *semi_operators; /* OIDs of equality join operators */
 	List	   *semi_rhs_exprs; /* righthand-side expressions of these ops */
+
+	/* For outer join, fraction of rows without a match. */
+	Selectivity	unmatched_frac;
 };
 
 /*
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h
index 2f76c473db4..d9418f7541c 100644
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -148,7 +148,7 @@ extern PGDLLIMPORT get_index_stats_hook_type get_index_stats_hook;
 /* Functions in selfuncs.c */
 
 extern void examine_variable(PlannerInfo *root, Node *node, int varRelid,
-							 VariableStatData *vardata);
+								SpecialJoinInfo *sjinfo, VariableStatData *vardata);
 extern bool statistic_proc_security_check(VariableStatData *vardata, Oid func_oid);
 extern bool get_restriction_variable(PlannerInfo *root, List *args,
 									 int varRelid,
-- 
2.34.1

