From 439d39db128b9cbc06063a862283d663510d0fcc Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Fri, 13 Nov 2020 01:46:50 +0100
Subject: [PATCH] Support estimation of clauses of the form Var op Var

Until now, extended statistics were used only to estimate clauses of the
form (Var op Const). This patch extends the estimation to also handle
clauses (Var op Var).
---
 src/backend/statistics/extended_stats.c       | 67 +++++++++----
 src/backend/statistics/mcv.c                  | 71 +++++++++++++-
 .../statistics/extended_stats_internal.h      |  2 +-
 src/test/regress/expected/stats_ext.out       | 96 +++++++++++++++++++
 src/test/regress/sql/stats_ext.sql            | 26 +++++
 5 files changed, 243 insertions(+), 19 deletions(-)

diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index 36326927c6..1a6e7a3fc0 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -987,14 +987,18 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause,
 	{
 		RangeTblEntry *rte = root->simple_rte_array[relid];
 		OpExpr	   *expr = (OpExpr *) clause;
-		Var		   *var;
+		Var		   *var,
+				   *var2;
 
 		/* Only expressions with two arguments are considered compatible. */
 		if (list_length(expr->args) != 2)
 			return false;
 
-		/* Check if the expression has the right shape (one Var, one Const) */
-		if (!examine_clause_args(expr->args, &var, NULL, NULL))
+		/*
+		 * Check if the expression has the right shape (one Var and one Const,
+		 * or two Vars).
+		 */
+		if (!examine_clause_args(expr->args, &var, &var2, NULL, NULL))
 			return false;
 
 		/*
@@ -1034,7 +1038,20 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause,
 			!get_func_leakproof(get_opcode(expr->opno)))
 			return false;
 
-		return statext_is_compatible_clause_internal(root, (Node *) var,
+		/*
+		 * Check compatibility of the first Var - we get this one for both
+		 * types of supported expressions (Var op Const) and (Var op Var).
+		 */
+		if (!statext_is_compatible_clause_internal(root, (Node *) var,
+												   relid, attnums))
+			return false;
+
+		/* For (Var op Const) we don't get the second Var, and we're done. */
+		if (!var2)
+			return true;
+
+		/* For (Var op Var) check compatibility of the second Var. */
+		return statext_is_compatible_clause_internal(root, (Node *) var2,
 													 relid, attnums);
 	}
 
@@ -1050,7 +1067,7 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause,
 			return false;
 
 		/* Check if the expression has the right shape (one Var, one Const) */
-		if (!examine_clause_args(expr->args, &var, NULL, NULL))
+		if (!examine_clause_args(expr->args, &var, NULL, NULL, NULL))
 			return false;
 
 		/*
@@ -1446,22 +1463,24 @@ statext_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid,
 }
 
 /*
- * examine_opclause_expression
+ * examine_clause_args
  *		Split expression into Var and Const parts.
  *
- * Attempts to match the arguments to either (Var op Const) or (Const op Var),
- * possibly with a RelabelType on top. When the expression matches this form,
- * returns true, otherwise returns false.
+ * Attempts to match the arguments to either (Var op Const) or (Const op Var)
+ * or (Var op Var), possibly with a RelabelType on top. When the expression
+ * matches this form, returns true, otherwise returns false.
  *
  * Optionally returns pointers to the extracted Var/Const nodes, when passed
  * non-null pointers (varp, cstp and varonleftp). The varonleftp flag specifies
  * on which side of the operator we found the Var node.
  */
 bool
-examine_clause_args(List *args, Var **varp, Const **cstp, bool *varonleftp)
+examine_clause_args(List *args, Var **var1p, Var **var2p, Const **cstp,
+					bool *varonleftp)
 {
-	Var		   *var;
-	Const	   *cst;
+	Var	   *var1 = NULL;
+	Var	   *var2 = NULL;
+	Const  *cst = NULL;
 	bool		varonleft;
 	Node	   *leftop,
 			   *rightop;
@@ -1481,22 +1500,38 @@ examine_clause_args(List *args, Var **varp, Const **cstp, bool *varonleftp)
 
 	if (IsA(leftop, Var) && IsA(rightop, Const))
 	{
-		var = (Var *) leftop;
+		var1 = (Var *) leftop;
 		cst = (Const *) rightop;
 		varonleft = true;
 	}
 	else if (IsA(leftop, Const) && IsA(rightop, Var))
 	{
-		var = (Var *) rightop;
+		var1 = (Var *) rightop;
 		cst = (Const *) leftop;
 		varonleft = false;
 	}
+	else if (IsA(leftop, Var) && IsA(rightop, Var))
+	{
+		var1 = (Var *) leftop;
+		var2 = (Var *) rightop;
+		varonleft = false;
+
+		/*
+		 * Both variables have to be for the same relation (otherwise it's
+		 * a join clause, and we don't deal with those yet.
+		 */
+		if (var1->varno != var2->varno)
+			return false;
+	}
 	else
 		return false;
 
 	/* return pointers to the extracted parts if requested */
-	if (varp)
-		*varp = var;
+	if (var1p)
+		*var1p = var1;
+
+	if (var2p)
+		*var2p = var2;
 
 	if (cstp)
 		*cstp = cst;
diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c
index 6a262f1543..ac9a8020b8 100644
--- a/src/backend/statistics/mcv.c
+++ b/src/backend/statistics/mcv.c
@@ -1583,13 +1583,17 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
 
 			/* valid only after examine_clause_args returns true */
 			Var		   *var;
+			Var		   *var2;
 			Const	   *cst;
 			bool		varonleft;
 
 			fmgr_info(get_opcode(expr->opno), &opproc);
 
 			/* extract the var and const from the expression */
-			if (examine_clause_args(expr->args, &var, &cst, &varonleft))
+			if (!examine_clause_args(expr->args, &var, &var2, &cst, &varonleft))
+				continue;
+
+			if (cst)	/* Var op Const */
 			{
 				int			idx;
 
@@ -1653,6 +1657,68 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
 					matches[i] = RESULT_MERGE(matches[i], is_or, match);
 				}
 			}
+			else	/* Var op Var */
+			{
+				int			idx;
+				int			idx2;
+
+				Assert(var2);
+
+				/* match the attribute to a dimension of the statistic */
+				idx = bms_member_index(keys, var->varattno);
+				idx2 = bms_member_index(keys, var2->varattno);
+
+				/*
+				 * Walk through the MCV items and evaluate the current clause.
+				 * We can skip items that were already ruled out, and
+				 * terminate if there are no remaining MCV items that might
+				 * possibly match.
+				 */
+				for (i = 0; i < mcvlist->nitems; i++)
+				{
+					bool		match = true;
+					MCVItem    *item = &mcvlist->items[i];
+
+					/*
+					 * When either of the MCV items is NULL we can treat this
+					 * as a mismatch. We must not call the operator because
+					 * of strictness.
+					 */
+					if (item->isnull[idx] || item->isnull[idx2])
+					{
+						matches[i] = RESULT_MERGE(matches[i], is_or, false);
+						continue;
+					}
+
+					/*
+					 * Skip MCV items that can't change result in the bitmap.
+					 * Once the value gets false for AND-lists, or true for
+					 * OR-lists, we don't need to look at more clauses.
+					 */
+					if (RESULT_IS_FINAL(matches[i], is_or))
+						continue;
+
+					/*
+					 * First check whether the constant is below the lower
+					 * boundary (in that case we can skip the bucket, because
+					 * there's no overlap).
+					 *
+					 * We don't store collations used to build the statistics,
+					 * but we can use the collation for the attribute itself,
+					 * as stored in varcollid. We do reset the statistics after
+					 * a type change (including collation change), so this is
+					 * OK. We may need to relax this after allowing extended
+					 * statistics on expressions.
+					 */
+					match = DatumGetBool(FunctionCall2Coll(&opproc,
+														   var->varcollid,
+														   item->values[idx],
+														   item->values[idx2]));
+
+					/* update the match bitmap with the result */
+					matches[i] = RESULT_MERGE(matches[i], is_or, match);
+				}
+			}
 		}
 		else if (IsA(clause, ScalarArrayOpExpr))
 		{
@@ -1667,7 +1733,7 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
 			fmgr_info(get_opcode(expr->opno), &opproc);
 
 			/* extract the var and const from the expression */
-			if (examine_clause_args(expr->args, &var, &cst, &varonleft))
+			if (examine_clause_args(expr->args, &var, NULL, &cst, &varonleft))
 			{
 				int			idx;
 
@@ -1681,6 +1747,7 @@ mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
 
 				/* ScalarArrayOpExpr has the Var always on the left */
 				Assert(varonleft);
+				Assert(cst);
 
 				if (!cst->constisnull)
 				{
diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h
index 61e69696cf..8f0bf2df5c 100644
--- a/src/include/statistics/extended_stats_internal.h
+++ b/src/include/statistics/extended_stats_internal.h
@@ -96,7 +96,7 @@ extern SortItem *build_sorted_items(int numrows, int *nitems, HeapTuple *rows,
 									TupleDesc tdesc, MultiSortSupport mss,
 									int numattrs, AttrNumber *attnums);
 
-extern bool examine_clause_args(List *args, Var **varp,
+extern bool examine_clause_args(List *args, Var **var1p, Var **var2p,
 								Const **cstp, bool *varonleftp);
 
 extern Selectivity mcv_clauselist_selectivity(PlannerInfo *root,
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index 4c3edd213f..f77d1d5469 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -978,6 +978,18 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = '
        343 |    200
 (1 row)
 
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a > c');
+ estimated | actual 
+-----------+--------
+      1667 |   3750
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < c');
+ estimated | actual 
+-----------+--------
+      1667 |      0
+(1 row)
+
 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')');
  estimated | actual 
 -----------+--------
@@ -1113,6 +1125,18 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = '
        200 |    200
 (1 row)
 
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a > c');
+ estimated | actual 
+-----------+--------
+      3750 |   3750
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < c');
+ estimated | actual 
+-----------+--------
+         1 |      0
+(1 row)
+
 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')');
  estimated | actual 
 -----------+--------
@@ -1323,6 +1347,12 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IS NULL AND
       3750 |   2500
 (1 row)
 
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE b = d');
+ estimated | actual 
+-----------+--------
+        25 |   2500
+(1 row)
+
 -- create statistics
 CREATE STATISTICS mcv_lists_stats (mcv) ON a, b, d FROM mcv_lists;
 ANALYZE mcv_lists;
@@ -1356,6 +1386,12 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IS NULL AND
       2500 |   2500
 (1 row)
 
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE b = d');
+ estimated | actual 
+-----------+--------
+      2500 |   2500
+(1 row)
+
 -- mcv with pass-by-ref fixlen types, e.g. uuid
 CREATE TABLE mcv_lists_uuid (
     a UUID,
@@ -1450,6 +1486,18 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND
       1094 |      0
 (1 row)
 
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a = b');
+ estimated | actual 
+-----------+--------
+      9950 |   2500
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a = b AND b = c');
+ estimated | actual 
+-----------+--------
+        50 |   2500
+(1 row)
+
 CREATE STATISTICS mcv_lists_bool_stats (mcv) ON a, b, c
   FROM mcv_lists_bool;
 ANALYZE mcv_lists_bool;
@@ -1477,6 +1525,18 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND
          1 |      0
 (1 row)
 
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a = b');
+ estimated | actual 
+-----------+--------
+      2500 |   2500
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a = b AND b = c');
+ estimated | actual 
+-----------+--------
+      2500 |   2500
+(1 row)
+
 -- check the ability to use multiple MCV lists
 CREATE TABLE mcv_lists_multi (
 	a INTEGER,
@@ -1512,6 +1572,24 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AN
          4 |    142
 (1 row)
 
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = b AND c = d');
+ estimated | actual 
+-----------+--------
+         1 |   5000
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a < b');
+ estimated | actual 
+-----------+--------
+      1667 |      0
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a <= b');
+ estimated | actual 
+-----------+--------
+      1667 |   5000
+(1 row)
+
 -- create separate MCV statistics
 CREATE STATISTICS mcv_lists_multi_1 (mcv) ON a, b FROM mcv_lists_multi;
 CREATE STATISTICS mcv_lists_multi_2 (mcv) ON c, d FROM mcv_lists_multi;
@@ -1534,6 +1612,24 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AN
        143 |    142
 (1 row)
 
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = b AND c = d');
+ estimated | actual 
+-----------+--------
+      5000 |   5000
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a < b');
+ estimated | actual 
+-----------+--------
+         1 |      0
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a <= b');
+ estimated | actual 
+-----------+--------
+      5000 |   5000
+(1 row)
+
 DROP TABLE mcv_lists_multi;
 -- Permission tests. Users should not be able to see specific data values in
 -- the extended statistics, if they lack permission to see those values in
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index 9781e590a3..7ae02397a9 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -512,6 +512,10 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = '
 
 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL');
 
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a > c');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < c');
+
 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')');
 
 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52, NULL) AND b IN ( ''1'', ''2'', NULL)');
@@ -561,6 +565,10 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 4 >= a AND ''0
 
 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1');
 
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a > c');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < c');
+
 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')');
 
 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52, NULL) AND b IN ( ''1'', ''2'', NULL)');
@@ -671,6 +679,8 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = '
 
 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IS NULL AND (b = ''x'' OR d = ''x'')');
 
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE b = d');
+
 -- create statistics
 CREATE STATISTICS mcv_lists_stats (mcv) ON a, b, d FROM mcv_lists;
 
@@ -689,6 +699,8 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = '
 
 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IS NULL AND (b = ''x'' OR d = ''x'')');
 
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE b = d');
+
 -- mcv with pass-by-ref fixlen types, e.g. uuid
 CREATE TABLE mcv_lists_uuid (
     a UUID,
@@ -764,6 +776,10 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND
 
 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND b AND NOT c');
 
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a = b');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a = b AND b = c');
+
 CREATE STATISTICS mcv_lists_bool_stats (mcv) ON a, b, c
   FROM mcv_lists_bool;
 
@@ -777,6 +793,10 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND
 
 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND b AND NOT c');
 
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a = b');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a = b AND b = c');
+
 -- check the ability to use multiple MCV lists
 CREATE TABLE mcv_lists_multi (
 	a INTEGER,
@@ -800,6 +820,9 @@ ANALYZE mcv_lists_multi;
 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0');
 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0');
 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = b AND c = d');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a < b');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a <= b');
 
 -- create separate MCV statistics
 CREATE STATISTICS mcv_lists_multi_1 (mcv) ON a, b FROM mcv_lists_multi;
@@ -810,6 +833,9 @@ ANALYZE mcv_lists_multi;
 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0');
 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0');
 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = b AND c = d');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a < b');
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a <= b');
 
 DROP TABLE mcv_lists_multi;
 
-- 
2.26.2

