From 322b52b92649c2625fbbba369b767029ceff8051 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sat, 25 Sep 2021 19:42:41 -0500
Subject: [PATCH 1/4] Ignore extended statistics for inheritance trees

Since commit 859b3003de we only build extended statistics for relations
without the child relations, so that we update the catalog row only
once. However, the non-inherited statistics were still used for planning
of queries on inheritance trees, which may produce bogus estimates.

This is roughly the same issue 427c6b5b9 addressed ~15 years ago, and we
fix it the same way - by ignoring extended statistics when calculating
estimates for the inheritance tree as a whole. We still consider
extended statistics when calculating estimates for individual child
relations, of course.

Report and patch by Justin Pryzby, minor fixes and cleanup by me.
Backpatch all the way back to PostgreSQL 10, where extended statistics
were introduced (same as 859b3003de).

Author: Justin Pryzby
Reported-by: Justin Pryzby
Backpatch-through: 10
Discussion: https://postgr.es/m/20210923212624.GI831%40telsasoft.com
---
 src/backend/statistics/dependencies.c   |  9 +++++++++
 src/backend/statistics/extended_stats.c |  9 +++++++++
 src/backend/utils/adt/selfuncs.c        | 16 ++++++++++++++++
 src/test/regress/expected/stats_ext.out | 24 ++++++++++++++++++++++++
 src/test/regress/sql/stats_ext.sql      | 15 +++++++++++++++
 5 files changed, 73 insertions(+)

diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c
index 8bf80db8e4..88479eee8b 100644
--- a/src/backend/statistics/dependencies.c
+++ b/src/backend/statistics/dependencies.c
@@ -24,6 +24,7 @@
 #include "nodes/pathnodes.h"
 #include "optimizer/clauses.h"
 #include "optimizer/optimizer.h"
+#include "parser/parsetree.h"
 #include "statistics/extended_stats_internal.h"
 #include "statistics/statistics.h"
 #include "utils/bytea.h"
@@ -1410,11 +1411,19 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
 	int			ndependencies;
 	int			i;
 	AttrNumber	attnum_offset;
+	RangeTblEntry *rte = planner_rt_fetch(rel->relid, root);
 
 	/* unique expressions */
 	Node	  **unique_exprs;
 	int			unique_exprs_cnt;
 
+	/*
+	 * When dealing with inheritence trees, ignore extended stats (which were
+	 * built without data from child rels, and thus do not represent them).
+	 */
+	if (rte->inh)
+		return 1.0;
+
 	/* check if there's any stats that might be useful for us. */
 	if (!has_stats_of_kind(rel->statlist, STATS_EXT_DEPENDENCIES))
 		return 1.0;
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index 69ca52094f..607b02ab8e 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -30,6 +30,7 @@
 #include "nodes/nodeFuncs.h"
 #include "optimizer/clauses.h"
 #include "optimizer/optimizer.h"
+#include "parser/parsetree.h"
 #include "pgstat.h"
 #include "postmaster/autovacuum.h"
 #include "statistics/extended_stats_internal.h"
@@ -1694,6 +1695,14 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
 	List	  **list_exprs;		/* expressions matched to any statistic */
 	int			listidx;
 	Selectivity sel = (is_or) ? 0.0 : 1.0;
+	RangeTblEntry *rte = planner_rt_fetch(rel->relid, root);
+
+	/*
+	 * When dealing with inheritence trees, ignore extended stats (which were
+	 * built without data from child rels, and thus do not represent them).
+	 */
+	if (rte->inh)
+		return sel;
 
 	/* check if there's any stats that might be useful for us. */
 	if (!has_stats_of_kind(rel->statlist, STATS_EXT_MCV))
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 10895fb287..0b12f5288e 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -3913,6 +3913,14 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
 	Oid			statOid = InvalidOid;
 	MVNDistinct *stats;
 	StatisticExtInfo *matched_info = NULL;
+	RangeTblEntry		*rte = planner_rt_fetch(rel->relid, root);
+
+	/*
+	 * When dealing with inheritence trees, ignore extended stats (which were
+	 * built without data from child rels, and thus do not represent them).
+	 */
+	if (rte->inh)
+		return false;
 
 	/* bail out immediately if the table has no extended statistics */
 	if (!rel->statlist)
@@ -5232,6 +5240,14 @@ examine_variable(PlannerInfo *root, Node *node, int varRelid,
 			if (vardata->statsTuple)
 				break;
 
+			/*
+			 * When dealing with inheritence trees, ignore extended stats (which
+			 * were built without data from child rels, and so do not represent
+			 * them).
+			 */
+			if (planner_rt_fetch(onerel->relid, root)->inh)
+				break;
+
 			/* skip stats without per-expression stats */
 			if (info->kind != STATS_EXT_EXPRESSIONS)
 				continue;
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index c60ba45aba..5410f58f7f 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -176,6 +176,30 @@ CREATE STATISTICS ab1_a_b_stats ON a, b FROM ab1;
 ANALYZE ab1;
 DROP TABLE ab1 CASCADE;
 NOTICE:  drop cascades to table ab1c
+-- Tests for stats with inheritance
+CREATE TABLE stxdinh(i int, j int);
+CREATE TABLE stxdinh1() INHERITS(stxdinh);
+INSERT INTO stxdinh SELECT a, a/10 FROM generate_series(1,9)a;
+INSERT INTO stxdinh1 SELECT a, a FROM generate_series(1,999)a;
+VACUUM ANALYZE stxdinh, stxdinh1;
+-- Ensure non-inherited stats are not applied to inherited query
+-- Without stats object, it looks like this
+SELECT * FROM check_estimated_rows('SELECT * FROM stxdinh* GROUP BY 1,2');
+ estimated | actual 
+-----------+--------
+      1000 |   1008
+(1 row)
+
+CREATE STATISTICS stxdinh ON i,j FROM stxdinh;
+VACUUM ANALYZE stxdinh, stxdinh1;
+-- Since the stats object does not include inherited stats, it should not affect the estimates
+SELECT * FROM check_estimated_rows('SELECT * FROM stxdinh* GROUP BY 1,2');
+ estimated | actual 
+-----------+--------
+      1000 |   1008
+(1 row)
+
+DROP TABLE stxdinh, stxdinh1;
 -- basic test for statistics on expressions
 CREATE TABLE ab1 (a INTEGER, b INTEGER, c TIMESTAMP, d TIMESTAMPTZ);
 -- expression stats may be built on a single expression column
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index 6fb37962a7..a196d5e2f8 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -112,6 +112,21 @@ CREATE STATISTICS ab1_a_b_stats ON a, b FROM ab1;
 ANALYZE ab1;
 DROP TABLE ab1 CASCADE;
 
+-- Tests for stats with inheritance
+CREATE TABLE stxdinh(i int, j int);
+CREATE TABLE stxdinh1() INHERITS(stxdinh);
+INSERT INTO stxdinh SELECT a, a/10 FROM generate_series(1,9)a;
+INSERT INTO stxdinh1 SELECT a, a FROM generate_series(1,999)a;
+VACUUM ANALYZE stxdinh, stxdinh1;
+-- Ensure non-inherited stats are not applied to inherited query
+-- Without stats object, it looks like this
+SELECT * FROM check_estimated_rows('SELECT * FROM stxdinh* GROUP BY 1,2');
+CREATE STATISTICS stxdinh ON i,j FROM stxdinh;
+VACUUM ANALYZE stxdinh, stxdinh1;
+-- Since the stats object does not include inherited stats, it should not affect the estimates
+SELECT * FROM check_estimated_rows('SELECT * FROM stxdinh* GROUP BY 1,2');
+DROP TABLE stxdinh, stxdinh1;
+
 -- basic test for statistics on expressions
 CREATE TABLE ab1 (a INTEGER, b INTEGER, c TIMESTAMP, d TIMESTAMPTZ);
 
-- 
2.31.1

