Showing applied extended statistics in explain Part 2
Hi,
This original patch made by Tomas improves the usability of extended statistics,
so I rebased it on 362de947, and I'd like to re-start developing it.
The previous thread [1]/messages/by-id/8081617b-d80f-ae2b-b79f-ea7e926f9fcf@enterprisedb.com suggested something to solve. I'll try to solve it as
best I can, but I think this feature is worth it with some limitations.
Please find the attached file.
[1]: /messages/by-id/8081617b-d80f-ae2b-b79f-ea7e926f9fcf@enterprisedb.com
Regards,
Tatsuro Yamada
NTT Open Source Software Center
Attachments:
0001-show-stats-in-explain-rebased-on-362de947.patchapplication/octet-stream; name=0001-show-stats-in-explain-rebased-on-362de947.patchDownload
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 83d00a4663..c64bebe2fa 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -78,6 +78,9 @@ static void show_qual(List *qual, const char *qlabel,
static void show_scan_qual(List *qual, const char *qlabel,
PlanState *planstate, List *ancestors,
ExplainState *es);
+static void show_scan_stats(List *stats, List *clauses, List *ors,
+ PlanState *planstate, List *ancestors,
+ ExplainState *es);
static void show_upper_qual(List *qual, const char *qlabel,
PlanState *planstate, List *ancestors,
ExplainState *es);
@@ -1842,6 +1845,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->verbose)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_IndexOnlyScan:
show_scan_qual(((IndexOnlyScan *) plan)->indexqual,
@@ -1858,10 +1865,18 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (es->analyze)
ExplainPropertyFloat("Heap Fetches", NULL,
planstate->instrument->ntuples2, 0, es);
+ if (es->verbose)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_BitmapIndexScan:
show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig,
"Index Cond", planstate, ancestors, es);
+ if (es->verbose)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_BitmapHeapScan:
show_scan_qual(((BitmapHeapScan *) plan)->bitmapqualorig,
@@ -1891,6 +1906,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->verbose)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_Gather:
{
@@ -2078,6 +2097,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->verbose)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_WindowAgg:
show_upper_qual(plan->qual, "Filter", planstate, ancestors, es);
@@ -2093,6 +2116,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->verbose)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_Sort:
show_sort_keys(castNode(SortState, planstate), ancestors, es);
@@ -2412,6 +2439,76 @@ show_scan_qual(List *qual, const char *qlabel,
show_qual(qual, qlabel, planstate, ancestors, useprefix, es);
}
+/*
+ * Show a generic expression
+ */
+static char *
+deparse_stat_expression(Node *node,
+ PlanState *planstate, List *ancestors,
+ ExplainState *es)
+{
+ List *context;
+
+ /* Set up deparsing context */
+ context = set_deparse_context_plan(es->deparse_cxt,
+ planstate->plan,
+ ancestors);
+
+ /* Deparse the expression */
+ return deparse_expression(node, context, false, false);
+}
+
+/*
+ * Show a qualifier expression (which is a List with implicit AND semantics)
+ */
+static char *
+show_stat_qual(List *qual, bool is_or,
+ PlanState *planstate, List *ancestors,
+ ExplainState *es)
+{
+ Node *node;
+
+ /* No work if empty qual */
+ if (qual == NIL)
+ return NULL;
+
+ /* Convert AND list to explicit AND */
+ if (is_or)
+ node = (Node *) make_ors_explicit(qual);
+ else
+ node = (Node *) make_ands_explicit(qual);
+
+ /* And show it */
+ return deparse_stat_expression(node, planstate, ancestors, es);
+}
+
+/*
+ * Show applied statistics for scan plan node
+ */
+static void
+show_scan_stats(List *stats, List *clauses, List *ors,
+ PlanState *planstate, List *ancestors, ExplainState *es)
+{
+ ListCell *lc1, *lc2, *lc3;
+ StringInfoData str;
+
+ forthree (lc1, stats, lc2, clauses, lc3, ors)
+ {
+ StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(lc1);
+ List *applied_clauses = (List *) lfirst(lc2);
+ int is_or = lfirst_int(lc3);
+
+ initStringInfo(&str);
+
+ appendStringInfo(&str, "%s.%s Clauses: %s",
+ get_namespace_name(get_statistics_namespace(stat->statOid)),
+ get_statistics_name(stat->statOid),
+ show_stat_qual(applied_clauses, is_or, planstate, ancestors, es));
+
+ ExplainPropertyText("Statistics", str.data, es);
+ }
+}
+
/*
* Show a qualifier expression for an upper-level plan node
*/
diff --git a/src/backend/nodes/makefuncs.c b/src/backend/nodes/makefuncs.c
index a02332a1ec..089310b304 100644
--- a/src/backend/nodes/makefuncs.c
+++ b/src/backend/nodes/makefuncs.c
@@ -718,6 +718,17 @@ make_ands_explicit(List *andclauses)
return make_andclause(andclauses);
}
+Expr *
+make_ors_explicit(List *orclauses)
+{
+ if (orclauses == NIL)
+ return (Expr *) makeBoolConst(true, false);
+ else if (list_length(orclauses) == 1)
+ return (Expr *) linitial(orclauses);
+ else
+ return make_orclause(orclauses);
+}
+
List *
make_ands_implicit(Expr *clause)
{
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 610f4a56d6..e648505b25 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -5389,12 +5389,27 @@ order_qual_clauses(PlannerInfo *root, List *clauses)
static void
copy_generic_path_info(Plan *dest, Path *src)
{
+ ListCell *lc;
+
dest->startup_cost = src->startup_cost;
dest->total_cost = src->total_cost;
dest->plan_rows = src->rows;
dest->plan_width = src->pathtarget->width;
dest->parallel_aware = src->parallel_aware;
dest->parallel_safe = src->parallel_safe;
+
+ dest->applied_stats = src->parent->applied_stats;
+ dest->applied_clauses_or = src->parent->applied_clauses_or;
+
+ dest->applied_clauses = NIL;
+ foreach (lc, src->parent->applied_clauses)
+ {
+ List *clauses = (List *) lfirst(lc);
+
+ dest->applied_clauses
+ = lappend(dest->applied_clauses,
+ maybe_extract_actual_clauses(clauses, false));
+ }
}
/*
diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c
index e5f4062bfb..06719a5d52 100644
--- a/src/backend/optimizer/util/relnode.c
+++ b/src/backend/optimizer/util/relnode.c
@@ -287,6 +287,10 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent)
rel->partexprs = NULL;
rel->nullable_partexprs = NULL;
+ rel->applied_stats = NIL;
+ rel->applied_clauses = NIL;
+ rel->applied_clauses_or = NIL;
+
/*
* Pass assorted information down the inheritance hierarchy.
*/
@@ -762,6 +766,10 @@ build_join_rel(PlannerInfo *root,
joinrel->partexprs = NULL;
joinrel->nullable_partexprs = NULL;
+ joinrel->applied_stats = NIL;
+ joinrel->applied_clauses = NIL;
+ joinrel->applied_clauses_or = NIL;
+
/* Compute information relevant to the foreign relations. */
set_foreign_rel_properties(joinrel, outer_rel, inner_rel);
@@ -956,6 +964,10 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel,
joinrel->partexprs = NULL;
joinrel->nullable_partexprs = NULL;
+ joinrel->applied_stats = NIL;
+ joinrel->applied_clauses = NIL;
+ joinrel->applied_clauses_or = NIL;
+
/* Compute information relevant to foreign relations. */
set_foreign_rel_properties(joinrel, outer_rel, inner_rel);
diff --git a/src/backend/optimizer/util/restrictinfo.c b/src/backend/optimizer/util/restrictinfo.c
index 0b406e9334..23136a330b 100644
--- a/src/backend/optimizer/util/restrictinfo.c
+++ b/src/backend/optimizer/util/restrictinfo.c
@@ -508,6 +508,41 @@ extract_actual_clauses(List *restrictinfo_list,
return result;
}
+/*
+ * maybe_extract_actual_clauses
+ *
+ * Just like extract_actual_clauses, but does not require the clauses to
+ * already be RestrictInfo.
+ *
+ * XXX Does not handle RestrictInfos nested in OR clauses.
+ */
+List *
+maybe_extract_actual_clauses(List *restrictinfo_list,
+ bool pseudoconstant)
+{
+ List *result = NIL;
+ ListCell *l;
+
+ foreach(l, restrictinfo_list)
+ {
+ RestrictInfo *rinfo;
+ Node *node = (Node *) lfirst(l);
+
+ if (!IsA(node, RestrictInfo))
+ {
+ result = lappend(result, node);
+ continue;
+ }
+
+ rinfo = (RestrictInfo *) node;
+
+ if (rinfo->pseudoconstant == pseudoconstant)
+ result = lappend(result, rinfo->clause);
+ }
+
+ return result;
+}
+
/*
* extract_actual_join_clauses
*
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index c5461514d8..58d51b45d1 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -1858,6 +1858,11 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
list_exprs[listidx] = NULL;
}
+ /* add it to the list of applied stats/clauses */
+ rel->applied_stats = lappend(rel->applied_stats, stat);
+ rel->applied_clauses = lappend(rel->applied_clauses, stat_clauses);
+ rel->applied_clauses_or = lappend_int(rel->applied_clauses_or, (is_or) ? 1 : 0);
+
if (is_or)
{
bool *or_matches = NULL;
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index cea777e9d4..b5e42e139b 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -4069,6 +4069,7 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
ListCell *lc2;
Bitmapset *matched = NULL;
AttrNumber attnum_offset;
+ List *matched_exprs = NIL;
/*
* How much we need to offset the attnums? If there are no
@@ -4116,6 +4117,9 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
matched = bms_add_member(matched, attnum);
+ /* track expressions matched by this statistics */
+ matched_exprs = lappend(matched_exprs, varinfo->var);
+
found = true;
}
@@ -4144,6 +4148,9 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
matched = bms_add_member(matched, attnum);
+ /* track expressions matched by this statistics */
+ matched_exprs = lappend(matched_exprs, expr);
+
/* there should be just one matching expression */
break;
}
@@ -4152,6 +4159,10 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
}
}
+ rel->applied_stats = lappend(rel->applied_stats, matched_info);
+ rel->applied_clauses = lappend(rel->applied_clauses, matched_exprs);
+ rel->applied_clauses_or = lappend(rel->applied_clauses_or, 0);
+
/* Find the specific item that exactly matches the combination */
for (i = 0; i < stats->nitems; i++)
{
diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c
index f730aa26c4..62f060298a 100644
--- a/src/backend/utils/cache/lsyscache.c
+++ b/src/backend/utils/cache/lsyscache.c
@@ -33,6 +33,7 @@
#include "catalog/pg_proc.h"
#include "catalog/pg_range.h"
#include "catalog/pg_statistic.h"
+#include "catalog/pg_statistic_ext.h"
#include "catalog/pg_subscription.h"
#include "catalog/pg_transform.h"
#include "catalog/pg_type.h"
@@ -3669,3 +3670,51 @@ get_subscription_name(Oid subid, bool missing_ok)
return subname;
}
+
+/*
+ * get_statistics_name
+ * Returns the name of a given extended statistics
+ *
+ * Returns a palloc'd copy of the string, or NULL if no such namespace.
+ */
+char *
+get_statistics_name(Oid stxid)
+{
+ HeapTuple tp;
+
+ tp = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(stxid));
+ if (HeapTupleIsValid(tp))
+ {
+ Form_pg_statistic_ext stxtup = (Form_pg_statistic_ext) GETSTRUCT(tp);
+ char *result;
+
+ result = pstrdup(NameStr(stxtup->stxname));
+ ReleaseSysCache(tp);
+ return result;
+ }
+ else
+ return NULL;
+}
+
+/*
+ * get_statistics_namespace
+ * Returns the namespace OID of a given extended statistics
+ */
+Oid
+get_statistics_namespace(Oid stxid)
+{
+ HeapTuple tp;
+
+ tp = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(stxid));
+ if (HeapTupleIsValid(tp))
+ {
+ Form_pg_statistic_ext stxtup = (Form_pg_statistic_ext) GETSTRUCT(tp);
+ Oid result;
+
+ result = stxtup->stxnamespace;
+ ReleaseSysCache(tp);
+ return result;
+ }
+ else
+ return InvalidOid;
+}
diff --git a/src/include/nodes/makefuncs.h b/src/include/nodes/makefuncs.h
index 2dc79648d2..1014efc83f 100644
--- a/src/include/nodes/makefuncs.h
+++ b/src/include/nodes/makefuncs.h
@@ -94,6 +94,8 @@ extern Node *make_and_qual(Node *qual1, Node *qual2);
extern Expr *make_ands_explicit(List *andclauses);
extern List *make_ands_implicit(Expr *clause);
+extern Expr *make_ors_explicit(List *orclauses);
+
extern IndexInfo *makeIndexInfo(int numattrs, int numkeyattrs, Oid amoid,
List *expressions, List *predicates,
bool unique, bool nulls_not_distinct,
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 534692bee1..e40af5a961 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -1030,6 +1030,11 @@ typedef struct RelOptInfo
List **partexprs pg_node_attr(read_write_ignore);
/* Nullable partition key expressions */
List **nullable_partexprs pg_node_attr(read_write_ignore);
+
+ /* info about applied extended statistics */
+ List *applied_stats; /* list of StatisticExtInfo */
+ List *applied_clauses; /* list of lists (of clauses) */
+ List *applied_clauses_or; /* are the clauses AND or OR */
} RelOptInfo;
/*
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index b4ef6bc44c..ecf6c17e78 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -169,6 +169,11 @@ typedef struct Plan
*/
Bitmapset *extParam;
Bitmapset *allParam;
+
+ /* info about applied statistics */
+ List *applied_stats;
+ List *applied_clauses;
+ List *applied_clauses_or;
} Plan;
/* ----------------
diff --git a/src/include/optimizer/restrictinfo.h b/src/include/optimizer/restrictinfo.h
index 1b42c832c5..29aa519b99 100644
--- a/src/include/optimizer/restrictinfo.h
+++ b/src/include/optimizer/restrictinfo.h
@@ -39,6 +39,8 @@ extern bool restriction_is_securely_promotable(RestrictInfo *restrictinfo,
extern List *get_actual_clauses(List *restrictinfo_list);
extern List *extract_actual_clauses(List *restrictinfo_list,
bool pseudoconstant);
+extern List *maybe_extract_actual_clauses(List *restrictinfo_list,
+ bool pseudoconstant);
extern void extract_actual_join_clauses(List *restrictinfo_list,
Relids joinrelids,
List **joinquals,
diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h
index e4a200b00e..3fdcb1fd3c 100644
--- a/src/include/utils/lsyscache.h
+++ b/src/include/utils/lsyscache.h
@@ -203,6 +203,9 @@ extern char *get_publication_name(Oid pubid, bool missing_ok);
extern Oid get_subscription_oid(const char *subname, bool missing_ok);
extern char *get_subscription_name(Oid subid, bool missing_ok);
+extern char *get_statistics_name(Oid stxid);
+extern Oid get_statistics_namespace(Oid stxid);
+
#define type_is_array(typid) (get_element_type(typid) != InvalidOid)
/* type_is_array_domain accepts both plain arrays and domains over arrays */
#define type_is_array_domain(typid) (get_base_element_type(typid) != InvalidOid)
On 3/1/24 01:19, Tatsuro Yamada wrote:
Hi,
This original patch made by Tomas improves the usability of extended statistics,
so I rebased it on 362de947, and I'd like to re-start developing it.The previous thread [1] suggested something to solve. I'll try to solve it as
best I can, but I think this feature is worth it with some limitations.
Please find the attached file.
Thank you for the interest in moving this patch forward. And I agree
it's worth to cut some of the stuff if it's necessary to make it work.
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
Hello Yamada-san,
I finally got to look at this patch again - apologies for taking so
long, I'm well aware it's rather frustrating to wait for feedback. I'll
try to pay more attention to this patch, and don't hesitate to ping me
off-list if immediate input is needed.
I looked at the patch from March 1 [1]/messages/by-id/TYYPR01MB82310B308BA8770838F681619E5E2@TYYPR01MB8231.jpnprd01.prod.outlook.com, which applies mostly with some
minor bitrot, but no major conflicts. A couple review comments:
1) The patch is not added to the CF app, which I think is a mistake. Can
you please add it to the 2024-07 commitfest? Otherwise people may not be
aware of it, won't do reviews etc. It'll require posting a rebased
patch, but should not be a big deal.
2) Not having the patch in a CF also means cfbot is not running tests on
it. Which is unfortunate, because the patch actually has an a bug cfbot
would find - I've noticed it after running the tests through the github
CI, see [2]https://cirrus-ci.com/build/6436352672137216.
FWIW I very much recommend setting up this CI and using it during
development, it turned out to be very valuable for me as it tests on a
range of systems, and more convenient than the rpi5 machines I used for
that purposes before. See src/tools/ci/README for details.
3) The bug has this symptom:
ERROR: unrecognized node type: 268
CONTEXT: PL/pgSQL function check_estimated_rows(text) line 7 ...
STATEMENT: SELECT * FROM check_estimated_rows('SELECT a, b FROM ...
but it only shows on the FreeBSD machine (in CI). But that's simply
because that's running tests with "-DCOPY_PARSE_PLAN_TREES", which
always copies the query plan, to make sure all the nodes can be copied.
And we don't have a copy function for the StatisticExtInfo node (that's
the node 268), so it fails.
FWIW you can have this failure even on a regular build, you just need to
do explain on a prepared statement (with an extended statistics):
CREATE TABLE t (a int, b int);
INSERT INTO t SELECT (i/100), (i/100)
FROM generate_series(1,1000) s(i);
CREATE STATISTICS ON a,b FROM t;
ANALYZE t;
PREPARE ps (INT, INT) AS SELECT * FROM t WHERE a = $1 AND b = $2;
EXPLAIN EXECUTE ps(5,5);
ERROR: unrecognized node type: 268
4) I can think of two basic ways to fix this issue - either allow
copying of the StatisticExtInto node, or represent the information in a
different way (e.g. add a new node for that purpose, or use existing
nodes to do that).
I don't think we should do the first thing - the StatisticExtInfo is in
pathnodes.h, is designed to be used during path construction, has a lot
of fields that we likely don't need to show stuff in explain - but we'd
need to copy those too, and that seems useless / expensive.
So I suggest we invent a new (much simpler) node, tracking only the bits
we actually need for in the explain part. Or alternatively, if you think
adding a separate node is an overkill, maybe we could keep just an OID
of the statistics we applied, and the explain would lookup the name?
But I think having a new node might might also make the patch simpler,
as the new struct could combine the information the patch keeps in three
separate lists. Instead, there's be just one new list in Plan, members
would be the new node type, and each element would be
(statistics OID, list of clauses, flag is_or)
or something like that.
5) In [3]/messages/by-id/459863.1627419001@sss.pgh.pa.us Tom raised two possible issues with doing this - cost of
copying the information, and locking. For the performance concerns, I
think the first thing we should do is measuring how expensive it is. I
suggest measuring the overhead for about three basic cases:
- table with no extended stats
- table with a couple (1-10?) extended stats
- table with a lot of (100?) extended stats
And see the impact of the patch. That is, measure the planning time with
master and with the patch applied. The table size does not matter much,
I think - this should measure just the planning, not execute the query.
In practice the extra costs will get negligible as the execution time
grows. But we're measuring the worst case, so that's fine.
For the locking, I agree with Robert [4]/messages/by-id/CA+TgmoZU34zo4=hyqgLH16iGpHQ6+QAesp7k5a1cfZB=+9xtsw@mail.gmail.com that this probably should not
be an issue - I don't see why would this be different from indexes etc.
But I haven't thought about that too much, so maybe investigate and test
this a bit more (that plans get invalidated if the statistics changes,
and so on).
6) I'm not sure we want to have this under EXPLAIN (VERBOSE). It's what
I did in the initial PoC patch, but maybe we should invent a new flag
for this purpose, otherwise VERBOSE will cover too much stuff? I'm
thinking about "STATS" for example.
This would probably mean the patch should also add a new auto_explain
"log_" flag to enable/disable this.
7) The patch really needs some docs - I'd mention this in the EXPLAIN
docs, probably. There's also a chapter about estimates, maybe that
should mention this too? Try searching for places in the SGML docs
mentioning extended stats and/or explain, I guess.
For tests, I guess stats_ext is the right place to test this. I'm not
sure what's the best way to do this, though. If it's covered by VERBOSE,
that seems it might be unstable - and that would be an issue. But maybe
we might add a function similar to check_estimated_rows(), but verifying
the query used the expected statistics to estimate expected clauses.
But maybe with the new explain "STATS" flag it would be easier, because
we could do EXPLAIN (COSTS OFF, STATS ON) and that would be stable.
As for what needs to be tested, I don't think we need to test how we
match queries/clauses to statistics - that's already tested. It's fine
to focus just on displaying the expected stuff. I'd take a couple of the
existing tests, and check those. And then also add a couple tests for
prepared statements, and invalidation of a plan after an extended
statistics gets dropped, etc.
So there's stuff to do to make this committable, but hopefully this
review gives you some guidance regarding what/how ;-)
regards
[1]: /messages/by-id/TYYPR01MB82310B308BA8770838F681619E5E2@TYYPR01MB8231.jpnprd01.prod.outlook.com
/messages/by-id/TYYPR01MB82310B308BA8770838F681619E5E2@TYYPR01MB8231.jpnprd01.prod.outlook.com
[2]: https://cirrus-ci.com/build/6436352672137216
[3]: /messages/by-id/459863.1627419001@sss.pgh.pa.us
[4]: /messages/by-id/CA+TgmoZU34zo4=hyqgLH16iGpHQ6+QAesp7k5a1cfZB=+9xtsw@mail.gmail.com
/messages/by-id/CA+TgmoZU34zo4=hyqgLH16iGpHQ6+QAesp7k5a1cfZB=+9xtsw@mail.gmail.com
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
Hi Tomas!
Thanks for the comments!
1) The patch is not added to the CF app, which I think is a mistake. Can
you please add it to the 2024-07 commitfest? Otherwise people may not be
aware of it, won't do reviews etc. It'll require posting a rebased
patch, but should not be a big deal.
I added the patch to the 2024-07 commitfest today.
2) Not having the patch in a CF also means cfbot is not running tests on
it. Which is unfortunate, because the patch actually has an a bug cfbot
would find - I've noticed it after running the tests through the github
CI, see [2].
3) The bug has this symptom:
ERROR: unrecognized node type: 268
CONTEXT: PL/pgSQL function check_estimated_rows(text) line 7 ...
STATEMENT: SELECT * FROM check_estimated_rows('SELECT a, b FROM ...
4) I can think of two basic ways to fix this issue - either allow
copying of the StatisticExtInto node, or represent the information in a
different way (e.g. add a new node for that purpose, or use existing
nodes to do that).
Thanks for the info. I'll investigate using cfbot.
To fix the problem, I understand we need to create a new struct like
(statistics OID, list of clauses, flag is_or).
5) In [3] Tom raised two possible issues with doing this - cost of
copying the information, and locking. For the performance concerns, I
think the first thing we should do is measuring how expensive it is. I
suggest measuring the overhead for about three basic cases:
Okay, I'll measure it once the patch is completed and check the overhead.
I read [3][4] and in my opinion I agree with Robert.
As with indexes, there should be a mechanism for determining whether
extended statistics are used or not. If it were available, users would be
able to
tune using extended statistics and get better execution plans.
6) I'm not sure we want to have this under EXPLAIN (VERBOSE). It's what
I did in the initial PoC patch, but maybe we should invent a new flag
for this purpose, otherwise VERBOSE will cover too much stuff? I'm
thinking about "STATS" for example.This would probably mean the patch should also add a new auto_explain
"log_" flag to enable/disable this.
I thought it might be better to do this, so I'll fix it.
7) The patch really needs some docs - I'd mention this in the EXPLAIN
docs, probably. There's also a chapter about estimates, maybe that
should mention this too? Try searching for places in the SGML docs
mentioning extended stats and/or explain, I guess.
I plan to create documentation after the specifications are finalized.
For tests, I guess stats_ext is the right place to test this. I'm not
sure what's the best way to do this, though. If it's covered by VERBOSE,
that seems it might be unstable - and that would be an issue. But maybe
we might add a function similar to check_estimated_rows(), but verifying
the query used the expected statistics to estimate expected clauses.
As for testing, I think it's more convenient for reviewers to include it in
the patch,
so I'm thinking of including it in the next patch.
So there's stuff to do to make this committable, but hopefully this
review gives you some guidance regarding what/how ;-)
Thank you! It helps me a lot!
The attached patch does not correspond to the above comment.
But it does solve some of the issues mentioned in previous threads.
The next patch is planned to include:
6) Add stats option to explain command
8) Add regression test (stats_ext.sql)
4) Add new node (resolve errors in cfbot and prepared statement)
Regards,
Tatsuro Yamada
Show quoted text
[1]
/messages/by-id/TYYPR01MB82310B308BA8770838F681619E5E2@TYYPR01MB8231.jpnprd01.prod.outlook.com
[2] https://cirrus-ci.com/build/6436352672137216
[3]
/messages/by-id/459863.1627419001@sss.pgh.pa.us[4]
/messages/by-id/CA+TgmoZU34zo4=hyqgLH16iGpHQ6+QAesp7k5a1cfZB=+9xtsw@mail.gmail.com
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
Attachments:
0001-show-stats-in-explain-rebased-on-15c9ac36.patchapplication/octet-stream; name=0001-show-stats-in-explain-rebased-on-15c9ac36.patchDownload
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index f1d71bc54e..622aad8394 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -78,6 +78,9 @@ static void show_qual(List *qual, const char *qlabel,
static void show_scan_qual(List *qual, const char *qlabel,
PlanState *planstate, List *ancestors,
ExplainState *es);
+static void show_scan_stats(List *stats, List *clauses, List *ors,
+ PlanState *planstate, List *ancestors,
+ ExplainState *es);
static void show_upper_qual(List *qual, const char *qlabel,
PlanState *planstate, List *ancestors,
ExplainState *es);
@@ -1795,6 +1798,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->verbose)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_IndexOnlyScan:
show_scan_qual(((IndexOnlyScan *) plan)->indexqual,
@@ -1811,10 +1818,18 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (es->analyze)
ExplainPropertyFloat("Heap Fetches", NULL,
planstate->instrument->ntuples2, 0, es);
+ if (es->verbose)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_BitmapIndexScan:
show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig,
"Index Cond", planstate, ancestors, es);
+ if (es->verbose)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_BitmapHeapScan:
show_scan_qual(((BitmapHeapScan *) plan)->bitmapqualorig,
@@ -1844,6 +1859,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->verbose)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_Gather:
{
@@ -2031,6 +2050,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->verbose)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_WindowAgg:
show_upper_qual(plan->qual, "Filter", planstate, ancestors, es);
@@ -2046,6 +2069,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->verbose)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_Sort:
show_sort_keys(castNode(SortState, planstate), ancestors, es);
@@ -2365,6 +2392,76 @@ show_scan_qual(List *qual, const char *qlabel,
show_qual(qual, qlabel, planstate, ancestors, useprefix, es);
}
+/*
+ * Show a generic expression
+ */
+static char *
+deparse_stat_expression(Node *node,
+ PlanState *planstate, List *ancestors,
+ ExplainState *es)
+{
+ List *context;
+
+ /* Set up deparsing context */
+ context = set_deparse_context_plan(es->deparse_cxt,
+ planstate->plan,
+ ancestors);
+
+ /* Deparse the expression */
+ return deparse_expression(node, context, false, false);
+}
+
+/*
+ * Show a qualifier expression (which is a List with implicit AND semantics)
+ */
+static char *
+show_stat_qual(List *qual, bool is_or,
+ PlanState *planstate, List *ancestors,
+ ExplainState *es)
+{
+ Node *node;
+
+ /* No work if empty qual */
+ if (qual == NIL)
+ return NULL;
+
+ /* Convert AND list to explicit AND */
+ if (is_or)
+ node = (Node *) make_ors_explicit(qual);
+ else
+ node = (Node *) make_ands_explicit(qual);
+
+ /* And show it */
+ return deparse_stat_expression(node, planstate, ancestors, es);
+}
+
+/*
+ * Show applied statistics for scan plan node
+ */
+static void
+show_scan_stats(List *stats, List *clauses, List *ors,
+ PlanState *planstate, List *ancestors, ExplainState *es)
+{
+ ListCell *lc1, *lc2, *lc3;
+ StringInfoData str;
+
+ forthree (lc1, stats, lc2, clauses, lc3, ors)
+ {
+ StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(lc1);
+ List *applied_clauses = (List *) lfirst(lc2);
+ int is_or = lfirst_int(lc3);
+
+ initStringInfo(&str);
+
+ appendStringInfo(&str, "%s.%s Clauses: %s",
+ get_namespace_name(get_statistics_namespace(stat->statOid)),
+ get_statistics_name(stat->statOid),
+ show_stat_qual(applied_clauses, is_or, planstate, ancestors, es));
+
+ ExplainPropertyText("Statistics", str.data, es);
+ }
+}
+
/*
* Show a qualifier expression for an upper-level plan node
*/
diff --git a/src/backend/nodes/makefuncs.c b/src/backend/nodes/makefuncs.c
index c6fb571982..3dcfe108fb 100644
--- a/src/backend/nodes/makefuncs.c
+++ b/src/backend/nodes/makefuncs.c
@@ -718,6 +718,17 @@ make_ands_explicit(List *andclauses)
return make_andclause(andclauses);
}
+Expr *
+make_ors_explicit(List *orclauses)
+{
+ if (orclauses == NIL)
+ return (Expr *) makeBoolConst(true, false);
+ else if (list_length(orclauses) == 1)
+ return (Expr *) linitial(orclauses);
+ else
+ return make_orclause(orclauses);
+}
+
List *
make_ands_implicit(Expr *clause)
{
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 34ca6d4ac2..0f26660cbb 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -5380,12 +5380,27 @@ order_qual_clauses(PlannerInfo *root, List *clauses)
static void
copy_generic_path_info(Plan *dest, Path *src)
{
+ ListCell *lc;
+
dest->startup_cost = src->startup_cost;
dest->total_cost = src->total_cost;
dest->plan_rows = src->rows;
dest->plan_width = src->pathtarget->width;
dest->parallel_aware = src->parallel_aware;
dest->parallel_safe = src->parallel_safe;
+
+ dest->applied_stats = src->parent->applied_stats;
+ dest->applied_clauses_or = src->parent->applied_clauses_or;
+
+ dest->applied_clauses = NIL;
+ foreach (lc, src->parent->applied_clauses)
+ {
+ List *clauses = (List *) lfirst(lc);
+
+ dest->applied_clauses
+ = lappend(dest->applied_clauses,
+ maybe_extract_actual_clauses(clauses, false));
+ }
}
/*
diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c
index 5d83f60eb9..fb177d9ac3 100644
--- a/src/backend/optimizer/util/relnode.c
+++ b/src/backend/optimizer/util/relnode.c
@@ -285,6 +285,10 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent)
rel->partexprs = NULL;
rel->nullable_partexprs = NULL;
+ rel->applied_stats = NIL;
+ rel->applied_clauses = NIL;
+ rel->applied_clauses_or = NIL;
+
/*
* Pass assorted information down the inheritance hierarchy.
*/
@@ -746,6 +750,10 @@ build_join_rel(PlannerInfo *root,
joinrel->partexprs = NULL;
joinrel->nullable_partexprs = NULL;
+ joinrel->applied_stats = NIL;
+ joinrel->applied_clauses = NIL;
+ joinrel->applied_clauses_or = NIL;
+
/* Compute information relevant to the foreign relations. */
set_foreign_rel_properties(joinrel, outer_rel, inner_rel);
@@ -939,6 +947,10 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel,
joinrel->partexprs = NULL;
joinrel->nullable_partexprs = NULL;
+ joinrel->applied_stats = NIL;
+ joinrel->applied_clauses = NIL;
+ joinrel->applied_clauses_or = NIL;
+
/* Compute information relevant to foreign relations. */
set_foreign_rel_properties(joinrel, outer_rel, inner_rel);
diff --git a/src/backend/optimizer/util/restrictinfo.c b/src/backend/optimizer/util/restrictinfo.c
index d6d26a2b51..80bf7143f5 100644
--- a/src/backend/optimizer/util/restrictinfo.c
+++ b/src/backend/optimizer/util/restrictinfo.c
@@ -508,6 +508,41 @@ extract_actual_clauses(List *restrictinfo_list,
return result;
}
+/*
+ * maybe_extract_actual_clauses
+ *
+ * Just like extract_actual_clauses, but does not require the clauses to
+ * already be RestrictInfo.
+ *
+ * XXX Does not handle RestrictInfos nested in OR clauses.
+ */
+List *
+maybe_extract_actual_clauses(List *restrictinfo_list,
+ bool pseudoconstant)
+{
+ List *result = NIL;
+ ListCell *l;
+
+ foreach(l, restrictinfo_list)
+ {
+ RestrictInfo *rinfo;
+ Node *node = (Node *) lfirst(l);
+
+ if (!IsA(node, RestrictInfo))
+ {
+ result = lappend(result, node);
+ continue;
+ }
+
+ rinfo = (RestrictInfo *) node;
+
+ if (rinfo->pseudoconstant == pseudoconstant)
+ result = lappend(result, rinfo->clause);
+ }
+
+ return result;
+}
+
/*
* extract_actual_join_clauses
*
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index 7f014a0cbb..5d3f8f54df 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -1858,6 +1858,11 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
list_exprs[listidx] = NULL;
}
+ /* add it to the list of applied stats/clauses */
+ rel->applied_stats = lappend(rel->applied_stats, stat);
+ rel->applied_clauses = lappend(rel->applied_clauses, stat_clauses);
+ rel->applied_clauses_or = lappend_int(rel->applied_clauses_or, (is_or) ? 1 : 0);
+
if (is_or)
{
bool *or_matches = NULL;
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index e11d022827..9e32abc7f9 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -4039,6 +4039,7 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
ListCell *lc2;
Bitmapset *matched = NULL;
AttrNumber attnum_offset;
+ List *matched_exprs = NIL;
/*
* How much we need to offset the attnums? If there are no
@@ -4086,6 +4087,9 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
matched = bms_add_member(matched, attnum);
+ /* track expressions matched by this statistics */
+ matched_exprs = lappend(matched_exprs, varinfo->var);
+
found = true;
}
@@ -4114,6 +4118,9 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
matched = bms_add_member(matched, attnum);
+ /* track expressions matched by this statistics */
+ matched_exprs = lappend(matched_exprs, expr);
+
/* there should be just one matching expression */
break;
}
@@ -4122,6 +4129,10 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
}
}
+ rel->applied_stats = lappend(rel->applied_stats, matched_info);
+ rel->applied_clauses = lappend(rel->applied_clauses, matched_exprs);
+ rel->applied_clauses_or = lappend(rel->applied_clauses_or, 0);
+
/* Find the specific item that exactly matches the combination */
for (i = 0; i < stats->nitems; i++)
{
diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c
index fc6d267e44..d3412f0c2a 100644
--- a/src/backend/utils/cache/lsyscache.c
+++ b/src/backend/utils/cache/lsyscache.c
@@ -33,6 +33,7 @@
#include "catalog/pg_proc.h"
#include "catalog/pg_range.h"
#include "catalog/pg_statistic.h"
+#include "catalog/pg_statistic_ext.h"
#include "catalog/pg_subscription.h"
#include "catalog/pg_transform.h"
#include "catalog/pg_type.h"
@@ -3696,3 +3697,51 @@ get_subscription_name(Oid subid, bool missing_ok)
return subname;
}
+
+/*
+ * get_statistics_name
+ * Returns the name of a given extended statistics
+ *
+ * Returns a palloc'd copy of the string, or NULL if no such namespace.
+ */
+char *
+get_statistics_name(Oid stxid)
+{
+ HeapTuple tp;
+
+ tp = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(stxid));
+ if (HeapTupleIsValid(tp))
+ {
+ Form_pg_statistic_ext stxtup = (Form_pg_statistic_ext) GETSTRUCT(tp);
+ char *result;
+
+ result = pstrdup(NameStr(stxtup->stxname));
+ ReleaseSysCache(tp);
+ return result;
+ }
+ else
+ return NULL;
+}
+
+/*
+ * get_statistics_namespace
+ * Returns the namespace OID of a given extended statistics
+ */
+Oid
+get_statistics_namespace(Oid stxid)
+{
+ HeapTuple tp;
+
+ tp = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(stxid));
+ if (HeapTupleIsValid(tp))
+ {
+ Form_pg_statistic_ext stxtup = (Form_pg_statistic_ext) GETSTRUCT(tp);
+ Oid result;
+
+ result = stxtup->stxnamespace;
+ ReleaseSysCache(tp);
+ return result;
+ }
+ else
+ return InvalidOid;
+}
diff --git a/src/include/nodes/makefuncs.h b/src/include/nodes/makefuncs.h
index 3180703005..6af24e60b7 100644
--- a/src/include/nodes/makefuncs.h
+++ b/src/include/nodes/makefuncs.h
@@ -94,6 +94,8 @@ extern Node *make_and_qual(Node *qual1, Node *qual2);
extern Expr *make_ands_explicit(List *andclauses);
extern List *make_ands_implicit(Expr *clause);
+extern Expr *make_ors_explicit(List *orclauses);
+
extern IndexInfo *makeIndexInfo(int numattrs, int numkeyattrs, Oid amoid,
List *expressions, List *predicates,
bool unique, bool nulls_not_distinct,
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index ed85dc7414..aee7e57541 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -1028,6 +1028,11 @@ typedef struct RelOptInfo
List **partexprs pg_node_attr(read_write_ignore);
/* Nullable partition key expressions */
List **nullable_partexprs pg_node_attr(read_write_ignore);
+
+ /* info about applied extended statistics */
+ List *applied_stats; /* list of StatisticExtInfo */
+ List *applied_clauses; /* list of lists (of clauses) */
+ List *applied_clauses_or; /* are the clauses AND or OR */
} RelOptInfo;
/*
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index d40af8e59f..2ff7acf9d5 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -169,6 +169,11 @@ typedef struct Plan
*/
Bitmapset *extParam;
Bitmapset *allParam;
+
+ /* info about applied statistics */
+ List *applied_stats;
+ List *applied_clauses;
+ List *applied_clauses_or;
} Plan;
/* ----------------
diff --git a/src/include/optimizer/restrictinfo.h b/src/include/optimizer/restrictinfo.h
index e140e619ac..ff1082c4fb 100644
--- a/src/include/optimizer/restrictinfo.h
+++ b/src/include/optimizer/restrictinfo.h
@@ -39,6 +39,8 @@ extern bool restriction_is_securely_promotable(RestrictInfo *restrictinfo,
extern List *get_actual_clauses(List *restrictinfo_list);
extern List *extract_actual_clauses(List *restrictinfo_list,
bool pseudoconstant);
+extern List *maybe_extract_actual_clauses(List *restrictinfo_list,
+ bool pseudoconstant);
extern void extract_actual_join_clauses(List *restrictinfo_list,
Relids joinrelids,
List **joinquals,
diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h
index c22cabdf42..0a85cc38b3 100644
--- a/src/include/utils/lsyscache.h
+++ b/src/include/utils/lsyscache.h
@@ -204,6 +204,9 @@ extern char *get_publication_name(Oid pubid, bool missing_ok);
extern Oid get_subscription_oid(const char *subname, bool missing_ok);
extern char *get_subscription_name(Oid subid, bool missing_ok);
+extern char *get_statistics_name(Oid stxid);
+extern Oid get_statistics_namespace(Oid stxid);
+
#define type_is_array(typid) (get_element_type(typid) != InvalidOid)
/* type_is_array_domain accepts both plain arrays and domains over arrays */
#define type_is_array_domain(typid) (get_base_element_type(typid) != InvalidOid)
Hi Tomas,
The attached patch does not correspond to the above comment.
But it does solve some of the issues mentioned in previous threads.
Oops, I made a mistake sending a patch on my previous email.
Attached patch is the right patch.
Regards,
Tatsuro Yamada
Attachments:
0001-Add-Showing-applied-extended-statistics-in-explain-r3.patchapplication/octet-stream; name=0001-Add-Showing-applied-extended-statistics-in-explain-r3.patchDownload
From 5e78a80ad51b1e9436c43d3422056cdc8d1117d3 Mon Sep 17 00:00:00 2001
From: Tatsuro Yamada <yamatattsu@gmail.com>
Date: Wed, 26 Jun 2024 17:03:34 +0900
Subject: [PATCH] Add Showing applied extended statistics in explain r3.
This patch fixes the following points of issues as Tomas mentioned
before:
1. Previously, clauses in Grouping queries were incorrectly displayed as
And lists, but have now been fixed to display them as comma lists.
2. To reduce overhead, extended statistics information is now tracked
only when the verbose option is selected.
Also, The strings displayed in Explain were replaced with the following:
s/Statistics/Ext Stats/
---
src/backend/commands/explain.c | 114 ++++++++++++++++++++++
src/backend/nodes/makefuncs.c | 11 +++
src/backend/optimizer/plan/createplan.c | 15 +++
src/backend/optimizer/util/relnode.c | 12 +++
src/backend/optimizer/util/restrictinfo.c | 35 +++++++
src/backend/statistics/extended_stats.c | 8 ++
src/backend/utils/adt/selfuncs.c | 15 +++
src/backend/utils/cache/lsyscache.c | 49 ++++++++++
src/include/nodes/makefuncs.h | 2 +
src/include/nodes/parsenodes.h | 3 +
src/include/nodes/pathnodes.h | 5 +
src/include/nodes/plannodes.h | 5 +
src/include/optimizer/restrictinfo.h | 2 +
src/include/utils/lsyscache.h | 3 +
14 files changed, 279 insertions(+)
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 94511a5a02..7e39d37b59 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -94,6 +94,9 @@ static void show_qual(List *qual, const char *qlabel,
static void show_scan_qual(List *qual, const char *qlabel,
PlanState *planstate, List *ancestors,
ExplainState *es);
+static void show_scan_stats(List *stats, List *clauses, List *ors,
+ PlanState *planstate, List *ancestors,
+ ExplainState *es);
static void show_upper_qual(List *qual, const char *qlabel,
PlanState *planstate, List *ancestors,
ExplainState *es);
@@ -483,6 +486,11 @@ standard_ExplainOneQuery(Query *query, int cursorOptions,
if (es->buffers)
bufusage_start = pgBufferUsage;
+
+ /* if this flag is true, applied ext stats are stored */
+ if (es->verbose)
+ query->isExplain = true;
+
INSTR_TIME_SET_CURRENT(planstart);
/* plan the query */
@@ -1975,6 +1983,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->verbose)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_IndexOnlyScan:
show_scan_qual(((IndexOnlyScan *) plan)->indexqual,
@@ -1991,10 +2003,18 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (es->analyze)
ExplainPropertyFloat("Heap Fetches", NULL,
planstate->instrument->ntuples2, 0, es);
+ if (es->verbose)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_BitmapIndexScan:
show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig,
"Index Cond", planstate, ancestors, es);
+ if (es->verbose)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_BitmapHeapScan:
show_scan_qual(((BitmapHeapScan *) plan)->bitmapqualorig,
@@ -2024,6 +2044,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->verbose)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_Gather:
{
@@ -2203,6 +2227,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->verbose)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_WindowAgg:
show_upper_qual(plan->qual, "Filter", planstate, ancestors, es);
@@ -2218,6 +2246,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->verbose)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_Sort:
show_sort_keys(castNode(SortState, planstate), ancestors, es);
@@ -2537,6 +2569,88 @@ show_scan_qual(List *qual, const char *qlabel,
show_qual(qual, qlabel, planstate, ancestors, useprefix, es);
}
+/*
+ * Show a generic expression
+ */
+static char *
+deparse_stat_expression(Node *node,
+ PlanState *planstate, List *ancestors,
+ ExplainState *es)
+{
+ List *context;
+
+ /* Set up deparsing context */
+ context = set_deparse_context_plan(es->deparse_cxt,
+ planstate->plan,
+ ancestors);
+
+ /* Deparse the expression */
+ return deparse_expression(node, context, false, false);
+}
+
+/*
+ * Show a qualifier expression (which is a List with implicit AND semantics)
+ */
+static char *
+show_stat_qual(List *qual, int is_or,
+ PlanState *planstate, List *ancestors,
+ ExplainState *es)
+{
+ Node *node;
+
+ /* No work if empty qual */
+ if (qual == NIL)
+ return NULL;
+
+ /* Convert AND list to explicit AND */
+ switch (is_or)
+ {
+ case 0:
+ node = (Node *) make_ands_explicit(qual);
+ break;
+ case 1:
+ node = (Node *) make_ors_explicit(qual);
+ break;
+ case 2:
+ /* Extended stats for GROUP BY clause should be comma separeted string */
+ node = (Node *) qual;
+ break;
+ default:
+ elog(ERROR, "unexpected value: %d", is_or);
+ break;
+ }
+
+ /* And show it */
+ return deparse_stat_expression(node, planstate, ancestors, es);
+}
+
+/*
+ * Show applied statistics for scan/agg/group plan node
+ */
+static void
+show_scan_stats(List *stats, List *clauses, List *ors,
+ PlanState *planstate, List *ancestors, ExplainState *es)
+{
+ ListCell *lc1, *lc2, *lc3;
+ StringInfoData str;
+
+ forthree (lc1, stats, lc2, clauses, lc3, ors)
+ {
+ StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(lc1);
+ List *applied_clauses = (List *) lfirst(lc2);
+ int is_or = lfirst_int(lc3);
+
+ initStringInfo(&str);
+
+ appendStringInfo(&str, "%s.%s Clauses: %s",
+ get_namespace_name(get_statistics_namespace(stat->statOid)),
+ get_statistics_name(stat->statOid),
+ show_stat_qual(applied_clauses, is_or, planstate, ancestors, es));
+
+ ExplainPropertyText("Ext Stats", str.data, es);
+ }
+}
+
/*
* Show a qualifier expression for an upper-level plan node
*/
diff --git a/src/backend/nodes/makefuncs.c b/src/backend/nodes/makefuncs.c
index 61ac172a85..725c76ab63 100644
--- a/src/backend/nodes/makefuncs.c
+++ b/src/backend/nodes/makefuncs.c
@@ -733,6 +733,17 @@ make_ands_explicit(List *andclauses)
return make_andclause(andclauses);
}
+Expr *
+make_ors_explicit(List *orclauses)
+{
+ if (orclauses == NIL)
+ return (Expr *) makeBoolConst(true, false);
+ else if (list_length(orclauses) == 1)
+ return (Expr *) linitial(orclauses);
+ else
+ return make_orclause(orclauses);
+}
+
List *
make_ands_implicit(Expr *clause)
{
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 6b64c4a362..25541308f4 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -5408,12 +5408,27 @@ order_qual_clauses(PlannerInfo *root, List *clauses)
static void
copy_generic_path_info(Plan *dest, Path *src)
{
+ ListCell *lc;
+
dest->startup_cost = src->startup_cost;
dest->total_cost = src->total_cost;
dest->plan_rows = src->rows;
dest->plan_width = src->pathtarget->width;
dest->parallel_aware = src->parallel_aware;
dest->parallel_safe = src->parallel_safe;
+
+ dest->applied_stats = src->parent->applied_stats;
+ dest->applied_clauses_or = src->parent->applied_clauses_or;
+
+ dest->applied_clauses = NIL;
+ foreach (lc, src->parent->applied_clauses)
+ {
+ List *clauses = (List *) lfirst(lc);
+
+ dest->applied_clauses
+ = lappend(dest->applied_clauses,
+ maybe_extract_actual_clauses(clauses, false));
+ }
}
/*
diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c
index e05b21c884..637f730a8e 100644
--- a/src/backend/optimizer/util/relnode.c
+++ b/src/backend/optimizer/util/relnode.c
@@ -287,6 +287,10 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent)
rel->partexprs = NULL;
rel->nullable_partexprs = NULL;
+ rel->applied_stats = NIL;
+ rel->applied_clauses = NIL;
+ rel->applied_clauses_or = NIL;
+
/*
* Pass assorted information down the inheritance hierarchy.
*/
@@ -769,6 +773,10 @@ build_join_rel(PlannerInfo *root,
joinrel->partexprs = NULL;
joinrel->nullable_partexprs = NULL;
+ joinrel->applied_stats = NIL;
+ joinrel->applied_clauses = NIL;
+ joinrel->applied_clauses_or = NIL;
+
/* Compute information relevant to the foreign relations. */
set_foreign_rel_properties(joinrel, outer_rel, inner_rel);
@@ -963,6 +971,10 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel,
joinrel->partexprs = NULL;
joinrel->nullable_partexprs = NULL;
+ joinrel->applied_stats = NIL;
+ joinrel->applied_clauses = NIL;
+ joinrel->applied_clauses_or = NIL;
+
/* Compute information relevant to foreign relations. */
set_foreign_rel_properties(joinrel, outer_rel, inner_rel);
diff --git a/src/backend/optimizer/util/restrictinfo.c b/src/backend/optimizer/util/restrictinfo.c
index 0b406e9334..23136a330b 100644
--- a/src/backend/optimizer/util/restrictinfo.c
+++ b/src/backend/optimizer/util/restrictinfo.c
@@ -508,6 +508,41 @@ extract_actual_clauses(List *restrictinfo_list,
return result;
}
+/*
+ * maybe_extract_actual_clauses
+ *
+ * Just like extract_actual_clauses, but does not require the clauses to
+ * already be RestrictInfo.
+ *
+ * XXX Does not handle RestrictInfos nested in OR clauses.
+ */
+List *
+maybe_extract_actual_clauses(List *restrictinfo_list,
+ bool pseudoconstant)
+{
+ List *result = NIL;
+ ListCell *l;
+
+ foreach(l, restrictinfo_list)
+ {
+ RestrictInfo *rinfo;
+ Node *node = (Node *) lfirst(l);
+
+ if (!IsA(node, RestrictInfo))
+ {
+ result = lappend(result, node);
+ continue;
+ }
+
+ rinfo = (RestrictInfo *) node;
+
+ if (rinfo->pseudoconstant == pseudoconstant)
+ result = lappend(result, rinfo->clause);
+ }
+
+ return result;
+}
+
/*
* extract_actual_join_clauses
*
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index 99fdf208db..4d92f24c61 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -1857,6 +1857,14 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
list_exprs[listidx] = NULL;
}
+ /* add it to the list of applied stats/clauses, if this flag is true */
+ if (root->parse->isExplain)
+ {
+ rel->applied_stats = lappend(rel->applied_stats, stat);
+ rel->applied_clauses = lappend(rel->applied_clauses, stat_clauses);
+ rel->applied_clauses_or = lappend_int(rel->applied_clauses_or, (is_or) ? 1 : 0);
+ }
+
if (is_or)
{
bool *or_matches = NULL;
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 5f5d7959d8..2aec61e7bb 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -4069,6 +4069,7 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
ListCell *lc2;
Bitmapset *matched = NULL;
AttrNumber attnum_offset;
+ List *matched_exprs = NIL;
/*
* How much we need to offset the attnums? If there are no
@@ -4116,6 +4117,9 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
matched = bms_add_member(matched, attnum);
+ /* track expressions matched by this statistics */
+ matched_exprs = lappend(matched_exprs, varinfo->var);
+
found = true;
}
@@ -4144,6 +4148,9 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
matched = bms_add_member(matched, attnum);
+ /* track expressions matched by this statistics */
+ matched_exprs = lappend(matched_exprs, expr);
+
/* there should be just one matching expression */
break;
}
@@ -4152,6 +4159,14 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
}
}
+ /* add it to the list of applied stats/clauses, if this flag is true */
+ if (root->parse->isExplain)
+ {
+ rel->applied_stats = lappend(rel->applied_stats, matched_info);
+ rel->applied_clauses = lappend(rel->applied_clauses, matched_exprs);
+ rel->applied_clauses_or = lappend_int(rel->applied_clauses_or, 2); /* 2: Use comma to deparse */
+ }
+
/* Find the specific item that exactly matches the combination */
for (i = 0; i < stats->nitems; i++)
{
diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c
index 48a280d089..d135751dd0 100644
--- a/src/backend/utils/cache/lsyscache.c
+++ b/src/backend/utils/cache/lsyscache.c
@@ -35,6 +35,7 @@
#include "catalog/pg_publication.h"
#include "catalog/pg_range.h"
#include "catalog/pg_statistic.h"
+#include "catalog/pg_statistic_ext.h"
#include "catalog/pg_subscription.h"
#include "catalog/pg_transform.h"
#include "catalog/pg_type.h"
@@ -3714,3 +3715,51 @@ get_subscription_name(Oid subid, bool missing_ok)
return subname;
}
+
+/*
+ * get_statistics_name
+ * Returns the name of a given extended statistics
+ *
+ * Returns a palloc'd copy of the string, or NULL if no such namespace.
+ */
+char *
+get_statistics_name(Oid stxid)
+{
+ HeapTuple tp;
+
+ tp = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(stxid));
+ if (HeapTupleIsValid(tp))
+ {
+ Form_pg_statistic_ext stxtup = (Form_pg_statistic_ext) GETSTRUCT(tp);
+ char *result;
+
+ result = pstrdup(NameStr(stxtup->stxname));
+ ReleaseSysCache(tp);
+ return result;
+ }
+ else
+ return NULL;
+}
+
+/*
+ * get_statistics_namespace
+ * Returns the namespace OID of a given extended statistics
+ */
+Oid
+get_statistics_namespace(Oid stxid)
+{
+ HeapTuple tp;
+
+ tp = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(stxid));
+ if (HeapTupleIsValid(tp))
+ {
+ Form_pg_statistic_ext stxtup = (Form_pg_statistic_ext) GETSTRUCT(tp);
+ Oid result;
+
+ result = stxtup->stxnamespace;
+ ReleaseSysCache(tp);
+ return result;
+ }
+ else
+ return InvalidOid;
+}
diff --git a/src/include/nodes/makefuncs.h b/src/include/nodes/makefuncs.h
index 5209d3de89..f5ae402c16 100644
--- a/src/include/nodes/makefuncs.h
+++ b/src/include/nodes/makefuncs.h
@@ -94,6 +94,8 @@ extern Node *make_and_qual(Node *qual1, Node *qual2);
extern Expr *make_ands_explicit(List *andclauses);
extern List *make_ands_implicit(Expr *clause);
+extern Expr *make_ors_explicit(List *orclauses);
+
extern IndexInfo *makeIndexInfo(int numattrs, int numkeyattrs, Oid amoid,
List *expressions, List *predicates,
bool unique, bool nulls_not_distinct,
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index 85a62b538e..36b479c104 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -238,6 +238,9 @@ typedef struct Query
ParseLoc stmt_location;
/* length in bytes; 0 means "rest of string" */
ParseLoc stmt_len pg_node_attr(query_jumble_ignore);
+
+ /* if true, query is explain verbose */
+ bool isExplain;
} Query;
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 2ba297c117..d78f08c3c4 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -1043,6 +1043,11 @@ typedef struct RelOptInfo
List **partexprs pg_node_attr(read_write_ignore);
/* Nullable partition key expressions */
List **nullable_partexprs pg_node_attr(read_write_ignore);
+
+ /* info about applied extended statistics */
+ List *applied_stats; /* list of StatisticExtInfo */
+ List *applied_clauses; /* list of lists of clauses */
+ List *applied_clauses_or; /* are the clauses AND, OR, or Comma */
} RelOptInfo;
/*
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 1aeeaec95e..504e5cc475 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -169,6 +169,11 @@ typedef struct Plan
*/
Bitmapset *extParam;
Bitmapset *allParam;
+
+ /* info about applied statistics */
+ List *applied_stats;
+ List *applied_clauses;
+ List *applied_clauses_or;
} Plan;
/* ----------------
diff --git a/src/include/optimizer/restrictinfo.h b/src/include/optimizer/restrictinfo.h
index 1b42c832c5..29aa519b99 100644
--- a/src/include/optimizer/restrictinfo.h
+++ b/src/include/optimizer/restrictinfo.h
@@ -39,6 +39,8 @@ extern bool restriction_is_securely_promotable(RestrictInfo *restrictinfo,
extern List *get_actual_clauses(List *restrictinfo_list);
extern List *extract_actual_clauses(List *restrictinfo_list,
bool pseudoconstant);
+extern List *maybe_extract_actual_clauses(List *restrictinfo_list,
+ bool pseudoconstant);
extern void extract_actual_join_clauses(List *restrictinfo_list,
Relids joinrelids,
List **joinquals,
diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h
index 20446f6f83..3fab6d5eea 100644
--- a/src/include/utils/lsyscache.h
+++ b/src/include/utils/lsyscache.h
@@ -206,6 +206,9 @@ extern char *get_publication_name(Oid pubid, bool missing_ok);
extern Oid get_subscription_oid(const char *subname, bool missing_ok);
extern char *get_subscription_name(Oid subid, bool missing_ok);
+extern char *get_statistics_name(Oid stxid);
+extern Oid get_statistics_namespace(Oid stxid);
+
#define type_is_array(typid) (get_element_type(typid) != InvalidOid)
/* type_is_array_domain accepts both plain arrays and domains over arrays */
#define type_is_array_domain(typid) (get_base_element_type(typid) != InvalidOid)
--
2.39.3
Hi Tomas and All,
Attached file is a new patch including:
6) Add stats option to explain command
7) The patch really needs some docs (partly)
4) Add new node (resolve errors in cfbot and prepared statement)
I tried adding a new node in pathnode.h, but it doesn't work well.
So, it needs more time to implement it successfully because this is
the first time to add a new node in it.
8) Add regression test (stats_ext.sql)
Actually, I am not yet able to add new test cases to stats_ext.sql.
Instead, I created a simple test (test.sql) and have attached it.
Also, output.txt is the test result.
To add new test cases to stats_ext.sql,
I'd like to decide on a strategy for modifying it. In particular, there are
381 places where the check_estimated_rows function is used, so should I
include the same number of tests, or should we include the bare minimum
of tests that cover the code path? I think only the latter would be fine.
Any advice is appreciated. :-D
P.S.
I'm going to investigate how to use CI this weekend hopefully.
Regards,
Tatsuro Yamada
Attachments:
0001-Add-a-new-option-STATS-to-Explain-command.patchapplication/x-patch; name=0001-Add-a-new-option-STATS-to-Explain-command.patchDownload
From f739a85886cddb160885f22fa5c2bd56c1a23c5c Mon Sep 17 00:00:00 2001
From: Tatsuro Yamada <yamatattsu@gmail.com>
Date: Wed, 26 Jun 2024 17:03:34 +0900
Subject: [PATCH] Add a new option STATS to Explain command
It shows applied extended statistics in Explain command output.
This patch fixes the following points:
- Fix deparsed clauses of Grouping query
Previously, it was incorrectly displayed as Anded-lists. However,
it was fixed to comma lists.
- To reduce overhead, extended statistics information is now tracked
only when the STATS option is selected.
- s/Statistics/Ext Stats/
"Statistics" seemed ambiguous, so I changed it to "Ext Stats".
I'm not sure whether Ext Stats is preferable or not.
- Fix document for the STATS option partly, and todo list is below:
Todo
- sql-createstatistics.html
Add Explain outputs using STATS option
- planner-stats.html
Should I introduce Explain with the STATS option on this page?
---
doc/src/sgml/ref/explain.sgml | 13 +++
src/backend/commands/explain.c | 116 ++++++++++++++++++++++
src/backend/nodes/makefuncs.c | 11 ++
src/backend/optimizer/plan/createplan.c | 15 +++
src/backend/optimizer/util/relnode.c | 12 +++
src/backend/optimizer/util/restrictinfo.c | 35 +++++++
src/backend/statistics/extended_stats.c | 8 ++
src/backend/utils/adt/selfuncs.c | 15 +++
src/backend/utils/cache/lsyscache.c | 49 +++++++++
src/include/commands/explain.h | 1 +
src/include/nodes/makefuncs.h | 2 +
src/include/nodes/parsenodes.h | 3 +
src/include/nodes/pathnodes.h | 5 +
src/include/nodes/plannodes.h | 5 +
src/include/optimizer/restrictinfo.h | 2 +
src/include/utils/lsyscache.h | 3 +
16 files changed, 295 insertions(+)
diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml
index db9d3a8549..b573152b6e 100644
--- a/doc/src/sgml/ref/explain.sgml
+++ b/doc/src/sgml/ref/explain.sgml
@@ -43,6 +43,7 @@ EXPLAIN [ ( <replaceable class="parameter">option</replaceable> [, ...] ) ] <rep
BUFFERS [ <replaceable class="parameter">boolean</replaceable> ]
SERIALIZE [ { NONE | TEXT | BINARY } ]
WAL [ <replaceable class="parameter">boolean</replaceable> ]
+ STATS [ <replaceable class="parameter">boolean</replaceable> ]
TIMING [ <replaceable class="parameter">boolean</replaceable> ]
SUMMARY [ <replaceable class="parameter">boolean</replaceable> ]
MEMORY [ <replaceable class="parameter">boolean</replaceable> ]
@@ -248,6 +249,18 @@ ROLLBACK;
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><literal>STATS</literal></term>
+ <listitem>
+ <para>
+ Include information on applied <literal>Extended statistics</literal>.
+ Specifically, include the names of extended statistics and clauses.
+ See <xref linkend="planner-stats-extended"/> for details about extended
+ statistics. This parameter defaults to <literal>FALSE</literal>.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry>
<term><literal>TIMING</literal></term>
<listitem>
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 94511a5a02..0fd93b52e5 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -94,6 +94,9 @@ static void show_qual(List *qual, const char *qlabel,
static void show_scan_qual(List *qual, const char *qlabel,
PlanState *planstate, List *ancestors,
ExplainState *es);
+static void show_scan_stats(List *stats, List *clauses, List *ors,
+ PlanState *planstate, List *ancestors,
+ ExplainState *es);
static void show_upper_qual(List *qual, const char *qlabel,
PlanState *planstate, List *ancestors,
ExplainState *es);
@@ -210,6 +213,8 @@ ExplainQuery(ParseState *pstate, ExplainStmt *stmt,
es->settings = defGetBoolean(opt);
else if (strcmp(opt->defname, "generic_plan") == 0)
es->generic = defGetBoolean(opt);
+ else if (strcmp(opt->defname, "stats") == 0)
+ es->stats = defGetBoolean(opt);
else if (strcmp(opt->defname, "timing") == 0)
{
timing_set = true;
@@ -483,6 +488,11 @@ standard_ExplainOneQuery(Query *query, int cursorOptions,
if (es->buffers)
bufusage_start = pgBufferUsage;
+
+ /* if this flag is true, applied ext stats are stored */
+ if (es->stats)
+ query->isExplain_Stats = true;
+
INSTR_TIME_SET_CURRENT(planstart);
/* plan the query */
@@ -1975,6 +1985,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->stats)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_IndexOnlyScan:
show_scan_qual(((IndexOnlyScan *) plan)->indexqual,
@@ -1991,10 +2005,18 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (es->analyze)
ExplainPropertyFloat("Heap Fetches", NULL,
planstate->instrument->ntuples2, 0, es);
+ if (es->stats)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_BitmapIndexScan:
show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig,
"Index Cond", planstate, ancestors, es);
+ if (es->stats)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_BitmapHeapScan:
show_scan_qual(((BitmapHeapScan *) plan)->bitmapqualorig,
@@ -2024,6 +2046,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->stats)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_Gather:
{
@@ -2203,6 +2229,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->stats)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_WindowAgg:
show_upper_qual(plan->qual, "Filter", planstate, ancestors, es);
@@ -2218,6 +2248,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->stats)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_Sort:
show_sort_keys(castNode(SortState, planstate), ancestors, es);
@@ -2537,6 +2571,88 @@ show_scan_qual(List *qual, const char *qlabel,
show_qual(qual, qlabel, planstate, ancestors, useprefix, es);
}
+/*
+ * Show a generic expression
+ */
+static char *
+deparse_stat_expression(Node *node,
+ PlanState *planstate, List *ancestors,
+ ExplainState *es)
+{
+ List *context;
+
+ /* Set up deparsing context */
+ context = set_deparse_context_plan(es->deparse_cxt,
+ planstate->plan,
+ ancestors);
+
+ /* Deparse the expression */
+ return deparse_expression(node, context, false, false);
+}
+
+/*
+ * Show a qualifier expression (which is a List with implicit AND semantics)
+ */
+static char *
+show_stat_qual(List *qual, int is_or,
+ PlanState *planstate, List *ancestors,
+ ExplainState *es)
+{
+ Node *node;
+
+ /* No work if empty qual */
+ if (qual == NIL)
+ return NULL;
+
+ /* Convert AND list to explicit AND */
+ switch (is_or)
+ {
+ case 0:
+ node = (Node *) make_ands_explicit(qual);
+ break;
+ case 1:
+ node = (Node *) make_ors_explicit(qual);
+ break;
+ case 2:
+ /* Extended stats for GROUP BY clause should be comma separeted string */
+ node = (Node *) qual;
+ break;
+ default:
+ elog(ERROR, "unexpected value: %d", is_or);
+ break;
+ }
+
+ /* And show it */
+ return deparse_stat_expression(node, planstate, ancestors, es);
+}
+
+/*
+ * Show applied statistics for scan/agg/group plan node
+ */
+static void
+show_scan_stats(List *stats, List *clauses, List *ors,
+ PlanState *planstate, List *ancestors, ExplainState *es)
+{
+ ListCell *lc1, *lc2, *lc3;
+ StringInfoData str;
+
+ forthree (lc1, stats, lc2, clauses, lc3, ors)
+ {
+ StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(lc1);
+ List *applied_clauses = (List *) lfirst(lc2);
+ int is_or = lfirst_int(lc3);
+
+ initStringInfo(&str);
+
+ appendStringInfo(&str, "%s.%s Clauses: %s",
+ get_namespace_name(get_statistics_namespace(stat->statOid)),
+ get_statistics_name(stat->statOid),
+ show_stat_qual(applied_clauses, is_or, planstate, ancestors, es));
+
+ ExplainPropertyText("Ext Stats", str.data, es);
+ }
+}
+
/*
* Show a qualifier expression for an upper-level plan node
*/
diff --git a/src/backend/nodes/makefuncs.c b/src/backend/nodes/makefuncs.c
index 61ac172a85..725c76ab63 100644
--- a/src/backend/nodes/makefuncs.c
+++ b/src/backend/nodes/makefuncs.c
@@ -733,6 +733,17 @@ make_ands_explicit(List *andclauses)
return make_andclause(andclauses);
}
+Expr *
+make_ors_explicit(List *orclauses)
+{
+ if (orclauses == NIL)
+ return (Expr *) makeBoolConst(true, false);
+ else if (list_length(orclauses) == 1)
+ return (Expr *) linitial(orclauses);
+ else
+ return make_orclause(orclauses);
+}
+
List *
make_ands_implicit(Expr *clause)
{
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 6b64c4a362..25541308f4 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -5408,12 +5408,27 @@ order_qual_clauses(PlannerInfo *root, List *clauses)
static void
copy_generic_path_info(Plan *dest, Path *src)
{
+ ListCell *lc;
+
dest->startup_cost = src->startup_cost;
dest->total_cost = src->total_cost;
dest->plan_rows = src->rows;
dest->plan_width = src->pathtarget->width;
dest->parallel_aware = src->parallel_aware;
dest->parallel_safe = src->parallel_safe;
+
+ dest->applied_stats = src->parent->applied_stats;
+ dest->applied_clauses_or = src->parent->applied_clauses_or;
+
+ dest->applied_clauses = NIL;
+ foreach (lc, src->parent->applied_clauses)
+ {
+ List *clauses = (List *) lfirst(lc);
+
+ dest->applied_clauses
+ = lappend(dest->applied_clauses,
+ maybe_extract_actual_clauses(clauses, false));
+ }
}
/*
diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c
index e05b21c884..637f730a8e 100644
--- a/src/backend/optimizer/util/relnode.c
+++ b/src/backend/optimizer/util/relnode.c
@@ -287,6 +287,10 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent)
rel->partexprs = NULL;
rel->nullable_partexprs = NULL;
+ rel->applied_stats = NIL;
+ rel->applied_clauses = NIL;
+ rel->applied_clauses_or = NIL;
+
/*
* Pass assorted information down the inheritance hierarchy.
*/
@@ -769,6 +773,10 @@ build_join_rel(PlannerInfo *root,
joinrel->partexprs = NULL;
joinrel->nullable_partexprs = NULL;
+ joinrel->applied_stats = NIL;
+ joinrel->applied_clauses = NIL;
+ joinrel->applied_clauses_or = NIL;
+
/* Compute information relevant to the foreign relations. */
set_foreign_rel_properties(joinrel, outer_rel, inner_rel);
@@ -963,6 +971,10 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel,
joinrel->partexprs = NULL;
joinrel->nullable_partexprs = NULL;
+ joinrel->applied_stats = NIL;
+ joinrel->applied_clauses = NIL;
+ joinrel->applied_clauses_or = NIL;
+
/* Compute information relevant to foreign relations. */
set_foreign_rel_properties(joinrel, outer_rel, inner_rel);
diff --git a/src/backend/optimizer/util/restrictinfo.c b/src/backend/optimizer/util/restrictinfo.c
index 0b406e9334..23136a330b 100644
--- a/src/backend/optimizer/util/restrictinfo.c
+++ b/src/backend/optimizer/util/restrictinfo.c
@@ -508,6 +508,41 @@ extract_actual_clauses(List *restrictinfo_list,
return result;
}
+/*
+ * maybe_extract_actual_clauses
+ *
+ * Just like extract_actual_clauses, but does not require the clauses to
+ * already be RestrictInfo.
+ *
+ * XXX Does not handle RestrictInfos nested in OR clauses.
+ */
+List *
+maybe_extract_actual_clauses(List *restrictinfo_list,
+ bool pseudoconstant)
+{
+ List *result = NIL;
+ ListCell *l;
+
+ foreach(l, restrictinfo_list)
+ {
+ RestrictInfo *rinfo;
+ Node *node = (Node *) lfirst(l);
+
+ if (!IsA(node, RestrictInfo))
+ {
+ result = lappend(result, node);
+ continue;
+ }
+
+ rinfo = (RestrictInfo *) node;
+
+ if (rinfo->pseudoconstant == pseudoconstant)
+ result = lappend(result, rinfo->clause);
+ }
+
+ return result;
+}
+
/*
* extract_actual_join_clauses
*
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index 99fdf208db..dbf921118e 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -1857,6 +1857,14 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
list_exprs[listidx] = NULL;
}
+ /* add it to the list of applied stats/clauses, if this flag is true */
+ if (root->parse->isExplain_Stats)
+ {
+ rel->applied_stats = lappend(rel->applied_stats, stat);
+ rel->applied_clauses = lappend(rel->applied_clauses, stat_clauses);
+ rel->applied_clauses_or = lappend_int(rel->applied_clauses_or, (is_or) ? 1 : 0);
+ }
+
if (is_or)
{
bool *or_matches = NULL;
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 5f5d7959d8..21feb92a58 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -4069,6 +4069,7 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
ListCell *lc2;
Bitmapset *matched = NULL;
AttrNumber attnum_offset;
+ List *matched_exprs = NIL;
/*
* How much we need to offset the attnums? If there are no
@@ -4116,6 +4117,9 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
matched = bms_add_member(matched, attnum);
+ /* track expressions matched by this statistics */
+ matched_exprs = lappend(matched_exprs, varinfo->var);
+
found = true;
}
@@ -4144,6 +4148,9 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
matched = bms_add_member(matched, attnum);
+ /* track expressions matched by this statistics */
+ matched_exprs = lappend(matched_exprs, expr);
+
/* there should be just one matching expression */
break;
}
@@ -4152,6 +4159,14 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
}
}
+ /* add it to the list of applied stats/clauses, if this flag is true */
+ if (root->parse->isExplain_Stats)
+ {
+ rel->applied_stats = lappend(rel->applied_stats, matched_info);
+ rel->applied_clauses = lappend(rel->applied_clauses, matched_exprs);
+ rel->applied_clauses_or = lappend_int(rel->applied_clauses_or, 2); /* 2: Use comma to deparse */
+ }
+
/* Find the specific item that exactly matches the combination */
for (i = 0; i < stats->nitems; i++)
{
diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c
index 48a280d089..d135751dd0 100644
--- a/src/backend/utils/cache/lsyscache.c
+++ b/src/backend/utils/cache/lsyscache.c
@@ -35,6 +35,7 @@
#include "catalog/pg_publication.h"
#include "catalog/pg_range.h"
#include "catalog/pg_statistic.h"
+#include "catalog/pg_statistic_ext.h"
#include "catalog/pg_subscription.h"
#include "catalog/pg_transform.h"
#include "catalog/pg_type.h"
@@ -3714,3 +3715,51 @@ get_subscription_name(Oid subid, bool missing_ok)
return subname;
}
+
+/*
+ * get_statistics_name
+ * Returns the name of a given extended statistics
+ *
+ * Returns a palloc'd copy of the string, or NULL if no such namespace.
+ */
+char *
+get_statistics_name(Oid stxid)
+{
+ HeapTuple tp;
+
+ tp = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(stxid));
+ if (HeapTupleIsValid(tp))
+ {
+ Form_pg_statistic_ext stxtup = (Form_pg_statistic_ext) GETSTRUCT(tp);
+ char *result;
+
+ result = pstrdup(NameStr(stxtup->stxname));
+ ReleaseSysCache(tp);
+ return result;
+ }
+ else
+ return NULL;
+}
+
+/*
+ * get_statistics_namespace
+ * Returns the namespace OID of a given extended statistics
+ */
+Oid
+get_statistics_namespace(Oid stxid)
+{
+ HeapTuple tp;
+
+ tp = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(stxid));
+ if (HeapTupleIsValid(tp))
+ {
+ Form_pg_statistic_ext stxtup = (Form_pg_statistic_ext) GETSTRUCT(tp);
+ Oid result;
+
+ result = stxtup->stxnamespace;
+ ReleaseSysCache(tp);
+ return result;
+ }
+ else
+ return InvalidOid;
+}
diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h
index 9b8b351d9a..f9da3c6f7c 100644
--- a/src/include/commands/explain.h
+++ b/src/include/commands/explain.h
@@ -55,6 +55,7 @@ typedef struct ExplainState
bool memory; /* print planner's memory usage information */
bool settings; /* print modified settings */
bool generic; /* generate a generic plan */
+ bool stats; /* generate a generic plan */
ExplainSerializeOption serialize; /* serialize the query's output? */
ExplainFormat format; /* output format */
/* state for output formatting --- not reset for each new plan tree */
diff --git a/src/include/nodes/makefuncs.h b/src/include/nodes/makefuncs.h
index 5209d3de89..f5ae402c16 100644
--- a/src/include/nodes/makefuncs.h
+++ b/src/include/nodes/makefuncs.h
@@ -94,6 +94,8 @@ extern Node *make_and_qual(Node *qual1, Node *qual2);
extern Expr *make_ands_explicit(List *andclauses);
extern List *make_ands_implicit(Expr *clause);
+extern Expr *make_ors_explicit(List *orclauses);
+
extern IndexInfo *makeIndexInfo(int numattrs, int numkeyattrs, Oid amoid,
List *expressions, List *predicates,
bool unique, bool nulls_not_distinct,
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index 85a62b538e..d148aabad4 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -238,6 +238,9 @@ typedef struct Query
ParseLoc stmt_location;
/* length in bytes; 0 means "rest of string" */
ParseLoc stmt_len pg_node_attr(query_jumble_ignore);
+
+ /* if true, query is explain with stats option */
+ bool isExplain_Stats;
} Query;
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 2ba297c117..d78f08c3c4 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -1043,6 +1043,11 @@ typedef struct RelOptInfo
List **partexprs pg_node_attr(read_write_ignore);
/* Nullable partition key expressions */
List **nullable_partexprs pg_node_attr(read_write_ignore);
+
+ /* info about applied extended statistics */
+ List *applied_stats; /* list of StatisticExtInfo */
+ List *applied_clauses; /* list of lists of clauses */
+ List *applied_clauses_or; /* are the clauses AND, OR, or Comma */
} RelOptInfo;
/*
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 1aeeaec95e..504e5cc475 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -169,6 +169,11 @@ typedef struct Plan
*/
Bitmapset *extParam;
Bitmapset *allParam;
+
+ /* info about applied statistics */
+ List *applied_stats;
+ List *applied_clauses;
+ List *applied_clauses_or;
} Plan;
/* ----------------
diff --git a/src/include/optimizer/restrictinfo.h b/src/include/optimizer/restrictinfo.h
index 1b42c832c5..29aa519b99 100644
--- a/src/include/optimizer/restrictinfo.h
+++ b/src/include/optimizer/restrictinfo.h
@@ -39,6 +39,8 @@ extern bool restriction_is_securely_promotable(RestrictInfo *restrictinfo,
extern List *get_actual_clauses(List *restrictinfo_list);
extern List *extract_actual_clauses(List *restrictinfo_list,
bool pseudoconstant);
+extern List *maybe_extract_actual_clauses(List *restrictinfo_list,
+ bool pseudoconstant);
extern void extract_actual_join_clauses(List *restrictinfo_list,
Relids joinrelids,
List **joinquals,
diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h
index 20446f6f83..3fab6d5eea 100644
--- a/src/include/utils/lsyscache.h
+++ b/src/include/utils/lsyscache.h
@@ -206,6 +206,9 @@ extern char *get_publication_name(Oid pubid, bool missing_ok);
extern Oid get_subscription_oid(const char *subname, bool missing_ok);
extern char *get_subscription_name(Oid subid, bool missing_ok);
+extern char *get_statistics_name(Oid stxid);
+extern Oid get_statistics_namespace(Oid stxid);
+
#define type_is_array(typid) (get_element_type(typid) != InvalidOid)
/* type_is_array_domain accepts both plain arrays and domains over arrays */
#define type_is_array_domain(typid) (get_base_element_type(typid) != InvalidOid)
--
2.39.3
Hi,
Thanks for working the feature. As a user, I find it useful, and I'd like to use
it in v18! Although I've just started start looking into it, I have a few questions.
(1)
Is it better to make the order of output consistent? For example, even
though there are three clauses shown in the below case, the order does not
match.
* "Filter" shows that "id1" is first.
* "Ext Stats" shows that "id2" is first.
-- An example
DROP TABLE IF EXISTS test;
CREATE TABLE test (id1 int2, id2 int4, id3 int8, value varchar(32));
INSERT INTO test (SELECT i%11, i%103, i%1009, 'hello' FROM generate_series(1,1000000) s(i));
create statistics test_s1 on id1, id2 from test; analyze;
=# EXPLAIN (STATS) SELECT * FROM test WHERE id1 = 1 AND (id2 = 2 OR id2 > 10);
QUERY PLAN
---------------------------------------------------------------------------------------
Gather (cost=1000.00..23092.77 rows=84311 width=20)
Workers Planned: 2
-> Parallel Seq Scan on test (cost=0.00..13661.67 rows=35130 width=20)
Filter: ((id1 = 1) AND ((id2 = 2) OR (id2 > 10))) -- here
Ext Stats: public.test_s1 Clauses: (((id2 = 2) OR (id2 > 10)) AND (id1 = 1)) -- here
(5 rows)
(2)
Do we really need the schema names without VERBOSE option? As in the above case,
"Ext Stats" shows schema name "public", even though the table name "test" isn't
shown with its schema name.
Additionally, if the VERBOSE option is specified, should the column names also be
printed with namespace?
=# EXPLAIN (VERBOSE, STATS) SELECT * FROM test WHERE id1 = 1 AND (id2 = 2 OR id2 > 10);
QUERY PLAN
---------------------------------------------------------------------------------------
Gather (cost=1000.00..22947.37 rows=82857 width=20)
Output: id1, id2, id3, value
Workers Planned: 2
-> Parallel Seq Scan on public.test (cost=0.00..13661.67 rows=34524 width=20)
Output: id1, id2, id3, value
Filter: ((test.id1 = 1) AND ((test.id2 = 2) OR (test.id2 > 10)))
Ext Stats: public.test_s1 Clauses: (((id2 = 2) OR (id2 > 10)) AND (id1 = 1)) -- here
(7 rows)
(3)
I might be misunderstanding something, but do we need the clauses? Is there any
case where users would want to know the clauses? For example, wouldn't the
following be sufficient?
Ext Stats: id1, id2 using test_s1
(4)
The extended statistics with "dependencies" or "ndistinct" option don't seem to
be shown in EXPLAIN output. Am I missing something? (Is this expected?)
I tested the examples in the documentation. Although it might work with
"mcv" option, I can't confirm that it works because "unrecognized node type"
error occurred in my environment.
https://www.postgresql.org/docs/current/sql-createstatistics.html
(It might be wrong since I'm beginner with extended stats codes.)
IIUC, the reason is that the patch only handles statext_mcv_clauselist_selectivity(),
and doesn't handle dependencies_clauselist_selectivity() and estimate_multivariate_ndistinct().
-- doesn't work with "dependencies" option?
=# \dX
List of extended statistics
Schema | Name | Definition | Ndistinct | Dependencies | MCV
--------+---------+--------------------+-----------+--------------+---------
public | s1 | a, b FROM t1 | (null) | defined | (null)
(2 rows)
=# EXPLAIN (STATS, ANALYZE) SELECT * FROM t1 WHERE (a = 1) AND (b = 0);
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------
Gather (cost=1000.00..11685.00 rows=100 width=8) (actual time=0.214..50.327 rows=100 loops=1)
Workers Planned: 2
Workers Launched: 2
-> Parallel Seq Scan on t1 (cost=0.00..10675.00 rows=42 width=8) (actual time=30.300..46.610 rows=33 loops=3)
Filter: ((a = 1) AND (b = 0))
Rows Removed by Filter: 333300
Planning Time: 0.246 ms
Execution Time: 50.361 ms
(8 rows)
-- doesn't work with "ndistinct"?
=# \dX
List of extended statistics
Schema | Name | Definition | Ndistinct | Dependencies | MCV
--------+------+------------------------------------------------------------------+-----------+--------------+--------
public | s3 | date_trunc('month'::text, a), date_trunc('day'::text, a) FROM t3 | defined | (null) | (null)
(1 row)
postgres(437635)=# EXPLAIN (STATS, ANALYZE) SELECT * FROM t3
WHERE date_trunc('month', a) = '2020-01-01'::timestamp;
QUERY PLAN
----------------------------------------------------------------------------------------------------------
Seq Scan on t3 (cost=0.00..10210.01 rows=45710 width=8) (actual time=0.027..143.199 rows=44640 loops=1)
Filter: (date_trunc('month'::text, a) = '2020-01-01 00:00:00'::timestamp without time zone)
Rows Removed by Filter: 480961
Planning Time: 0.088 ms
Execution Time: 144.590 ms
(5 rows)
-- doesn't work with "mvc". It might work, but the error happens in my environments
=# \dX
List of extended statistics
Schema | Name | Definition | Ndistinct | Dependencies | MCV
--------+------+--------------+-----------+--------------+---------
public | s2 | a, b FROM t2 | (null) | (null) | defined
(1 row)
-- I encountered the error with the query.
=# EXPLAIN (STATS, ANALYZE) SELECT * FROM t2 WHERE (a = 1) AND (b = 1);
ERROR: unrecognized node type: 268
Regards,
--
Masahiro Ikeda
NTT DATA CORPORATION
On 6/26/24 11:06, Tatsuro Yamada wrote:
Hi Tomas!
Thanks for the comments!
1) The patch is not added to the CF app, which I think is a mistake. Can
you please add it to the 2024-07 commitfest? Otherwise people may not be
aware of it, won't do reviews etc. It'll require posting a rebased
patch, but should not be a big deal.I added the patch to the 2024-07 commitfest today.
2) Not having the patch in a CF also means cfbot is not running tests on
it. Which is unfortunate, because the patch actually has an a bug cfbot
would find - I've noticed it after running the tests through the github
CI, see [2].
3) The bug has this symptom:
ERROR: unrecognized node type: 268
CONTEXT: PL/pgSQL function check_estimated_rows(text) line 7 ...
STATEMENT: SELECT * FROM check_estimated_rows('SELECT a, b FROM ...
4) I can think of two basic ways to fix this issue - either allow
copying of the StatisticExtInto node, or represent the information in a
different way (e.g. add a new node for that purpose, or use existing
nodes to do that).Thanks for the info. I'll investigate using cfbot.
To fix the problem, I understand we need to create a new struct like
(statistics OID, list of clauses, flag is_or).
Yes, something like that, in the plannodes.h.
5) In [3] Tom raised two possible issues with doing this - cost of
copying the information, and locking. For the performance concerns, I
think the first thing we should do is measuring how expensive it is. I
suggest measuring the overhead for about three basic cases:Okay, I'll measure it once the patch is completed and check the overhead.
I read [3][4] and in my opinion I agree with Robert.
As with indexes, there should be a mechanism for determining whether
extended statistics are used or not. If it were available, users would be
able to
tune using extended statistics and get better execution plans.
I do agree with that, but I also understand Tom's concerns about the
costs. His concern is that to make this work, we have to keep/copy the
information for all queries, even if that user never does explain.
Yes, we do the same thing (copy of some pieces) for indexes, and from
this point of view it's equally reasonable. But there's the difference
that for indexes it's always been done this way, hence it's considered
"the baseline", while for extended stats we've not copied the data until
this patch, so it'd be seen as a regression.
I think there are two ways to deal with this - ideally, we'd show that
the overhead is negligible (~noise). And if it's measurable, we'd need
to argue that it's worth it - but that's much harder, IMHO.
So I'd suggest you try to measure the overhead on a couple cases (simple
query with 0 or more statistics applied).
6) I'm not sure we want to have this under EXPLAIN (VERBOSE). It's what
I did in the initial PoC patch, but maybe we should invent a new flag
for this purpose, otherwise VERBOSE will cover too much stuff? I'm
thinking about "STATS" for example.This would probably mean the patch should also add a new auto_explain
"log_" flag to enable/disable this.I thought it might be better to do this, so I'll fix it.
OK
7) The patch really needs some docs - I'd mention this in the EXPLAIN
docs, probably. There's also a chapter about estimates, maybe that
should mention this too? Try searching for places in the SGML docs
mentioning extended stats and/or explain, I guess.I plan to create documentation after the specifications are finalized.
I'm, not sure that's a good approach. Maybe it doesn't need to be
mentioned in the section explaining how estimates work, but it'd be good
to have it at least in the EXPLAIN command docs. The thing is - docs are
a nice way for reviewers to learn about how the feature is expected to
work / be used. Yes, it may need to be adjusted if the patch changes,
but it's likely much easier than changing the code.
For tests, I guess stats_ext is the right place to test this. I'm not
sure what's the best way to do this, though. If it's covered by VERBOSE,
that seems it might be unstable - and that would be an issue. But maybe
we might add a function similar to check_estimated_rows(), but verifying
the query used the expected statistics to estimate expected clauses.As for testing, I think it's more convenient for reviewers to include it in
the patch,
so I'm thinking of including it in the next patch.
I'm not sure I understand what you mean - what is more convenient to
include in the patch & you plan to include in the next patch version?
My opinion is that there clearly need to be some regression tests, be it
in stats_ext.sql or in some other script. But to make it easier, we
might have a function similar to check_estimated_rows() which would
extract just the interesting part of the plan.
So there's stuff to do to make this committable, but hopefully this
review gives you some guidance regarding what/how ;-)
Thank you! It helps me a lot!
The attached patch does not correspond to the above comment.
But it does solve some of the issues mentioned in previous threads.The next patch is planned to include:
6) Add stats option to explain command
8) Add regression test (stats_ext.sql)
4) Add new node (resolve errors in cfbot and prepared statement)
Sounds good.
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On 6/28/24 13:16, Tatsuro Yamada wrote:
Hi Tomas and All,
Attached file is a new patch including:
6) Add stats option to explain command
7) The patch really needs some docs (partly)4) Add new node (resolve errors in cfbot and prepared statement)
I tried adding a new node in pathnode.h, but it doesn't work well.
So, it needs more time to implement it successfully because this is
the first time to add a new node in it.
I'm not sure why it didn't work well, and I haven't tried adding the
struct myself so I might be missing something important, but m
assumption was the new struct would go to plannodes.h. The planning
works in phases:
parse -> build Path nodes -> pick cheapest Path -> create Plan
and it's the Plan that is printed by EXPLAIN. The pathnodes.h and
plannodes.h match this, so if it's expected to be in Plan it should go
to plannodes.h I think.
8) Add regression test (stats_ext.sql)
Actually, I am not yet able to add new test cases to stats_ext.sql.
Why is that not possible? Can you explain?
Instead, I created a simple test (test.sql) and have attached it.
Also, output.txt is the test result.To add new test cases to stats_ext.sql,
I'd like to decide on a strategy for modifying it. In particular, there are
381 places where the check_estimated_rows function is used, so should I
include the same number of tests, or should we include the bare minimum
of tests that cover the code path? I think only the latter would be fine.
Any advice is appreciated. :-D
I don't understand. My suggestion was to create a new function, similar
to check_estimated_rows(), that's get a query, do EXPLAIN and extract
the list of applied statistics. Similar to what check_estimated_rows()
does for number of rows.
I did not mean to suggest you modify check_estimated_rows() to extract
both the number of rows and statistics, nor to modify the existing tests
(that's not very useful, because there's only one extended statistics in
each of those tests, and by testing the estimate we implicitly test that
it's applied).
My suggestion is to add a couple new queries, with multiple statistics
and multiple clauses etc. And then test the patch on those. You could do
simple EXPLAIN (COSTS OFF), or add the new function to make it a bit
more stable (but maybe it's not worth it).
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
Hi,
Let me share my opinion on those questions ...
On 7/12/24 12:09, Masahiro.Ikeda@nttdata.com wrote:
Hi,
Thanks for working the feature. As a user, I find it useful, and I'd like to use
it in v18! Although I've just started start looking into it, I have a few questions.(1)
Is it better to make the order of output consistent? For example, even
though there are three clauses shown in the below case, the order does not
match.
* "Filter" shows that "id1" is first.
* "Ext Stats" shows that "id2" is first.-- An example
DROP TABLE IF EXISTS test;
CREATE TABLE test (id1 int2, id2 int4, id3 int8, value varchar(32));
INSERT INTO test (SELECT i%11, i%103, i%1009, 'hello' FROM generate_series(1,1000000) s(i));
create statistics test_s1 on id1, id2 from test; analyze;=# EXPLAIN (STATS) SELECT * FROM test WHERE id1 = 1 AND (id2 = 2 OR id2 > 10);
QUERY PLAN
---------------------------------------------------------------------------------------
Gather (cost=1000.00..23092.77 rows=84311 width=20)
Workers Planned: 2
-> Parallel Seq Scan on test (cost=0.00..13661.67 rows=35130 width=20)
Filter: ((id1 = 1) AND ((id2 = 2) OR (id2 > 10))) -- here
Ext Stats: public.test_s1 Clauses: (((id2 = 2) OR (id2 > 10)) AND (id1 = 1)) -- here
(5 rows)
I don't think we need to make the order consistent. It probably wouldn't
hurt, but I'm not sure it's even possible for all scan types - for
example in an index scan, the clauses might be split between index
conditions and filters, etc.
(2)
Do we really need the schema names without VERBOSE option? As in the above case,
"Ext Stats" shows schema name "public", even though the table name "test" isn't
shown with its schema name.Additionally, if the VERBOSE option is specified, should the column names also be
printed with namespace?=# EXPLAIN (VERBOSE, STATS) SELECT * FROM test WHERE id1 = 1 AND (id2 = 2 OR id2 > 10);
QUERY PLAN
---------------------------------------------------------------------------------------
Gather (cost=1000.00..22947.37 rows=82857 width=20)
Output: id1, id2, id3, value
Workers Planned: 2
-> Parallel Seq Scan on public.test (cost=0.00..13661.67 rows=34524 width=20)
Output: id1, id2, id3, value
Filter: ((test.id1 = 1) AND ((test.id2 = 2) OR (test.id2 > 10)))
Ext Stats: public.test_s1 Clauses: (((id2 = 2) OR (id2 > 10)) AND (id1 = 1)) -- here
(7 rows)
Yeah, I don't think there's a good reason to force printing schema for
the statistics, if it's not needed for the table. The rules should be
the same, I think.
(3)
I might be misunderstanding something, but do we need the clauses? Is there any
case where users would want to know the clauses? For example, wouldn't the
following be sufficient?Ext Stats: id1, id2 using test_s1
The stats may overlap, and some clauses may be matching multiple of
them. And some statistics do not support all clause types (e.g.
functional dependencies work only with equality conditions). Yes, you
might deduce which statistics are used for which clause, but it's not
trivial - interpreting explain is already not trivial, let's not make it
harder.
(If tracking the exact clauses turns out to be expensive, we might
revisit this - it might make it cheaper).
(4)
The extended statistics with "dependencies" or "ndistinct" option don't seem to
be shown in EXPLAIN output. Am I missing something? (Is this expected?)I tested the examples in the documentation. Although it might work with
"mcv" option, I can't confirm that it works because "unrecognized node type"
error occurred in my environment.
https://www.postgresql.org/docs/current/sql-createstatistics.html(It might be wrong since I'm beginner with extended stats codes.)
IIUC, the reason is that the patch only handles statext_mcv_clauselist_selectivity(),
and doesn't handle dependencies_clauselist_selectivity() and estimate_multivariate_ndistinct().-- doesn't work with "dependencies" option?
=# \dX
List of extended statistics
Schema | Name | Definition | Ndistinct | Dependencies | MCV
--------+---------+--------------------+-----------+--------------+---------
public | s1 | a, b FROM t1 | (null) | defined | (null)
(2 rows)=# EXPLAIN (STATS, ANALYZE) SELECT * FROM t1 WHERE (a = 1) AND (b = 0);
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------
Gather (cost=1000.00..11685.00 rows=100 width=8) (actual time=0.214..50.327 rows=100 loops=1)
Workers Planned: 2
Workers Launched: 2
-> Parallel Seq Scan on t1 (cost=0.00..10675.00 rows=42 width=8) (actual time=30.300..46.610 rows=33 loops=3)
Filter: ((a = 1) AND (b = 0))
Rows Removed by Filter: 333300
Planning Time: 0.246 ms
Execution Time: 50.361 ms
(8 rows)-- doesn't work with "ndistinct"?
=# \dX
List of extended statistics
Schema | Name | Definition | Ndistinct | Dependencies | MCV
--------+------+------------------------------------------------------------------+-----------+--------------+--------
public | s3 | date_trunc('month'::text, a), date_trunc('day'::text, a) FROM t3 | defined | (null) | (null)
(1 row)postgres(437635)=# EXPLAIN (STATS, ANALYZE) SELECT * FROM t3
WHERE date_trunc('month', a) = '2020-01-01'::timestamp;
QUERY PLAN
----------------------------------------------------------------------------------------------------------
Seq Scan on t3 (cost=0.00..10210.01 rows=45710 width=8) (actual time=0.027..143.199 rows=44640 loops=1)
Filter: (date_trunc('month'::text, a) = '2020-01-01 00:00:00'::timestamp without time zone)
Rows Removed by Filter: 480961
Planning Time: 0.088 ms
Execution Time: 144.590 ms
(5 rows)-- doesn't work with "mvc". It might work, but the error happens in my environments
=# \dX
List of extended statistics
Schema | Name | Definition | Ndistinct | Dependencies | MCV
--------+------+--------------+-----------+--------------+---------
public | s2 | a, b FROM t2 | (null) | (null) | defined
(1 row)-- I encountered the error with the query.
=# EXPLAIN (STATS, ANALYZE) SELECT * FROM t2 WHERE (a = 1) AND (b = 1);
ERROR: unrecognized node type: 268
Yes, you're right we don't show some stats. For dependencies there's the
problem that we don't apply them individually, so it's not really
possible to map clauses to individual stats. I wonder if we might have a
special "entry" to show clauses estimated by the functional dependencies
combined from all stats (instead of a particular statistics).
For ndistinct, I think we don't show this because it doesn't go through
clauselist_selectivity, which is the only thing I modified in the PoC.
But I guess we might improve estimate_num_groups() to track the stats in
a similar way, I guess.
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
Let me share my opinion on those questions ...
Thanks! I could understand the patch well thanks to your comments.
On 7/12/24 12:09, Masahiro.Ikeda@nttdata.com wrote:
Is it better to make the order of output consistent? For example, even
though there are three clauses shown in the below case, the order does
not match.
* "Filter" shows that "id1" is first.
* "Ext Stats" shows that "id2" is first.-- An example
DROP TABLE IF EXISTS test;
CREATE TABLE test (id1 int2, id2 int4, id3 int8, value varchar(32));
INSERT INTO test (SELECT i%11, i%103, i%1009, 'hello' FROM
generate_series(1,1000000) s(i)); create statistics test_s1 on id1,
id2 from test; analyze;=# EXPLAIN (STATS) SELECT * FROM test WHERE id1 = 1 AND (id2 = 2 OR id2 > 10);
QUERY PLAN
----------------------------------------------------------------------
----------------- Gather (cost=1000.00..23092.77 rows=84311
width=20)
Workers Planned: 2
-> Parallel Seq Scan on test (cost=0.00..13661.67 rows=35130 width=20)
Filter: ((id1 = 1) AND ((id2 = 2) OR (id2 > 10)))-- here
Ext Stats: public.test_s1 Clauses: (((id2 = 2) OR (id2 > 10)) AND (id1 = 1))
-- here
(5 rows)
I don't think we need to make the order consistent. It probably wouldn't hurt, but I'm
not sure it's even possible for all scan types - for example in an index scan, the clauses
might be split between index conditions and filters, etc.
OK, I understand it isn't unexpected behavior.
(3)
I might be misunderstanding something, but do we need the clauses? Is
there any case where users would want to know the clauses? For
example, wouldn't the following be sufficient?Ext Stats: id1, id2 using test_s1
The stats may overlap, and some clauses may be matching multiple of them. And some
statistics do not support all clause types (e.g.
functional dependencies work only with equality conditions). Yes, you might deduce
which statistics are used for which clause, but it's not trivial - interpreting explain is
already not trivial, let's not make it harder.(If tracking the exact clauses turns out to be expensive, we might revisit this - it might
make it cheaper).
Thanks. I agree that we need to show the clauses.
(4)
The extended statistics with "dependencies" or "ndistinct" option
don't seem to be shown in EXPLAIN output. Am I missing something? (Is
this expected?)I tested the examples in the documentation. Although it might work
with "mcv" option, I can't confirm that it works because "unrecognized node type"
error occurred in my environment.
https://urldefense.com/v3/__https://www.postgresql.org/docs/current/sq
l-createstatistics.html__;!!GCTRfqYYOYGmgK_z!9H-FTXrhg7cr0U2r4PoKEeWM1
v9feP8I8zlNyhf-801n-KI8bIMAxOQgaetSTpek3ECk2_FKWEsuApVZ-ys-ka7rfjX8ANB
9zQ$(It might be wrong since I'm beginner with extended stats codes.)
IIUC, the reason is that the patch only handles
statext_mcv_clauselist_selectivity(),
and doesn't handle dependencies_clauselist_selectivity() andestimate_multivariate_ndistinct().
-- doesn't work with "dependencies" option?
=# \dX
List of extended statistics
Schema | Name | Definition | Ndistinct | Dependencies | MCV
--------+---------+--------------------+-----------+--------------+---
--------+---------+--------------------+-----------+--------------+---
--------+---------+--------------------+-----------+--------------+---
public | s1 | a, b FROM t1 | (null) | defined | (null)
(2 rows)=# EXPLAIN (STATS, ANALYZE) SELECT * FROM t1 WHERE (a = 1) AND (b = 0);
QUERY PLAN
----------------------------------------------------------------------
---------------------------------------------
Gather (cost=1000.00..11685.00 rows=100 width=8) (actualtime=0.214..50.327 rows=100 loops=1)
Workers Planned: 2
Workers Launched: 2
-> Parallel Seq Scan on t1 (cost=0.00..10675.00 rows=42 width=8) (actualtime=30.300..46.610 rows=33 loops=3)
Filter: ((a = 1) AND (b = 0))
Rows Removed by Filter: 333300 Planning Time: 0.246 ms
Execution Time: 50.361 ms
(8 rows)-- doesn't work with "ndistinct"?
=# \dX
List of extended statistics
Schema | Name | Definition |Ndistinct | Dependencies | MCV
--------+------+------------------------------------------------------------------+------
-----+--------------+--------public | s3 | date_trunc('month'::text, a), date_trunc('day'::text, a) FROM t3 |
defined | (null) | (null)
(1 row)
postgres(437635)=# EXPLAIN (STATS, ANALYZE) SELECT * FROM t3
WHERE date_trunc('month', a) = '2020-01-01'::timestamp;
QUERY PLAN
----------------------------------------------------------------------
------------------------------------
Seq Scan on t3 (cost=0.00..10210.01 rows=45710 width=8) (actualtime=0.027..143.199 rows=44640 loops=1)
Filter: (date_trunc('month'::text, a) = '2020-01-01 00:00:00'::timestamp
without time zone)
Rows Removed by Filter: 480961
Planning Time: 0.088 ms
Execution Time: 144.590 ms
(5 rows)-- doesn't work with "mvc". It might work, but the error happens in my
environments =# \dX
List of extended statistics
Schema | Name | Definition | Ndistinct | Dependencies | MCV
--------+------+--------------+-----------+--------------+---------
public | s2 | a, b FROM t2 | (null) | (null) | defined
(1 row)-- I encountered the error with the query.
=# EXPLAIN (STATS, ANALYZE) SELECT * FROM t2 WHERE (a = 1) AND (b =
1);
ERROR: unrecognized node type: 268Yes, you're right we don't show some stats. For dependencies there's the problem that
we don't apply them individually, so it's not really possible to map clauses to individual
stats. I wonder if we might have a special "entry" to show clauses estimated by the
functional dependencies combined from all stats (instead of a particular statistics).
OK, I understand it's intended behavior for "dependencies" and we need to consider how to
show them in EXPLAIN output in future.
For ndistinct, I think we don't show this because it doesn't go through
clauselist_selectivity, which is the only thing I modified in the PoC.
But I guess we might improve estimate_num_groups() to track the stats in a similar way,
I guess.
Thanks. IIUC, the reason is that it doesn't go through statext_clauselist_selectivity() because
the number of clauses is one though it goes through clauselist_selectivity().
ERROR: unrecognized node type: 268
Regarding the above error, do "applied_stats" need have the list of "StatisticExtInfo"
because it's enough to have the list of Oid(stat->statOid) for EXPLAIN output in the current patch?
change_to_applied_stats_has_list_of_oids.diff is the change I assumed. Do you have any plan to
show extra information for example "kind" of "StatisticExtInfo"?
The above is just one idea came up with while I read the following comments of header
of pathnodes.h, and to support copy "StatisticExtInfo" will leads many other nodes to support copy.
* We don't support copying RelOptInfo, IndexOptInfo, or Path nodes.
* There are some subsidiary structs that are useful to copy, though.
By the way, I found curios result while I tested with the above patch. It shows same "Ext Stats" twice.
I think it's expected behavior because the stat is used when estimate the cost of "Partial HashAggregate" and "Group".
I've shared the result because I could not understand soon when I saw it first time. I think it's better to let users understand
when the stats are used, but I don't have any idea now.
-- I tested with the example of CREATE STATISTICS documentation.
psql=# EXPLAIN (STATS, ANALYZE) SELECT date_trunc('month', a), date_trunc('day', a) FROM t3 GROUP BY 1, 2;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------------
Group (cost=9530.56..9576.18 rows=365 width=16) (actual time=286.908..287.909 rows=366 loops=1)
Group Key: (date_trunc('month'::text, a)), (date_trunc('day'::text, a))
-> Gather Merge (cost=9530.56..9572.53 rows=365 width=16) (actual time=286.904..287.822 rows=498 loops=1)
Workers Planned: 1
Workers Launched: 1
-> Sort (cost=8530.55..8531.46 rows=365 width=16) (actual time=282.905..282.919 rows=249 loops=2)
Sort Key: (date_trunc('month'::text, a)), (date_trunc('day'::text, a))
Sort Method: quicksort Memory: 32kB
Worker 0: Sort Method: quicksort Memory: 32kB
-> Partial HashAggregate (cost=8509.54..8515.02 rows=365 width=16) (actual time=282.716..282.768 rows=249 loops=2)
Group Key: date_trunc('month'::text, a), date_trunc('day'::text, a)
Batches: 1 Memory Usage: 45kB
Worker 0: Batches: 1 Memory Usage: 45kB
-> Parallel Seq Scan on t3 (cost=0.00..6963.66 rows=309177 width=16) (actual time=0.021..171.214 rows=262800 loops=2)
Ext Stats: public.s3 Clauses: date_trunc('month'::text, a), date_trunc('day'::text, a) -- here
Ext Stats: public.s3 Clauses: date_trunc('month'::text, a), date_trunc('day'::text, a) -- here
Planning Time: 114327.206 ms
Execution Time: 288.007 ms
(18 rows)
Regards,
--
Masahiro Ikeda
NTT DATA CORPORATION
Attachments:
change_to_applied_stats_has_list_of_oids.diffapplication/octet-stream; name=change_to_applied_stats_has_list_of_oids.diffDownload
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 8c80c34619..7d3dcca655 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -2644,15 +2644,15 @@ show_scan_stats(List *stats, List *clauses, List *ors,
forthree (lc1, stats, lc2, clauses, lc3, ors)
{
- StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(lc1);
+ Oid statOid = lfirst_oid(lc1);
List *applied_clauses = (List *) lfirst(lc2);
int is_or = lfirst_int(lc3);
initStringInfo(&str);
appendStringInfo(&str, "%s.%s Clauses: %s",
- get_namespace_name(get_statistics_namespace(stat->statOid)),
- get_statistics_name(stat->statOid),
+ get_namespace_name(get_statistics_namespace(statOid)),
+ get_statistics_name(statOid),
show_stat_qual(applied_clauses, is_or, planstate, ancestors, es));
ExplainPropertyText("Ext Stats", str.data, es);
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index dbf921118e..79ab608160 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -1860,7 +1860,7 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
/* add it to the list of applied stats/clauses, if this flag is true */
if (root->parse->isExplain_Stats)
{
- rel->applied_stats = lappend(rel->applied_stats, stat);
+ rel->applied_stats = lappend_oid(rel->applied_stats, stat->statOid);
rel->applied_clauses = lappend(rel->applied_clauses, stat_clauses);
rel->applied_clauses_or = lappend_int(rel->applied_clauses_or, (is_or) ? 1 : 0);
}
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 21feb92a58..42f5306ec7 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -4162,7 +4162,7 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
/* add it to the list of applied stats/clauses, if this flag is true */
if (root->parse->isExplain_Stats)
{
- rel->applied_stats = lappend(rel->applied_stats, matched_info);
+ rel->applied_stats = lappend_oid(rel->applied_stats, matched_info->statOid);
rel->applied_clauses = lappend(rel->applied_clauses, matched_exprs);
rel->applied_clauses_or = lappend_int(rel->applied_clauses_or, 2); /* 2: Use comma to deparse */
}
On 7/18/24 12:37, Masahiro.Ikeda@nttdata.com wrote:
Let me share my opinion on those questions ...
...>
For ndistinct, I think we don't show this because it doesn't go through
clauselist_selectivity, which is the only thing I modified in the PoC.
But I guess we might improve estimate_num_groups() to track the stats in a similar way,
I guess.Thanks. IIUC, the reason is that it doesn't go through statext_clauselist_selectivity() because
the number of clauses is one though it goes through clauselist_selectivity().
Ah, I see I misunderstood the original report. The query used was
EXPLAIN (STATS, ANALYZE) SELECT * FROM t3
WHERE date_trunc('month', a) = '2020-01-01'::timestamp;
And it has nothing to do with the number of clauses being one neither.
The problem is this estimate is handled by examine_variable() matching
the expression to the "expression" stats, and injecting it into the
variable, so that the clauselist_selectivity() sees these stats.
This would happen even if you build just expression statistics on each
of the date_trunc() calls, and then tried a query with two clauses:
CREATE STATISTICS s4 ON date_trunc('day', a) FROM t3;
CREATE STATISTICS s3 ON date_trunc('month', a) FROM t3;
EXPLAIN SELECT * FROM t3
WHERE date_trunc('month', a) = '2020-01-01'::timestamp
AND date_trunc('day', 'a') = '2020-01-01'::timestamp;
Not sure how to handle this - we could remember when explain_variable()
injects statistics like this, I guess. But do we know that each call to
examine_variable() is for estimation? And do we know for which clause?
ERROR: unrecognized node type: 268
Regarding the above error, do "applied_stats" need have the list of "StatisticExtInfo"
because it's enough to have the list of Oid(stat->statOid) for EXPLAIN output in the current patch?
change_to_applied_stats_has_list_of_oids.diff is the change I assumed. Do you have any plan to
show extra information for example "kind" of "StatisticExtInfo"?The above is just one idea came up with while I read the following comments of header
of pathnodes.h, and to support copy "StatisticExtInfo" will leads many other nodes to support copy.
* We don't support copying RelOptInfo, IndexOptInfo, or Path nodes.
* There are some subsidiary structs that are useful to copy, though.
I do think tracking just the OID would work, because we already know how
to copy List objects. But if we want to also track the clauses, we'd
have to keep multiple lists, right? That seems a bit inconvenient.
By the way, I found curios result while I tested with the above patch. It shows same "Ext Stats" twice.
I think it's expected behavior because the stat is used when estimate the cost of "Partial HashAggregate" and "Group".
I've shared the result because I could not understand soon when I saw it first time. I think it's better to let users understand
when the stats are used, but I don't have any idea now.-- I tested with the example of CREATE STATISTICS documentation.
psql=# EXPLAIN (STATS, ANALYZE) SELECT date_trunc('month', a), date_trunc('day', a) FROM t3 GROUP BY 1, 2;
QUERY PLAN
---------------------------------------------------------------------------------------------------------------------------------------------
Group (cost=9530.56..9576.18 rows=365 width=16) (actual time=286.908..287.909 rows=366 loops=1)
Group Key: (date_trunc('month'::text, a)), (date_trunc('day'::text, a))
-> Gather Merge (cost=9530.56..9572.53 rows=365 width=16) (actual time=286.904..287.822 rows=498 loops=1)
Workers Planned: 1
Workers Launched: 1
-> Sort (cost=8530.55..8531.46 rows=365 width=16) (actual time=282.905..282.919 rows=249 loops=2)
Sort Key: (date_trunc('month'::text, a)), (date_trunc('day'::text, a))
Sort Method: quicksort Memory: 32kB
Worker 0: Sort Method: quicksort Memory: 32kB
-> Partial HashAggregate (cost=8509.54..8515.02 rows=365 width=16) (actual time=282.716..282.768 rows=249 loops=2)
Group Key: date_trunc('month'::text, a), date_trunc('day'::text, a)
Batches: 1 Memory Usage: 45kB
Worker 0: Batches: 1 Memory Usage: 45kB
-> Parallel Seq Scan on t3 (cost=0.00..6963.66 rows=309177 width=16) (actual time=0.021..171.214 rows=262800 loops=2)
Ext Stats: public.s3 Clauses: date_trunc('month'::text, a), date_trunc('day'::text, a) -- here
Ext Stats: public.s3 Clauses: date_trunc('month'::text, a), date_trunc('day'::text, a) -- here
Planning Time: 114327.206 ms
Execution Time: 288.007 ms
(18 rows)
I haven't looked into this, but my guess would be this is somehow
related to the parallelism - there's one parallel worker, which means we
have 2 processes to report stats for (leader + worker). And you get two
copies of the "Ext Stats" line. Could be a coincidence, ofc, but maybe
there's a loop to print some worker info, and you print the statistics
info in it?
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On 7/18/24 12:37, Masahiro.Ikeda@nttdata.com wrote:
Let me share my opinion on those questions ...
...>
For ndistinct, I think we don't show this because it doesn't go
through clauselist_selectivity, which is the only thing I modified in the PoC.
But I guess we might improve estimate_num_groups() to track the stats
in a similar way, I guess.Thanks. IIUC, the reason is that it doesn't go through
statext_clauselist_selectivity() because the number of clauses is one though it goesthrough clauselist_selectivity().
Ah, I see I misunderstood the original report. The query used was
EXPLAIN (STATS, ANALYZE) SELECT * FROM t3
WHERE date_trunc('month', a) = '2020-01-01'::timestamp;And it has nothing to do with the number of clauses being one neither.
The problem is this estimate is handled by examine_variable() matching the expression
to the "expression" stats, and injecting it into the variable, so that the
clauselist_selectivity() sees these stats.This would happen even if you build just expression statistics on each of the
date_trunc() calls, and then tried a query with two clauses:CREATE STATISTICS s4 ON date_trunc('day', a) FROM t3;
CREATE STATISTICS s3 ON date_trunc('month', a) FROM t3;EXPLAIN SELECT * FROM t3
WHERE date_trunc('month', a) = '2020-01-01'::timestamp
AND date_trunc('day', 'a') = '2020-01-01'::timestamp;Not sure how to handle this - we could remember when explain_variable() injects
statistics like this, I guess. But do we know that each call to
examine_variable() is for estimation? And do we know for which clause?
I see. The issue is related to extended statistics for single expression. As a
first step, it's ok for me that we don't support it.
The below is just an idea to know clauses...
Although I'm missing something, can callers of examine_variable()
for estimation to rebuild the clauses from partial information of "OpExpr"?
Only clause_selectivity_ext() knows the information of actual full clauses.
But we don't need full information. It's enough to know the information
to show "OpExpr" for EXPLAIN.
get_oper_expr() deparse "OpExpr" using only the operator oid and arguments
in get_oper_expr().
If so, the caller to estimate, for example eqsel_internal(), scalarineqsel_wrapper()
and so on, seems to be able to know the "OpExpr" information, which are operator
oid and arguments, and used extended statistics easily to show for EXPLAIN.
# Memo: the call path of the estimation function
caller to estimate selectivity (eqsel_internal()/scalargtjoinsel_wrappter()/...)
-> get_restriction_variable()/get_join_valiables()
-> examine_variable()
ERROR: unrecognized node type: 268
Regarding the above error, do "applied_stats" need have the list of "StatisticExtInfo"
because it's enough to have the list of Oid(stat->statOid) for EXPLAIN output in thecurrent patch?
change_to_applied_stats_has_list_of_oids.diff is the change I assumed.
Do you have any plan to show extra information for example "kind" of"StatisticExtInfo"?
The above is just one idea came up with while I read the following
comments of header of pathnodes.h, and to support copy "StatisticExtInfo" will leadsmany other nodes to support copy.
* We don't support copying RelOptInfo, IndexOptInfo, or Path nodes.
* There are some subsidiary structs that are useful to copy, though.I do think tracking just the OID would work, because we already know how to copy List
objects. But if we want to also track the clauses, we'd have to keep multiple lists, right?
That seems a bit inconvenient.
Understood. In future, we might show not only the applied_clauses but also the clauses of
its extended statistics (StatisticExtInfo->exprs).
By the way, I found curios result while I tested with the above patch. It shows same
"Ext Stats" twice.
I think it's expected behavior because the stat is used when estimate the cost of
"Partial HashAggregate" and "Group".
I've shared the result because I could not understand soon when I saw
it first time. I think it's better to let users understand when the stats are used, but Idon't have any idea now.
-- I tested with the example of CREATE STATISTICS documentation.
psql=# EXPLAIN (STATS, ANALYZE) SELECT date_trunc('month', a), date_trunc('day',a) FROM t3 GROUP BY 1, 2;
QUERY PLAN
----------------------------------------------------------------------
----------------------------------------------------------------------
- Group (cost=9530.56..9576.18 rows=365 width=16) (actual
time=286.908..287.909 rows=366 loops=1)
Group Key: (date_trunc('month'::text, a)), (date_trunc('day'::text, a))
-> Gather Merge (cost=9530.56..9572.53 rows=365 width=16) (actualtime=286.904..287.822 rows=498 loops=1)
Workers Planned: 1
Workers Launched: 1
-> Sort (cost=8530.55..8531.46 rows=365 width=16) (actualtime=282.905..282.919 rows=249 loops=2)
Sort Key: (date_trunc('month'::text, a)), (date_trunc('day'::text, a))
Sort Method: quicksort Memory: 32kB
Worker 0: Sort Method: quicksort Memory: 32kB
-> Partial HashAggregate (cost=8509.54..8515.02 rows=365width=16) (actual time=282.716..282.768 rows=249 loops=2)
Group Key: date_trunc('month'::text, a), date_trunc('day'::text,
a)
Batches: 1 Memory Usage: 45kB
Worker 0: Batches: 1 Memory Usage: 45kB
-> Parallel Seq Scan on t3 (cost=0.00..6963.66 rows=309177width=16) (actual time=0.021..171.214 rows=262800 loops=2)
Ext Stats: public.s3 Clauses: date_trunc('month'::text,
a), date_trunc('day'::text, a) -- here
Ext Stats: public.s3 Clauses:
date_trunc('month'::text, a), date_trunc('day'::text, a) -- here
Planning Time: 114327.206 ms Execution Time: 288.007 ms
(18 rows)I haven't looked into this, but my guess would be this is somehow related to the
parallelism - there's one parallel worker, which means we have 2 processes to report
stats for (leader + worker). And you get two copies of the "Ext Stats" line. Could be a
coincidence, ofc, but maybe there's a loop to print some worker info, and you print the
statistics info in it?
I think yes and no. In the above case, it relates to parallelism, but it doesn't print the
information per each worker.
-- Make the number of workers is 5 and EXPLAIN without ANALYZE option.
-- But "Ext Stats" is printed only twice.
=# EXPLAIN (STATS) SELECT date_trunc('month', a), date_trunc('day', a) FROM t3 GROUP BY 1, 2;
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------
Group (cost=4449.49..4489.50 rows=365 width=16)
Group Key: (date_trunc('month'::text, a)), (date_trunc('day'::text, a))
-> Gather Merge (cost=4449.49..4478.55 rows=1825 width=16)
Workers Planned: 5
-> Sort (cost=4449.41..4450.32 rows=365 width=16)
Sort Key: (date_trunc('month'::text, a)), (date_trunc('day'::text, a))
-> Partial HashAggregate (cost=4428.40..4433.88 rows=365 width=16)
Group Key: date_trunc('month'::text, a), date_trunc('day'::text, a)
-> Parallel Seq Scan on t3 (cost=0.00..3902.80 rows=105120 width=16)
Ext Stats: public.s3 Clauses: date_trunc('month'::text, a), date_trunc('day'::text, a)
Ext Stats: public.s3 Clauses: date_trunc('month'::text, a), date_trunc('day'::text, a)
(11 rows)
When creating a group path, it creates partial grouping paths if possible, and then
creates the final grouping path. At this time, both the partial grouping path and
the final grouping path use the same RelOptInfo to repeatedly use the extended
statistics to know how many groups there will be. That's why it outputs only twice.
There may be other similar calculation for partial paths.
# The call path of the above query
create_grouping_paths
create_ordinary_grouping_paths
create_partial_grouping_paths
get_number_of_groups
estimate_num_groups
estimate_multivariate_ndistinct -- first time to estimate the number of groups for partial grouping path
get_number_of_groups
estimate_num_groups
estimate_multivariate_ndistinct -- second time to estimate the number of groups for final grouping path
Regards,
--
Masahiro Ikeda
NTT DATA CORPORATION
Hi All,
I apologize for not being able to continue development due to various
circumstances.
The attached file is the rebased patch.
I will now catch up on the discussion and try to revise the patch.
Regards,
Tatsuro Yamada
On Fri, Jul 19, 2024 at 7:17 PM <Masahiro.Ikeda@nttdata.com> wrote:
Show quoted text
On 7/18/24 12:37, Masahiro.Ikeda@nttdata.com wrote:
Let me share my opinion on those questions ...
...>
For ndistinct, I think we don't show this because it doesn't go
through clauselist_selectivity, which is the only thing I modified inthe PoC.
But I guess we might improve estimate_num_groups() to track the stats
in a similar way, I guess.Thanks. IIUC, the reason is that it doesn't go through
statext_clauselist_selectivity() because the number of clauses is onethough it goes
through clauselist_selectivity().
Ah, I see I misunderstood the original report. The query used was
EXPLAIN (STATS, ANALYZE) SELECT * FROM t3
WHERE date_trunc('month', a) = '2020-01-01'::timestamp;And it has nothing to do with the number of clauses being one neither.
The problem is this estimate is handled by examine_variable() matching
the expression
to the "expression" stats, and injecting it into the variable, so that
the
clauselist_selectivity() sees these stats.
This would happen even if you build just expression statistics on each
of the
date_trunc() calls, and then tried a query with two clauses:
CREATE STATISTICS s4 ON date_trunc('day', a) FROM t3;
CREATE STATISTICS s3 ON date_trunc('month', a) FROM t3;EXPLAIN SELECT * FROM t3
WHERE date_trunc('month', a) = '2020-01-01'::timestamp
AND date_trunc('day', 'a') = '2020-01-01'::timestamp;Not sure how to handle this - we could remember when explain_variable()
injects
statistics like this, I guess. But do we know that each call to
examine_variable() is for estimation? And do we know for which clause?I see. The issue is related to extended statistics for single expression.
As a
first step, it's ok for me that we don't support it.The below is just an idea to know clauses...
Although I'm missing something, can callers of examine_variable()
for estimation to rebuild the clauses from partial information of "OpExpr"?Only clause_selectivity_ext() knows the information of actual full clauses.
But we don't need full information. It's enough to know the information
to show "OpExpr" for EXPLAIN.get_oper_expr() deparse "OpExpr" using only the operator oid and arguments
in get_oper_expr().If so, the caller to estimate, for example eqsel_internal(),
scalarineqsel_wrapper()
and so on, seems to be able to know the "OpExpr" information, which are
operator
oid and arguments, and used extended statistics easily to show for EXPLAIN.# Memo: the call path of the estimation function
caller to estimate selectivity
(eqsel_internal()/scalargtjoinsel_wrappter()/...)
-> get_restriction_variable()/get_join_valiables()
-> examine_variable()ERROR: unrecognized node type: 268
Regarding the above error, do "applied_stats" need have the list of
"StatisticExtInfo"
because it's enough to have the list of Oid(stat->statOid) for EXPLAIN
output in the
current patch?
change_to_applied_stats_has_list_of_oids.diff is the change I assumed.
Do you have any plan to show extra information for example "kind" of"StatisticExtInfo"?
The above is just one idea came up with while I read the following
comments of header of pathnodes.h, and to support copy"StatisticExtInfo" will leads
many other nodes to support copy.
* We don't support copying RelOptInfo, IndexOptInfo, or Path nodes.
* There are some subsidiary structs that are useful to copy, though.I do think tracking just the OID would work, because we already know how
to copy List
objects. But if we want to also track the clauses, we'd have to keep
multiple lists, right?
That seems a bit inconvenient.
Understood. In future, we might show not only the applied_clauses but also
the clauses of
its extended statistics (StatisticExtInfo->exprs).By the way, I found curios result while I tested with the above patch.
It shows same
"Ext Stats" twice.
I think it's expected behavior because the stat is used when estimate
the cost of
"Partial HashAggregate" and "Group".
I've shared the result because I could not understand soon when I saw
it first time. I think it's better to let users understand when thestats are used, but I
don't have any idea now.
-- I tested with the example of CREATE STATISTICS documentation.
psql=# EXPLAIN (STATS, ANALYZE) SELECT date_trunc('month', a),date_trunc('day',
a) FROM t3 GROUP BY 1, 2;
QUERY
PLAN
----------------------------------------------------------------------
----------------------------------------------------------------------
- Group (cost=9530.56..9576.18 rows=365 width=16) (actual
time=286.908..287.909 rows=366 loops=1)
Group Key: (date_trunc('month'::text, a)), (date_trunc('day'::text,a))
-> Gather Merge (cost=9530.56..9572.53 rows=365 width=16) (actual
time=286.904..287.822 rows=498 loops=1)
Workers Planned: 1
Workers Launched: 1
-> Sort (cost=8530.55..8531.46 rows=365 width=16) (actualtime=282.905..282.919 rows=249 loops=2)
Sort Key: (date_trunc('month'::text, a)),
(date_trunc('day'::text, a))
Sort Method: quicksort Memory: 32kB
Worker 0: Sort Method: quicksort Memory: 32kB
-> Partial HashAggregate (cost=8509.54..8515.02rows=365
width=16) (actual time=282.716..282.768 rows=249 loops=2)
Group Key: date_trunc('month'::text, a),
date_trunc('day'::text,
a)
Batches: 1 Memory Usage: 45kB
Worker 0: Batches: 1 Memory Usage: 45kB
-> Parallel Seq Scan on t3 (cost=0.00..6963.66rows=309177
width=16) (actual time=0.021..171.214 rows=262800 loops=2)
Ext Stats: public.s3 Clauses:
date_trunc('month'::text,
a), date_trunc('day'::text, a) -- here
Ext Stats: public.s3 Clauses:
date_trunc('month'::text, a), date_trunc('day'::text, a) -- here
Planning Time: 114327.206 ms Execution Time: 288.007 ms
(18 rows)I haven't looked into this, but my guess would be this is somehow
related to the
parallelism - there's one parallel worker, which means we have 2
processes to report
stats for (leader + worker). And you get two copies of the "Ext Stats"
line. Could be a
coincidence, ofc, but maybe there's a loop to print some worker info,
and you print the
statistics info in it?
I think yes and no. In the above case, it relates to parallelism, but it
doesn't print the
information per each worker.-- Make the number of workers is 5 and EXPLAIN without ANALYZE option.
-- But "Ext Stats" is printed only twice.
=# EXPLAIN (STATS) SELECT date_trunc('month', a), date_trunc('day', a)
FROM t3 GROUP BY 1, 2;
QUERY PLAN-------------------------------------------------------------------------------------------------------------------
Group (cost=4449.49..4489.50 rows=365 width=16)
Group Key: (date_trunc('month'::text, a)), (date_trunc('day'::text, a))
-> Gather Merge (cost=4449.49..4478.55 rows=1825 width=16)
Workers Planned: 5
-> Sort (cost=4449.41..4450.32 rows=365 width=16)
Sort Key: (date_trunc('month'::text, a)),
(date_trunc('day'::text, a))
-> Partial HashAggregate (cost=4428.40..4433.88 rows=365
width=16)
Group Key: date_trunc('month'::text, a),
date_trunc('day'::text, a)
-> Parallel Seq Scan on t3 (cost=0.00..3902.80
rows=105120 width=16)
Ext Stats: public.s3 Clauses:
date_trunc('month'::text, a), date_trunc('day'::text, a)
Ext Stats: public.s3 Clauses:
date_trunc('month'::text, a), date_trunc('day'::text, a)
(11 rows)When creating a group path, it creates partial grouping paths if possible,
and then
creates the final grouping path. At this time, both the partial grouping
path and
the final grouping path use the same RelOptInfo to repeatedly use the
extended
statistics to know how many groups there will be. That's why it outputs
only twice.
There may be other similar calculation for partial paths.# The call path of the above query
create_grouping_paths
create_ordinary_grouping_paths
create_partial_grouping_paths
get_number_of_groups
estimate_num_groups
estimate_multivariate_ndistinct -- first time to estimate the
number of groups for partial grouping path
get_number_of_groups
estimate_num_groups
estimate_multivariate_ndistinct -- second time to estimate the
number of groups for final grouping pathRegards,
--
Masahiro Ikeda
NTT DATA CORPORATION
Attachments:
0001-Add-a-new-option-STATS-to-Explain-command_r2.patchapplication/octet-stream; name=0001-Add-a-new-option-STATS-to-Explain-command_r2.patchDownload
From dc160dafc2f1a91d7b306f2d1e582a16c63f8c58 Mon Sep 17 00:00:00 2001
From: Tatsuro Yamada <yamatattsu@gmail.com>
Date: Fri, 1 Nov 2024 13:46:35 +0900
Subject: [PATCH] Add a new option STATS to Explain command
It shows applied extended statistics in Explain command output.
This patch fixes the following points:
- Rebased on 49d6c7d8
- Will fix document such as:
Todo
- sql-createstatistics.html
Add Explain outputs using STATS option
- planner-stats.html
Should I introduce Explain with the STATS option on this page?
---
doc/src/sgml/ref/explain.sgml | 13 +++
src/backend/commands/explain.c | 121 ++++++++++++++++++++++
src/backend/nodes/makefuncs.c | 11 ++
src/backend/optimizer/plan/createplan.c | 15 +++
src/backend/optimizer/util/relnode.c | 12 +++
src/backend/optimizer/util/restrictinfo.c | 35 +++++++
src/backend/statistics/extended_stats.c | 8 ++
src/backend/utils/adt/selfuncs.c | 15 +++
src/backend/utils/cache/lsyscache.c | 49 +++++++++
src/include/commands/explain.h | 1 +
src/include/nodes/makefuncs.h | 2 +
src/include/nodes/parsenodes.h | 3 +
src/include/nodes/pathnodes.h | 5 +
src/include/nodes/plannodes.h | 5 +
src/include/optimizer/restrictinfo.h | 2 +
src/include/utils/lsyscache.h | 3 +
16 files changed, 300 insertions(+)
diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml
index db9d3a8549..b573152b6e 100644
--- a/doc/src/sgml/ref/explain.sgml
+++ b/doc/src/sgml/ref/explain.sgml
@@ -43,6 +43,7 @@ EXPLAIN [ ( <replaceable class="parameter">option</replaceable> [, ...] ) ] <rep
BUFFERS [ <replaceable class="parameter">boolean</replaceable> ]
SERIALIZE [ { NONE | TEXT | BINARY } ]
WAL [ <replaceable class="parameter">boolean</replaceable> ]
+ STATS [ <replaceable class="parameter">boolean</replaceable> ]
TIMING [ <replaceable class="parameter">boolean</replaceable> ]
SUMMARY [ <replaceable class="parameter">boolean</replaceable> ]
MEMORY [ <replaceable class="parameter">boolean</replaceable> ]
@@ -248,6 +249,18 @@ ROLLBACK;
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><literal>STATS</literal></term>
+ <listitem>
+ <para>
+ Include information on applied <literal>Extended statistics</literal>.
+ Specifically, include the names of extended statistics and clauses.
+ See <xref linkend="planner-stats-extended"/> for details about extended
+ statistics. This parameter defaults to <literal>FALSE</literal>.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry>
<term><literal>TIMING</literal></term>
<listitem>
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 7c0fd63b2f..40c8462d68 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -94,6 +94,9 @@ static void show_qual(List *qual, const char *qlabel,
static void show_scan_qual(List *qual, const char *qlabel,
PlanState *planstate, List *ancestors,
ExplainState *es);
+static void show_scan_stats(List *stats, List *clauses, List *ors,
+ PlanState *planstate, List *ancestors,
+ ExplainState *es);
static void show_upper_qual(List *qual, const char *qlabel,
PlanState *planstate, List *ancestors,
ExplainState *es);
@@ -219,6 +222,8 @@ ExplainQuery(ParseState *pstate, ExplainStmt *stmt,
es->settings = defGetBoolean(opt);
else if (strcmp(opt->defname, "generic_plan") == 0)
es->generic = defGetBoolean(opt);
+ else if (strcmp(opt->defname, "stats") == 0)
+ es->stats = defGetBoolean(opt);
else if (strcmp(opt->defname, "timing") == 0)
{
timing_set = true;
@@ -490,6 +495,11 @@ standard_ExplainOneQuery(Query *query, int cursorOptions,
if (es->buffers)
bufusage_start = pgBufferUsage;
+
+ /* if this flag is true, applied ext stats are stored */
+ if (es->stats)
+ query->isExplain_Stats = true;
+
INSTR_TIME_SET_CURRENT(planstart);
/* plan the query */
@@ -2107,6 +2117,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->stats)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_IndexOnlyScan:
show_scan_qual(((IndexOnlyScan *) plan)->indexqual,
@@ -2123,10 +2137,18 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (es->analyze)
ExplainPropertyFloat("Heap Fetches", NULL,
planstate->instrument->ntuples2, 0, es);
+ if (es->stats)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_BitmapIndexScan:
show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig,
"Index Cond", planstate, ancestors, es);
+ if (es->stats)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_BitmapHeapScan:
show_scan_qual(((BitmapHeapScan *) plan)->bitmapqualorig,
@@ -2157,6 +2179,11 @@ ExplainNode(PlanState *planstate, List *ancestors,
planstate, es);
if (IsA(plan, CteScan))
show_ctescan_info(castNode(CteScanState, planstate), es);
+
+ if (es->stats)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_Gather:
{
@@ -2224,6 +2251,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->stats)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_TableFuncScan:
if (es->verbose)
@@ -2338,6 +2369,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->stats)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_WindowAgg:
show_upper_qual(plan->qual, "Filter", planstate, ancestors, es);
@@ -2354,6 +2389,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->stats)
+ show_scan_stats(plan->applied_stats, plan->applied_clauses,
+ plan->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_Sort:
show_sort_keys(castNode(SortState, planstate), ancestors, es);
@@ -2680,6 +2719,88 @@ show_scan_qual(List *qual, const char *qlabel,
show_qual(qual, qlabel, planstate, ancestors, useprefix, es);
}
+/*
+ * Show a generic expression
+ */
+static char *
+deparse_stat_expression(Node *node,
+ PlanState *planstate, List *ancestors,
+ ExplainState *es)
+{
+ List *context;
+
+ /* Set up deparsing context */
+ context = set_deparse_context_plan(es->deparse_cxt,
+ planstate->plan,
+ ancestors);
+
+ /* Deparse the expression */
+ return deparse_expression(node, context, false, false);
+}
+
+/*
+ * Show a qualifier expression (which is a List with implicit AND semantics)
+ */
+static char *
+show_stat_qual(List *qual, int is_or,
+ PlanState *planstate, List *ancestors,
+ ExplainState *es)
+{
+ Node *node;
+
+ /* No work if empty qual */
+ if (qual == NIL)
+ return NULL;
+
+ /* Convert AND list to explicit AND */
+ switch (is_or)
+ {
+ case 0:
+ node = (Node *) make_ands_explicit(qual);
+ break;
+ case 1:
+ node = (Node *) make_ors_explicit(qual);
+ break;
+ case 2:
+ /* Extended stats for GROUP BY clause should be comma separeted string */
+ node = (Node *) qual;
+ break;
+ default:
+ elog(ERROR, "unexpected value: %d", is_or);
+ break;
+ }
+
+ /* And show it */
+ return deparse_stat_expression(node, planstate, ancestors, es);
+}
+
+/*
+ * Show applied statistics for scan/agg/group plan node
+ */
+static void
+show_scan_stats(List *stats, List *clauses, List *ors,
+ PlanState *planstate, List *ancestors, ExplainState *es)
+{
+ ListCell *lc1, *lc2, *lc3;
+ StringInfoData str;
+
+ forthree (lc1, stats, lc2, clauses, lc3, ors)
+ {
+ StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(lc1);
+ List *applied_clauses = (List *) lfirst(lc2);
+ int is_or = lfirst_int(lc3);
+
+ initStringInfo(&str);
+
+ appendStringInfo(&str, "%s.%s Clauses: %s",
+ get_namespace_name(get_statistics_namespace(stat->statOid)),
+ get_statistics_name(stat->statOid),
+ show_stat_qual(applied_clauses, is_or, planstate, ancestors, es));
+
+ ExplainPropertyText("Ext Stats", str.data, es);
+ }
+}
+
/*
* Show a qualifier expression for an upper-level plan node
*/
diff --git a/src/backend/nodes/makefuncs.c b/src/backend/nodes/makefuncs.c
index 9cac3c1c27..bcd81926e8 100644
--- a/src/backend/nodes/makefuncs.c
+++ b/src/backend/nodes/makefuncs.c
@@ -733,6 +733,17 @@ make_ands_explicit(List *andclauses)
return make_andclause(andclauses);
}
+Expr *
+make_ors_explicit(List *orclauses)
+{
+ if (orclauses == NIL)
+ return (Expr *) makeBoolConst(true, false);
+ else if (list_length(orclauses) == 1)
+ return (Expr *) linitial(orclauses);
+ else
+ return make_orclause(orclauses);
+}
+
List *
make_ands_implicit(Expr *clause)
{
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index f2ed0d81f6..a21a401c1b 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -5452,6 +5452,8 @@ order_qual_clauses(PlannerInfo *root, List *clauses)
static void
copy_generic_path_info(Plan *dest, Path *src)
{
+ ListCell *lc;
+
dest->disabled_nodes = src->disabled_nodes;
dest->startup_cost = src->startup_cost;
dest->total_cost = src->total_cost;
@@ -5459,6 +5461,19 @@ copy_generic_path_info(Plan *dest, Path *src)
dest->plan_width = src->pathtarget->width;
dest->parallel_aware = src->parallel_aware;
dest->parallel_safe = src->parallel_safe;
+
+ dest->applied_stats = src->parent->applied_stats;
+ dest->applied_clauses_or = src->parent->applied_clauses_or;
+
+ dest->applied_clauses = NIL;
+ foreach (lc, src->parent->applied_clauses)
+ {
+ List *clauses = (List *) lfirst(lc);
+
+ dest->applied_clauses
+ = lappend(dest->applied_clauses,
+ maybe_extract_actual_clauses(clauses, false));
+ }
}
/*
diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c
index d7266e4cdb..53ba64e731 100644
--- a/src/backend/optimizer/util/relnode.c
+++ b/src/backend/optimizer/util/relnode.c
@@ -287,6 +287,10 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent)
rel->partexprs = NULL;
rel->nullable_partexprs = NULL;
+ rel->applied_stats = NIL;
+ rel->applied_clauses = NIL;
+ rel->applied_clauses_or = NIL;
+
/*
* Pass assorted information down the inheritance hierarchy.
*/
@@ -769,6 +773,10 @@ build_join_rel(PlannerInfo *root,
joinrel->partexprs = NULL;
joinrel->nullable_partexprs = NULL;
+ joinrel->applied_stats = NIL;
+ joinrel->applied_clauses = NIL;
+ joinrel->applied_clauses_or = NIL;
+
/* Compute information relevant to the foreign relations. */
set_foreign_rel_properties(joinrel, outer_rel, inner_rel);
@@ -953,6 +961,10 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel,
joinrel->partexprs = NULL;
joinrel->nullable_partexprs = NULL;
+ joinrel->applied_stats = NIL;
+ joinrel->applied_clauses = NIL;
+ joinrel->applied_clauses_or = NIL;
+
/* Compute information relevant to foreign relations. */
set_foreign_rel_properties(joinrel, outer_rel, inner_rel);
diff --git a/src/backend/optimizer/util/restrictinfo.c b/src/backend/optimizer/util/restrictinfo.c
index 0b406e9334..23136a330b 100644
--- a/src/backend/optimizer/util/restrictinfo.c
+++ b/src/backend/optimizer/util/restrictinfo.c
@@ -508,6 +508,41 @@ extract_actual_clauses(List *restrictinfo_list,
return result;
}
+/*
+ * maybe_extract_actual_clauses
+ *
+ * Just like extract_actual_clauses, but does not require the clauses to
+ * already be RestrictInfo.
+ *
+ * XXX Does not handle RestrictInfos nested in OR clauses.
+ */
+List *
+maybe_extract_actual_clauses(List *restrictinfo_list,
+ bool pseudoconstant)
+{
+ List *result = NIL;
+ ListCell *l;
+
+ foreach(l, restrictinfo_list)
+ {
+ RestrictInfo *rinfo;
+ Node *node = (Node *) lfirst(l);
+
+ if (!IsA(node, RestrictInfo))
+ {
+ result = lappend(result, node);
+ continue;
+ }
+
+ rinfo = (RestrictInfo *) node;
+
+ if (rinfo->pseudoconstant == pseudoconstant)
+ result = lappend(result, rinfo->clause);
+ }
+
+ return result;
+}
+
/*
* extract_actual_join_clauses
*
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index 99fdf208db..dbf921118e 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -1857,6 +1857,14 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
list_exprs[listidx] = NULL;
}
+ /* add it to the list of applied stats/clauses, if this flag is true */
+ if (root->parse->isExplain_Stats)
+ {
+ rel->applied_stats = lappend(rel->applied_stats, stat);
+ rel->applied_clauses = lappend(rel->applied_clauses, stat_clauses);
+ rel->applied_clauses_or = lappend_int(rel->applied_clauses_or, (is_or) ? 1 : 0);
+ }
+
if (is_or)
{
bool *or_matches = NULL;
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 08fa6774d9..0a71685629 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -4073,6 +4073,7 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
ListCell *lc2;
Bitmapset *matched = NULL;
AttrNumber attnum_offset;
+ List *matched_exprs = NIL;
/*
* How much we need to offset the attnums? If there are no
@@ -4120,6 +4121,9 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
matched = bms_add_member(matched, attnum);
+ /* track expressions matched by this statistics */
+ matched_exprs = lappend(matched_exprs, varinfo->var);
+
found = true;
}
@@ -4148,6 +4152,9 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
matched = bms_add_member(matched, attnum);
+ /* track expressions matched by this statistics */
+ matched_exprs = lappend(matched_exprs, expr);
+
/* there should be just one matching expression */
break;
}
@@ -4156,6 +4163,14 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
}
}
+ /* add it to the list of applied stats/clauses, if this flag is true */
+ if (root->parse->isExplain_Stats)
+ {
+ rel->applied_stats = lappend(rel->applied_stats, matched_info);
+ rel->applied_clauses = lappend(rel->applied_clauses, matched_exprs);
+ rel->applied_clauses_or = lappend_int(rel->applied_clauses_or, 2); /* 2: Use comma to deparse */
+ }
+
/* Find the specific item that exactly matches the combination */
for (i = 0; i < stats->nitems; i++)
{
diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c
index a85dc0d891..7368e82b62 100644
--- a/src/backend/utils/cache/lsyscache.c
+++ b/src/backend/utils/cache/lsyscache.c
@@ -35,6 +35,7 @@
#include "catalog/pg_publication.h"
#include "catalog/pg_range.h"
#include "catalog/pg_statistic.h"
+#include "catalog/pg_statistic_ext.h"
#include "catalog/pg_subscription.h"
#include "catalog/pg_transform.h"
#include "catalog/pg_type.h"
@@ -3714,3 +3715,51 @@ get_subscription_name(Oid subid, bool missing_ok)
return subname;
}
+
+/*
+ * get_statistics_name
+ * Returns the name of a given extended statistics
+ *
+ * Returns a palloc'd copy of the string, or NULL if no such namespace.
+ */
+char *
+get_statistics_name(Oid stxid)
+{
+ HeapTuple tp;
+
+ tp = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(stxid));
+ if (HeapTupleIsValid(tp))
+ {
+ Form_pg_statistic_ext stxtup = (Form_pg_statistic_ext) GETSTRUCT(tp);
+ char *result;
+
+ result = pstrdup(NameStr(stxtup->stxname));
+ ReleaseSysCache(tp);
+ return result;
+ }
+ else
+ return NULL;
+}
+
+/*
+ * get_statistics_namespace
+ * Returns the namespace OID of a given extended statistics
+ */
+Oid
+get_statistics_namespace(Oid stxid)
+{
+ HeapTuple tp;
+
+ tp = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(stxid));
+ if (HeapTupleIsValid(tp))
+ {
+ Form_pg_statistic_ext stxtup = (Form_pg_statistic_ext) GETSTRUCT(tp);
+ Oid result;
+
+ result = stxtup->stxnamespace;
+ ReleaseSysCache(tp);
+ return result;
+ }
+ else
+ return InvalidOid;
+}
diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h
index aa5872bc15..f06dcc0c3f 100644
--- a/src/include/commands/explain.h
+++ b/src/include/commands/explain.h
@@ -55,6 +55,7 @@ typedef struct ExplainState
bool memory; /* print planner's memory usage information */
bool settings; /* print modified settings */
bool generic; /* generate a generic plan */
+ bool stats; /* print applied extended stats */
ExplainSerializeOption serialize; /* serialize the query's output? */
ExplainFormat format; /* output format */
/* state for output formatting --- not reset for each new plan tree */
diff --git a/src/include/nodes/makefuncs.h b/src/include/nodes/makefuncs.h
index 0765e5c57b..05ef9f5f17 100644
--- a/src/include/nodes/makefuncs.h
+++ b/src/include/nodes/makefuncs.h
@@ -94,6 +94,8 @@ extern Node *make_and_qual(Node *qual1, Node *qual2);
extern Expr *make_ands_explicit(List *andclauses);
extern List *make_ands_implicit(Expr *clause);
+extern Expr *make_ors_explicit(List *orclauses);
+
extern IndexInfo *makeIndexInfo(int numattrs, int numkeyattrs, Oid amoid,
List *expressions, List *predicates,
bool unique, bool nulls_not_distinct,
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index 0d96db5638..5364c0495d 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -240,6 +240,9 @@ typedef struct Query
ParseLoc stmt_location;
/* length in bytes; 0 means "rest of string" */
ParseLoc stmt_len pg_node_attr(query_jumble_ignore);
+
+ /* if true, query is explain with stats option */
+ bool isExplain_Stats;
} Query;
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index add0f9e45f..63b477b985 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -1049,6 +1049,11 @@ typedef struct RelOptInfo
List **partexprs pg_node_attr(read_write_ignore);
/* Nullable partition key expressions */
List **nullable_partexprs pg_node_attr(read_write_ignore);
+
+ /* info about applied extended statistics */
+ List *applied_stats; /* list of StatisticExtInfo */
+ List *applied_clauses; /* list of lists of clauses */
+ List *applied_clauses_or; /* are the clauses AND, OR, or Comma */
} RelOptInfo;
/*
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 52f29bcdb6..03fa039f36 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -170,6 +170,11 @@ typedef struct Plan
*/
Bitmapset *extParam;
Bitmapset *allParam;
+
+ /* info about applied statistics */
+ List *applied_stats;
+ List *applied_clauses;
+ List *applied_clauses_or;
} Plan;
/* ----------------
diff --git a/src/include/optimizer/restrictinfo.h b/src/include/optimizer/restrictinfo.h
index fe03a8ecd3..41bbb3231c 100644
--- a/src/include/optimizer/restrictinfo.h
+++ b/src/include/optimizer/restrictinfo.h
@@ -39,6 +39,8 @@ extern bool restriction_is_securely_promotable(RestrictInfo *restrictinfo,
extern List *get_actual_clauses(List *restrictinfo_list);
extern List *extract_actual_clauses(List *restrictinfo_list,
bool pseudoconstant);
+extern List *maybe_extract_actual_clauses(List *restrictinfo_list,
+ bool pseudoconstant);
extern void extract_actual_join_clauses(List *restrictinfo_list,
Relids joinrelids,
List **joinquals,
diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h
index 20446f6f83..3fab6d5eea 100644
--- a/src/include/utils/lsyscache.h
+++ b/src/include/utils/lsyscache.h
@@ -206,6 +206,9 @@ extern char *get_publication_name(Oid pubid, bool missing_ok);
extern Oid get_subscription_oid(const char *subname, bool missing_ok);
extern char *get_subscription_name(Oid subid, bool missing_ok);
+extern char *get_statistics_name(Oid stxid);
+extern Oid get_statistics_namespace(Oid stxid);
+
#define type_is_array(typid) (get_element_type(typid) != InvalidOid)
/* type_is_array_domain accepts both plain arrays and domains over arrays */
#define type_is_array_domain(typid) (get_base_element_type(typid) != InvalidOid)
--
2.43.5
Hi everyone!
Thank you for your work.
1) While exploring extended statistics, I encountered a bug that occurs
when using EXPLAIN (STATS) with queries containing OR conditions:
CREATE TABLE t (a int, b int, c int, d int);
INSERT INTO t SELECT x/10+1, x, x + 10, x * 2 FROM
generate_series(1,10000) g(x);
CREATE STATISTICS ON a, b FROM t;
CREATE STATISTICS ON c, d FROM t;
ANALYZE;
The following query works as expected:
EXPLAIN (STATS) SELECT * FROM t WHERE a > 0 AND b > 0 AND c > 0 AND d > 0;
QUERY PLAN
----------------------------------------------------------------
Seq Scan on t (cost=0.00..255.00 rows=10000 width=16)
Filter: ((a > 0) AND (b > 0) AND (c > 0) AND (d > 0))
Ext Stats: public.t_a_b_stat Clauses: ((a > 0) AND (b > 0))
Ext Stats: public.t_c_d_stat Clauses: ((c > 0) AND (d > 0))
(4 rows)
However, when using OR conditions, the following query results in an error:
EXPLAIN (ANALYZE, STATS) SELECT * FROM t WHERE a > 0 AND b > 0 OR c > 0
AND d > 0;
ERROR: unrecognized node type: 314
2) It would be great if the STATS flag appeared as an option when
pressing Tab during query input in the psql command-line interface.
Best regards,
Ilia Evdokimov,
Tantor Labs LLC.
On 11/18/24 13:52, Ilia Evdokimov wrote:
Hi everyone!
Thank you for your work.
1) While exploring extended statistics, I encountered a bug that occurs
when using EXPLAIN (STATS) with queries containing OR conditions:CREATE TABLE t (a int, b int, c int, d int);
INSERT INTO t SELECT x/10+1, x, x + 10, x * 2 FROM
generate_series(1,10000) g(x);
CREATE STATISTICS ON a, b FROM t;
CREATE STATISTICS ON c, d FROM t;
ANALYZE;The following query works as expected:
EXPLAIN (STATS) SELECT * FROM t WHERE a > 0 AND b > 0 AND c > 0 AND d > 0;
QUERY PLAN
----------------------------------------------------------------
Seq Scan on t (cost=0.00..255.00 rows=10000 width=16)
Filter: ((a > 0) AND (b > 0) AND (c > 0) AND (d > 0))
Ext Stats: public.t_a_b_stat Clauses: ((a > 0) AND (b > 0))
Ext Stats: public.t_c_d_stat Clauses: ((c > 0) AND (d > 0))
(4 rows)However, when using OR conditions, the following query results in an error:
EXPLAIN (ANALYZE, STATS) SELECT * FROM t WHERE a > 0 AND b > 0 OR c > 0
AND d > 0;
ERROR: unrecognized node type: 314
I believe this is the issue I mentioned when I first posted the original
version of this patch:
2) The deparsing is modeled (i.e. copied) from how we deal with index
quals, but it's having issues with nested OR clauses, because there
are nested RestrictInfo nodes and the deparsing does not expect that.
In other words, the AND-clauses happens to be parsed like this:
BoolExpr (boolop=and)
RestrictInfo (clause=OpExpr, ...)
RestrictInfo (clause=OpExpr, ...)
And the deparse_expression() machinery does not expect RI nodes, which
is where the error message comes from.
An obvious solution would be to extend get_rule_expr() like this:
case T_RestrictInfo:
get_rule_expr((Node *) ((RestrictInfo *) node)->clause,
context, showimplicit);
break;
But I think this would be wrong - AFAIK ruleutils.c is meant to handle
these nodes. As the ruleutils.c header says:
Functions to convert stored expressions/querytrees back to
source text
And RestrictInfo surely is not meant to be stored anywhere - it's a
runtime only node, caching some data.
So I think the correct solution is to not pass any expressions with
RestrictInfo to deparse_expression(). Either by stripping the nodes, or
by not adding them at all.
The patch tries to do the stripping by maybe_extract_actual_clauses(),
but that only looks at the top node, and that is not sufficient here.
Maybe it would be possible to walk the whole tree, and remove all the
RestrictInfos nodes - including intermediate ones, not just the top. But
I'm not quite sure it wouldn't cause issues elsewhere (assuming it
modifies the existing nodes). It still feels a bit like fixing a problem
we shouldn't really have ...
The only alternative approach I can think of is to make sure we never
add any RestrictInfo nodes in these lists. But we build them for
selectivity estimation, and the RestrictInfo is meant for that.
So I'm a bit unsure what to do about this :-(
In any case, I think this shows the patch needs more tests.
2) It would be great if the STATS flag appeared as an option when
pressing Tab during query input in the psql command-line interface.
True. Tab autocomplete would be nice.
regards
--
Tomas Vondra
On 11/18/24 22:15, Tomas Vondra wrote:
...
So I think the correct solution is to not pass any expressions with
RestrictInfo to deparse_expression(). Either by stripping the nodes, or
by not adding them at all.The patch tries to do the stripping by maybe_extract_actual_clauses(),
but that only looks at the top node, and that is not sufficient here.
Maybe it would be possible to walk the whole tree, and remove all the
RestrictInfos nodes - including intermediate ones, not just the top. But
I'm not quite sure it wouldn't cause issues elsewhere (assuming it
modifies the existing nodes). It still feels a bit like fixing a problem
we shouldn't really have ...
To make this idea a bit more concrete, here's a patch removing all
RestrictInfo nodes in show_stat_qual(). But still, it feels wrong.
regards
--
Tomas Vondra
Attachments:
rinfo-fix.patchtext/x-patch; charset=UTF-8; name=rinfo-fix.patchDownload
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 40c8462d680..bf2faa6d9f2 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -2738,6 +2738,25 @@ deparse_stat_expression(Node *node,
return deparse_expression(node, context, false, false);
}
+static Node *
+strip_restrict_info_nodes(Node *node, void *context)
+{
+ if (node == NULL)
+ return NULL;
+
+ switch (nodeTag(node))
+ {
+ case T_RestrictInfo:
+ return (Node *) ((RestrictInfo *) node)->clause;
+
+ default:
+ break;
+ }
+
+ return expression_tree_mutator(node, strip_restrict_info_nodes,
+ (void *) context);
+}
+
/*
* Show a qualifier expression (which is a List with implicit AND semantics)
*/
@@ -2752,6 +2771,10 @@ show_stat_qual(List *qual, int is_or,
if (qual == NIL)
return NULL;
+ /* remove all RestrictInfo nodes, which are not expected by deparsing */
+ qual = (List *) expression_tree_mutator((Node *) qual,
+ strip_restrict_info_nodes, NULL);
+
/* Convert AND list to explicit AND */
switch (is_or)
{
On 19.11.2024 00:38, Tomas Vondra wrote:
On 11/18/24 22:15, Tomas Vondra wrote:
...
So I think the correct solution is to not pass any expressions with
RestrictInfo to deparse_expression(). Either by stripping the nodes, or
by not adding them at all.The patch tries to do the stripping by maybe_extract_actual_clauses(),
but that only looks at the top node, and that is not sufficient here.
Maybe it would be possible to walk the whole tree, and remove all the
RestrictInfos nodes - including intermediate ones, not just the top. But
I'm not quite sure it wouldn't cause issues elsewhere (assuming it
modifies the existing nodes). It still feels a bit like fixing a problem
we shouldn't really have ...To make this idea a bit more concrete, here's a patch removing all
RestrictInfo nodes in show_stat_qual(). But still, it feels wrong.regards
Yes, removing all 'RestrictInfos' during deparsing using
'expression_tree_mutator()' is not optimal. However, I don't see an
alternative. Perhaps it could be done this earlier in extended_stats.c
to avoid the need for cleanup later ...
--
Best regards,
Ilia Evdokimov,
Tantor Labs LLC.
Ilia Evdokimov <ilya.evdokimov@tantorlabs.com> writes:
On 19.11.2024 00:38, Tomas Vondra wrote:
On 11/18/24 22:15, Tomas Vondra wrote:
So I think the correct solution is to not pass any expressions with
RestrictInfo to deparse_expression(). Either by stripping the nodes, or
by not adding them at all.The patch tries to do the stripping by maybe_extract_actual_clauses(),
but that only looks at the top node, and that is not sufficient here.
Pardon me for being late to the party, but I don't understand why this
is difficult. There should never be more than one layer of
RestrictInfos, at the top level of an implicitly-ANDed list of quals.
The only exception to this is that RestrictInfos representing OR
clauses have an additional field "orclause" that attaches
RestrictInfos to the elements of the OR list --- but the main "clause"
field doesn't look like that, and you can just ignore "orclause" if it
doesn't suit you. So ISTM this doesn't need to be any harder than
what extract_actual_clauses() does (and has done for decades).
regards, tom lane
On 11/1/24 12:22, Tatsuro Yamada wrote:
Hi All,
I apologize for not being able to continue development due to various
circumstances.
The attached file is the rebased patch.
I will now catch up on the discussion and try to revise the patch.
I wonder why it’s so important to know exactly where and who has used
extended statistics.
I often use SQL Server to compare execution plans generated by
PostgreSQL, and I appreciate how they display the usage of extended
statistics. They clearly identify which statistics were utilized during
query planning and include them in the summary section of query plan.
I find this method much easier to implement, as it allows us to see any
usage points - remember that `estimate_num_groups` may be applied in
multiple places and may not always correspond to a node clause.
It’s worth noting that some clauses may be transformed during the
planning process, and the scan filter may not align with the estimated
clause. It’s possible that certain clauses might not appear in the final
estimated plan based on the extended statistics.
If necessary, we could add an `extstat_ID` to the summary to reference
it in the plan nodes.
--
regards, Andrei Lepikhov
Hi All,
Thank you everyone for your cooperation with comments on the patch and
solution ideas.
I am sorting through your review comments now. And after rebasing the
patch, I plan to
send a patch that addresses the comments as much as possible to -hackers by
Feb 21 at the latest.
Therefore, the status of the patch for this feature from last month's
commit fest will be
"moving to the next commit fest".
P.S.
Thanks for letting me know, Ilia.
Thanks,
Tatsuro Yamada
On 05.02.2025 09:28, Tatsuro Yamada wrote:
Hi All,
Thank you everyone for your cooperation with comments on the patch and
solution ideas.
I am sorting through your review comments now. And after rebasing the
patch, I plan to
send a patch that addresses the comments as much as possible to
-hackers by Feb 21 at the latest.Therefore, the status of the patch for this feature from last month's
commit fest will be
"moving to the next commit fest".P.S.
Thanks for letting me know, Ilia.Thanks,
Tatsuro Yamada
Thank for your work!
Regarding dependency statistics, I noticed that we can't apply them
individually, which makes it difficult to map specific clauses to
individual stats. However, why not display the dependency statistics
above the execution plan as a list? We could format it as a list, but I
believe it's crucial to highlight when dependency statistics are
applied, Analyzing dependencies stats often takes the longest compared
to other statistics, and if users could see the extended statistics for
dependencies upfront, they might be able to remove them if they're not
useful for the plan.
--
Best regards,
Ilia Evdokimov,
Tantor Labs LLC.
On 1/24/25 11:17, Andrei Lepikhov wrote:
On 11/1/24 12:22, Tatsuro Yamada wrote:
Hi All,
I apologize for not being able to continue development due to various
circumstances.
The attached file is the rebased patch.
I will now catch up on the discussion and try to revise the patch.I wonder why it’s so important to know exactly where and who has used
extended statistics.I often use SQL Server to compare execution plans generated by
PostgreSQL, and I appreciate how they display the usage of extended
statistics. They clearly identify which statistics were utilized during
query planning and include them in the summary section of query plan.
I admit not knowing what exactly SQL Server shows in the explain, but it
seems helpful to know which part of the plan used which statistics, no?
Imagine we only knew an index was used, but not which node used it and
for what keys. That would be a bit ... useless.
Or what info does the SQL server include in the plan, exactly? Can you
share an example?
I find this method much easier to implement, as it allows us to see any
usage points - remember that `estimate_num_groups` may be applied in
multiple places and may not always correspond to a node clause.
I may be wrong, but I don't quite see why would that be significantly
easier to implement. You still need to track the information somewhere,
and it probably needs for individual Path nodes. Because while building
the plan you don't know which paths will get selected.
Although, maybe the selectivity estimation is sufficiently independent
from the exact paths? In which case we might track it at the plan level.
Still, I don't think that really makes this much easier to implement.
The code would only move a little bit to a different place, but other
than that it would remain the same.
It’s worth noting that some clauses may be transformed during the
planning process, and the scan filter may not align with the estimated
clause. It’s possible that certain clauses might not appear in the final
estimated plan based on the extended statistics.If necessary, we could add an `extstat_ID` to the summary to reference
it in the plan nodes.
Not sure, but I'd prefer not to add "indirection" the people would have
to follow in the explain plan. I'd much rather duplicate the same info
(which should be rare anyway, we usually don't have the same statistics
used in multiple places in one query).
The point about estimate_num_groups is good - I think there will be more
cases where linking the extended statistics to a clause will be hard.
But maybe let's not block the whole patch because of that?
regards
--
Tomas Vondra
Hi All,
I've organized the discussion so far and improved the patch.
The issues and their status are below.
* Issues and status (or comment):
I've numbered them for ease of management.
====
T6. Changed option to show extended statistics (from VERBOSE to STATS)
-> Done already on the previous patch
T4. Add new node (to resolve errors in cfbot and prepared statement)
-> Done on the attached patch maybe
M2. Handle VERBOSE option more correctly
-> Done on the attache patch
M4. Organize the specifications for the types of ExtStats to display
-> Done. I understood the specifications (restrictions) below and wrote
it
on explain.sgml.
- depndencied or mcv of extended stats are displayed
- ndistinct is excluded
T9. Add log_stats option to auto_explain
-> Done on the attached patches
I2. Add tab completion to psql
-> Done on the attached patches
T7. Add documents
-> Partially completed.
- Done:
- doc/src/sgml/ref/explain.sgml
- doc/src/sgml/auto-explain.sgml
- Not started yet:
- doc/src/sgml/ref/create_statistics.sgml
- doc/src/sgml/perform.sgml
- doc/src/sgml/planstats.sgml
T8. Add regression test (stats_ext.sql)
-> Work in progress.
M5. Extstats showing twice issue
-> Work in progress.
The problem is related to Partial/Final groupby or Parallel query.
I found that HashParallel is one of the causes to show it twice.
(See result.txt)
I1. Fix error when query has nested OR condition
-> Not started yet.
As a starting point, it's good to understand the difference
between maybe_extract_actual_clauses() and extract_actual_clauses(),
right?
T5. Measure overhead of the feature
-> Not started yet.
But I understood a conditions to measure overhead:
- simple query without no extended stats
- simple query with 100 extended stats
====
* Regarding to the attached patches:
This patch is WIP patch including the following fixes:
====
- 0001-Add-a-new-option-STATS-to-EXPLAIN-command-r3.patch
- Rebased on 9926f854
- Added a new struct Applied_ExtStats in plannode.h (T4)
- Hopefully this will solve the issue related to
"-DCOPY_PARSE_PLAN_TREES"
with cfbot.
- To pass the extended statistics information from path to plan,
it might be more appropriate to define a new structure in
primnode.h
rather than plannode.h. Any advice would be appreciated.
- Handled EXPLAIN(STATS, VERBOSE) option (M2)
- Before the fix, schema name was always added to extended
statistics name,
but with this patch, schema name is added to the following only
when
VERBOSE option is selected:
- Extended statistics name, table name, and column name
- Added Supported extended statistics types in document (M4)
- 0002-Add-a-new-option-auto_explain.log_stats.patch
- It allows to use auto_explain.log_stats option on auto_explain
- 0003-Add-a-new-tab-completion-for-EXPLAIN-STATS-on-psql.patch
- When you run "EXPLAIN (<tab>" or "EXPLAIN (S<tab>" on psql,
"STATS" string is displayed.
=====
Finally, the attached test.sql and result.txt are a test case (incomplete)
and
its results. When the test case is completed, it will be merged into
ext_stats.sql.
Any advice is welcome!
Regards,
Tatsuro Yamada
Attachments:
0002-Add-a-new-option-auto_explain.log_stats.patchapplication/octet-stream; name=0002-Add-a-new-option-auto_explain.log_stats.patchDownload
From a6bc82b5112cb424058b985c3028ca6b61d1c831 Mon Sep 17 00:00:00 2001
From: Tatsuro Yamada <yamatattsu@gmail.com>
Date: Mon, 10 Feb 2025 08:40:57 +0900
Subject: [PATCH 2/3] Add a new option auto_explain.log_stats to auto_explain
(T9)
---
contrib/auto_explain/auto_explain.c | 13 +++++++++++++
doc/src/sgml/auto-explain.sgml | 18 ++++++++++++++++++
2 files changed, 31 insertions(+)
diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c
index f1ad876e821..564b7b5cdbb 100644
--- a/contrib/auto_explain/auto_explain.c
+++ b/contrib/auto_explain/auto_explain.c
@@ -29,6 +29,7 @@ static bool auto_explain_log_analyze = false;
static bool auto_explain_log_verbose = false;
static bool auto_explain_log_buffers = false;
static bool auto_explain_log_wal = false;
+static bool auto_explain_log_stats = false;
static bool auto_explain_log_triggers = false;
static bool auto_explain_log_timing = true;
static bool auto_explain_log_settings = false;
@@ -170,6 +171,17 @@ _PG_init(void)
NULL,
NULL);
+ DefineCustomBoolVariable("auto_explain.log_stats",
+ "Use EXPLAIN STATS for plan logging.",
+ NULL,
+ &auto_explain_log_stats,
+ false,
+ PGC_SUSET,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
DefineCustomBoolVariable("auto_explain.log_triggers",
"Include trigger statistics in plans.",
"This has no effect unless log_analyze is also set.",
@@ -396,6 +408,7 @@ explain_ExecutorEnd(QueryDesc *queryDesc)
es->summary = es->analyze;
/* No support for MEMORY */
/* es->memory = false; */
+ es->stats = auto_explain_log_stats;
es->format = auto_explain_log_format;
es->settings = auto_explain_log_settings;
diff --git a/doc/src/sgml/auto-explain.sgml b/doc/src/sgml/auto-explain.sgml
index 0c4656ee302..a3be3f38adc 100644
--- a/doc/src/sgml/auto-explain.sgml
+++ b/doc/src/sgml/auto-explain.sgml
@@ -148,6 +148,24 @@ LOAD 'auto_explain';
</listitem>
</varlistentry>
+ <varlistentry id="auto-explain-configuration-parameters-log-stats">
+ <term>
+ <varname>auto_explain.log_stats</varname> (<type>boolean</type>)
+ <indexterm>
+ <primary><varname>auto_explain.log_stats</varname> configuration parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ <varname>auto_explain.log_stats</varname> controls whether applied
+ Extended Statistic are printed when an execution plan is logged; it's
+ equivalent to the <literal>STATS</literal> option of <command>EXPLAIN</command>.
+ This parameter is off by default.
+ Only superusers can change this setting.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry id="auto-explain-configuration-parameters-log-timing">
<term>
<varname>auto_explain.log_timing</varname> (<type>boolean</type>)
--
2.43.5
0003-Add-a-new-tab-completion-for-EXPLAIN-STATS-on-psql.patchapplication/octet-stream; name=0003-Add-a-new-tab-completion-for-EXPLAIN-STATS-on-psql.patchDownload
From 972d5fdad76872480909cccc82bb32be759d452b Mon Sep 17 00:00:00 2001
From: Tatsuro Yamada <yamatattsu@gmail.com>
Date: Mon, 10 Feb 2025 08:41:58 +0900
Subject: [PATCH 3/3] Add a new tab completion for EXPLAIN (STATS) on psql (I2)
- When you run "EXPLAIN (<tab>" or "EXPLAIN (S<tab>" on psql,
"STATS" string is displayed.
---
src/bin/psql/tab-complete.in.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/bin/psql/tab-complete.in.c b/src/bin/psql/tab-complete.in.c
index 5f6897c8486..d4b504ebf31 100644
--- a/src/bin/psql/tab-complete.in.c
+++ b/src/bin/psql/tab-complete.in.c
@@ -4247,8 +4247,8 @@ match_previous_words(int pattern_id,
if (ends_with(prev_wd, '(') || ends_with(prev_wd, ','))
COMPLETE_WITH("ANALYZE", "VERBOSE", "COSTS", "SETTINGS", "GENERIC_PLAN",
"BUFFERS", "SERIALIZE", "WAL", "TIMING", "SUMMARY",
- "MEMORY", "FORMAT");
- else if (TailMatches("ANALYZE|VERBOSE|COSTS|SETTINGS|GENERIC_PLAN|BUFFERS|WAL|TIMING|SUMMARY|MEMORY"))
+ "MEMORY", "FORMAT", "STATS");
+ else if (TailMatches("ANALYZE|VERBOSE|COSTS|SETTINGS|GENERIC_PLAN|BUFFERS|WAL|TIMING|SUMMARY|MEMORY|STATS"))
COMPLETE_WITH("ON", "OFF");
else if (TailMatches("SERIALIZE"))
COMPLETE_WITH("TEXT", "NONE", "BINARY");
--
2.43.5
0001-Add-a-new-option-STATS-to-EXPLAIN-command-r3.patchapplication/octet-stream; name=0001-Add-a-new-option-STATS-to-EXPLAIN-command-r3.patchDownload
From 82ba847cdbd1074b8f630827b20eedefc6a451b9 Mon Sep 17 00:00:00 2001
From: Tatsuro Yamada <yamatattsu@gmail.com>
Date: Mon, 10 Feb 2025 08:33:23 +0900
Subject: [PATCH 1/3] Add a new option STATS to EXPLAIN command
This patch allows to show applied extended statistics in EXPLAIN command output.
It includes the following points:
- Rebased on 9926f854
- Added a new struct Applied_ExtStats in plannode.h (T4)
- Hopefully this will solve the issue related to "-DCOPY_PARSE_PLAN_TREES"
with cfbot.
- To pass the extended statistics information from path to plan,
it might be more appropriate to define a new structure in primnode.h
rather than plannode.h. Any advice would be appreciated.
- Handled EXPLAIN(STATS, VERBOSE) option (M2)
- Before the fix, schema name was always added to extended statistics name,
but with this patch, schema name is added to the following only when
VERBOSE option is selected:
- Extended statistics name, table name, and column name
- Added Supported extended statistics types in document (M4)
---
doc/src/sgml/ref/explain.sgml | 14 +++
src/backend/commands/explain.c | 140 ++++++++++++++++++++++
src/backend/nodes/makefuncs.c | 11 ++
src/backend/optimizer/plan/createplan.c | 17 +++
src/backend/optimizer/util/relnode.c | 12 ++
src/backend/optimizer/util/restrictinfo.c | 35 ++++++
src/backend/statistics/extended_stats.c | 8 ++
src/backend/utils/adt/selfuncs.c | 15 +++
src/backend/utils/cache/lsyscache.c | 49 ++++++++
src/include/commands/explain.h | 1 +
src/include/nodes/makefuncs.h | 2 +
src/include/nodes/parsenodes.h | 3 +
src/include/nodes/pathnodes.h | 5 +
src/include/nodes/plannodes.h | 15 +++
src/include/optimizer/restrictinfo.h | 2 +
src/include/utils/lsyscache.h | 3 +
16 files changed, 332 insertions(+)
diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml
index 6361a14e65d..176ff4b0f2f 100644
--- a/doc/src/sgml/ref/explain.sgml
+++ b/doc/src/sgml/ref/explain.sgml
@@ -43,6 +43,7 @@ EXPLAIN [ ( <replaceable class="parameter">option</replaceable> [, ...] ) ] <rep
BUFFERS [ <replaceable class="parameter">boolean</replaceable> ]
SERIALIZE [ { NONE | TEXT | BINARY } ]
WAL [ <replaceable class="parameter">boolean</replaceable> ]
+ STATS [ <replaceable class="parameter">boolean</replaceable> ]
TIMING [ <replaceable class="parameter">boolean</replaceable> ]
SUMMARY [ <replaceable class="parameter">boolean</replaceable> ]
MEMORY [ <replaceable class="parameter">boolean</replaceable> ]
@@ -250,6 +251,19 @@ ROLLBACK;
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><literal>STATS</literal></term>
+ <listitem>
+ <para>
+ Include information on applied <literal>Extended statistics</literal>.
+ Specifically, include the names of extended statistics and clauses.
+ Supported extended statistics types are Dependencies and MCV.
+ See <xref linkend="planner-stats-extended"/> for details about extended
+ statistics. This parameter defaults to <literal>FALSE</literal>.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry>
<term><literal>TIMING</literal></term>
<listitem>
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index c24e66f82e1..8aac5736864 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -94,6 +94,15 @@ static void show_qual(List *qual, const char *qlabel,
static void show_scan_qual(List *qual, const char *qlabel,
PlanState *planstate, List *ancestors,
ExplainState *es);
+static char *deparse_stat_expression(Node *node,
+ PlanState *planstate, List *ancestors,
+ bool useprefix, ExplainState *es);
+static char *show_stat_qual(List *qual, int is_or,
+ PlanState *planstate, List *ancestors,
+ bool useprefix, ExplainState *es);
+static void show_scan_stats(List *stats, List *clauses, List *ors,
+ PlanState *planstate, List *ancestors,
+ ExplainState *es);
static void show_upper_qual(List *qual, const char *qlabel,
PlanState *planstate, List *ancestors,
ExplainState *es);
@@ -223,6 +232,8 @@ ExplainQuery(ParseState *pstate, ExplainStmt *stmt,
es->settings = defGetBoolean(opt);
else if (strcmp(opt->defname, "generic_plan") == 0)
es->generic = defGetBoolean(opt);
+ else if (strcmp(opt->defname, "stats") == 0)
+ es->stats = defGetBoolean(opt);
else if (strcmp(opt->defname, "timing") == 0)
{
timing_set = true;
@@ -497,6 +508,11 @@ standard_ExplainOneQuery(Query *query, int cursorOptions,
if (es->buffers)
bufusage_start = pgBufferUsage;
+
+ /* if this flag is true, applied ext stats are stored */
+ if (es->stats)
+ query->isExplain_Stats = true;
+
INSTR_TIME_SET_CURRENT(planstart);
/* plan the query */
@@ -2114,6 +2130,11 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->stats)
+ show_scan_stats(plan->app_extstats->applied_stats,
+ plan->app_extstats->applied_clauses,
+ plan->app_extstats->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_IndexOnlyScan:
show_scan_qual(((IndexOnlyScan *) plan)->indexqual,
@@ -2130,10 +2151,20 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (es->analyze)
ExplainPropertyFloat("Heap Fetches", NULL,
planstate->instrument->ntuples2, 0, es);
+ if (es->stats)
+ show_scan_stats(plan->app_extstats->applied_stats,
+ plan->app_extstats->applied_clauses,
+ plan->app_extstats->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_BitmapIndexScan:
show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig,
"Index Cond", planstate, ancestors, es);
+ if (es->stats)
+ show_scan_stats(plan->app_extstats->applied_stats,
+ plan->app_extstats->applied_clauses,
+ plan->app_extstats->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_BitmapHeapScan:
show_scan_qual(((BitmapHeapScan *) plan)->bitmapqualorig,
@@ -2164,6 +2195,12 @@ ExplainNode(PlanState *planstate, List *ancestors,
planstate, es);
if (IsA(plan, CteScan))
show_ctescan_info(castNode(CteScanState, planstate), es);
+
+ if (es->stats)
+ show_scan_stats(plan->app_extstats->applied_stats,
+ plan->app_extstats->applied_clauses,
+ plan->app_extstats->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_Gather:
{
@@ -2231,6 +2268,11 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->stats)
+ show_scan_stats(plan->app_extstats->applied_stats,
+ plan->app_extstats->applied_clauses,
+ plan->app_extstats->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_TableFuncScan:
if (es->verbose)
@@ -2345,6 +2387,11 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->stats)
+ show_scan_stats(plan->app_extstats->applied_stats,
+ plan->app_extstats->applied_clauses,
+ plan->app_extstats->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_WindowAgg:
show_upper_qual(plan->qual, "Filter", planstate, ancestors, es);
@@ -2361,6 +2408,11 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->stats)
+ show_scan_stats(plan->app_extstats->applied_stats,
+ plan->app_extstats->applied_clauses,
+ plan->app_extstats->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_Sort:
show_sort_keys(castNode(SortState, planstate), ancestors, es);
@@ -2687,6 +2739,94 @@ show_scan_qual(List *qual, const char *qlabel,
show_qual(qual, qlabel, planstate, ancestors, useprefix, es);
}
+/*
+ * Show a generic expression
+ */
+static char *
+deparse_stat_expression(Node *node,
+ PlanState *planstate, List *ancestors,
+ bool useprefix, ExplainState *es)
+{
+ List *context;
+
+ /* Set up deparsing context */
+ context = set_deparse_context_plan(es->deparse_cxt,
+ planstate->plan,
+ ancestors);
+
+ /* Deparse the expression */
+ return deparse_expression(node, context, useprefix, false);
+}
+
+/*
+ * Show a qualifier expression for extended stats
+ */
+static char *
+show_stat_qual(List *qual, int is_or,
+ PlanState *planstate, List *ancestors,
+ bool useprefix, ExplainState *es)
+{
+ Node *node;
+
+ /* No work if empty qual */
+ if (qual == NIL)
+ return NULL;
+
+ /* Convert AND list to explicit AND */
+ switch (is_or)
+ {
+ case 0:
+ node = (Node *) make_ands_explicit(qual);
+ break;
+ case 1:
+ node = (Node *) make_ors_explicit(qual);
+ break;
+ case 2:
+ /* Extended stats for GROUP BY clause should be comma separeted string */
+ node = (Node *) qual;
+ break;
+ default:
+ elog(ERROR, "unexpected value: %d", is_or);
+ break;
+ }
+
+ /* And show it */
+ return deparse_stat_expression(node, planstate, ancestors, useprefix, es);
+}
+
+/*
+ * Show applied statistics for scan/agg/group plan node
+ */
+static void
+show_scan_stats(List *stats, List *clauses, List *ors,
+ PlanState *planstate, List *ancestors, ExplainState *es)
+{
+ ListCell *lc1, *lc2, *lc3;
+ StringInfoData str;
+ bool useprefix;
+
+ useprefix = es->verbose;
+
+ forthree (lc1, stats, lc2, clauses, lc3, ors)
+ {
+ StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(lc1);
+ List *applied_clauses = (List *) lfirst(lc2);
+ int is_or = lfirst_int(lc3);
+
+ initStringInfo(&str);
+
+ if (useprefix)
+ appendStringInfo(&str, "%s.",
+ get_namespace_name(get_statistics_namespace(stat->statOid)));
+
+ appendStringInfo(&str, "%s Clauses: %s",
+ get_statistics_name(stat->statOid),
+ show_stat_qual(applied_clauses, is_or, planstate, ancestors, useprefix, es));
+
+ ExplainPropertyText("Ext Stats", str.data, es);
+ }
+}
+
/*
* Show a qualifier expression for an upper-level plan node
*/
diff --git a/src/backend/nodes/makefuncs.c b/src/backend/nodes/makefuncs.c
index 007612563ca..c09667a8b42 100644
--- a/src/backend/nodes/makefuncs.c
+++ b/src/backend/nodes/makefuncs.c
@@ -759,6 +759,17 @@ make_ands_explicit(List *andclauses)
return make_andclause(andclauses);
}
+Expr *
+make_ors_explicit(List *orclauses)
+{
+ if (orclauses == NIL)
+ return (Expr *) makeBoolConst(true, false);
+ else if (list_length(orclauses) == 1)
+ return (Expr *) linitial(orclauses);
+ else
+ return make_orclause(orclauses);
+}
+
List *
make_ands_implicit(Expr *clause)
{
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 816a2b2a576..7a92799700e 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -5455,6 +5455,8 @@ order_qual_clauses(PlannerInfo *root, List *clauses)
static void
copy_generic_path_info(Plan *dest, Path *src)
{
+ ListCell *lc;
+
dest->disabled_nodes = src->disabled_nodes;
dest->startup_cost = src->startup_cost;
dest->total_cost = src->total_cost;
@@ -5462,6 +5464,21 @@ copy_generic_path_info(Plan *dest, Path *src)
dest->plan_width = src->pathtarget->width;
dest->parallel_aware = src->parallel_aware;
dest->parallel_safe = src->parallel_safe;
+
+ /* Is this the right place to use makeNode()? */
+ dest->app_extstats = makeNode(Applied_ExtStats);
+ dest->app_extstats->applied_stats = src->parent->applied_stats;
+ dest->app_extstats->applied_clauses_or = src->parent->applied_clauses_or;
+ dest->app_extstats->applied_clauses = NIL;
+
+ foreach (lc, src->parent->applied_clauses)
+ {
+ List *clauses = (List *) lfirst(lc);
+
+ dest->app_extstats->applied_clauses
+ = lappend(dest->app_extstats->applied_clauses,
+ maybe_extract_actual_clauses(clauses, false));
+ }
}
/*
diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c
index ff507331a06..b46ad85cba4 100644
--- a/src/backend/optimizer/util/relnode.c
+++ b/src/backend/optimizer/util/relnode.c
@@ -287,6 +287,10 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent)
rel->partexprs = NULL;
rel->nullable_partexprs = NULL;
+ rel->applied_stats = NIL;
+ rel->applied_clauses = NIL;
+ rel->applied_clauses_or = NIL;
+
/*
* Pass assorted information down the inheritance hierarchy.
*/
@@ -769,6 +773,10 @@ build_join_rel(PlannerInfo *root,
joinrel->partexprs = NULL;
joinrel->nullable_partexprs = NULL;
+ joinrel->applied_stats = NIL;
+ joinrel->applied_clauses = NIL;
+ joinrel->applied_clauses_or = NIL;
+
/* Compute information relevant to the foreign relations. */
set_foreign_rel_properties(joinrel, outer_rel, inner_rel);
@@ -953,6 +961,10 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel,
joinrel->partexprs = NULL;
joinrel->nullable_partexprs = NULL;
+ joinrel->applied_stats = NIL;
+ joinrel->applied_clauses = NIL;
+ joinrel->applied_clauses_or = NIL;
+
/* Compute information relevant to foreign relations. */
set_foreign_rel_properties(joinrel, outer_rel, inner_rel);
diff --git a/src/backend/optimizer/util/restrictinfo.c b/src/backend/optimizer/util/restrictinfo.c
index a80083d2323..b6cc5d7c2e2 100644
--- a/src/backend/optimizer/util/restrictinfo.c
+++ b/src/backend/optimizer/util/restrictinfo.c
@@ -499,6 +499,41 @@ extract_actual_clauses(List *restrictinfo_list,
return result;
}
+/*
+ * maybe_extract_actual_clauses
+ *
+ * Just like extract_actual_clauses, but does not require the clauses to
+ * already be RestrictInfo.
+ *
+ * XXX Does not handle RestrictInfos nested in OR clauses.
+ */
+List *
+maybe_extract_actual_clauses(List *restrictinfo_list,
+ bool pseudoconstant)
+{
+ List *result = NIL;
+ ListCell *l;
+
+ foreach(l, restrictinfo_list)
+ {
+ RestrictInfo *rinfo;
+ Node *node = (Node *) lfirst(l);
+
+ if (!IsA(node, RestrictInfo))
+ {
+ result = lappend(result, node);
+ continue;
+ }
+
+ rinfo = (RestrictInfo *) node;
+
+ if (rinfo->pseudoconstant == pseudoconstant)
+ result = lappend(result, rinfo->clause);
+ }
+
+ return result;
+}
+
/*
* extract_actual_join_clauses
*
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index a8b63ec0884..c7367252aee 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -1854,6 +1854,14 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
list_exprs[listidx] = NULL;
}
+ /* add it to the list of applied stats/clauses, if this flag is true */
+ if (root->parse->isExplain_Stats)
+ {
+ rel->applied_stats = lappend(rel->applied_stats, stat);
+ rel->applied_clauses = lappend(rel->applied_clauses, stat_clauses);
+ rel->applied_clauses_or = lappend_int(rel->applied_clauses_or, (is_or) ? 1 : 0);
+ }
+
if (is_or)
{
bool *or_matches = NULL;
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index d3d1e485bb2..3258a3517a3 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -4083,6 +4083,7 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
ListCell *lc2;
Bitmapset *matched = NULL;
AttrNumber attnum_offset;
+ List *matched_exprs = NIL;
/*
* How much we need to offset the attnums? If there are no
@@ -4130,6 +4131,9 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
matched = bms_add_member(matched, attnum);
+ /* track expressions matched by this statistics */
+ matched_exprs = lappend(matched_exprs, varinfo->var);
+
found = true;
}
@@ -4158,6 +4162,9 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
matched = bms_add_member(matched, attnum);
+ /* track expressions matched by this statistics */
+ matched_exprs = lappend(matched_exprs, expr);
+
/* there should be just one matching expression */
break;
}
@@ -4166,6 +4173,14 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
}
}
+ /* add it to the list of applied stats/clauses, if this flag is true */
+ if (root->parse->isExplain_Stats)
+ {
+ rel->applied_stats = lappend(rel->applied_stats, matched_info);
+ rel->applied_clauses = lappend(rel->applied_clauses, matched_exprs);
+ rel->applied_clauses_or = lappend_int(rel->applied_clauses_or, 2); /* 2: Use comma to deparse */
+ }
+
/* Find the specific item that exactly matches the combination */
for (i = 0; i < stats->nitems; i++)
{
diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c
index bcfa5cb4add..36e3f72b041 100644
--- a/src/backend/utils/cache/lsyscache.c
+++ b/src/backend/utils/cache/lsyscache.c
@@ -36,6 +36,7 @@
#include "catalog/pg_publication.h"
#include "catalog/pg_range.h"
#include "catalog/pg_statistic.h"
+#include "catalog/pg_statistic_ext.h"
#include "catalog/pg_subscription.h"
#include "catalog/pg_transform.h"
#include "catalog/pg_type.h"
@@ -3741,3 +3742,51 @@ get_subscription_name(Oid subid, bool missing_ok)
return subname;
}
+
+/*
+ * get_statistics_name
+ * Returns the name of a given extended statistics
+ *
+ * Returns a palloc'd copy of the string, or NULL if no such name.
+ */
+char *
+get_statistics_name(Oid stxid)
+{
+ HeapTuple tp;
+
+ tp = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(stxid));
+ if (HeapTupleIsValid(tp))
+ {
+ Form_pg_statistic_ext stxtup = (Form_pg_statistic_ext) GETSTRUCT(tp);
+ char *result;
+
+ result = pstrdup(NameStr(stxtup->stxname));
+ ReleaseSysCache(tp);
+ return result;
+ }
+ else
+ return NULL;
+}
+
+/*
+ * get_statistics_namespace
+ * Returns the namespace OID of a given extended statistics
+ */
+Oid
+get_statistics_namespace(Oid stxid)
+{
+ HeapTuple tp;
+
+ tp = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(stxid));
+ if (HeapTupleIsValid(tp))
+ {
+ Form_pg_statistic_ext stxtup = (Form_pg_statistic_ext) GETSTRUCT(tp);
+ Oid result;
+
+ result = stxtup->stxnamespace;
+ ReleaseSysCache(tp);
+ return result;
+ }
+ else
+ return InvalidOid;
+}
diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h
index ea7419951f4..797075126a0 100644
--- a/src/include/commands/explain.h
+++ b/src/include/commands/explain.h
@@ -55,6 +55,7 @@ typedef struct ExplainState
bool memory; /* print planner's memory usage information */
bool settings; /* print modified settings */
bool generic; /* generate a generic plan */
+ bool stats; /* print applied extended stats */
ExplainSerializeOption serialize; /* serialize the query's output? */
ExplainFormat format; /* output format */
/* state for output formatting --- not reset for each new plan tree */
diff --git a/src/include/nodes/makefuncs.h b/src/include/nodes/makefuncs.h
index 5473ce9a288..b32aa762c9b 100644
--- a/src/include/nodes/makefuncs.h
+++ b/src/include/nodes/makefuncs.h
@@ -95,6 +95,8 @@ extern Node *make_and_qual(Node *qual1, Node *qual2);
extern Expr *make_ands_explicit(List *andclauses);
extern List *make_ands_implicit(Expr *clause);
+extern Expr *make_ors_explicit(List *orclauses);
+
extern IndexInfo *makeIndexInfo(int numattrs, int numkeyattrs, Oid amoid,
List *expressions, List *predicates,
bool unique, bool nulls_not_distinct,
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index 8dd421fa0ef..27b48629e66 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -249,6 +249,9 @@ typedef struct Query
ParseLoc stmt_location;
/* length in bytes; 0 means "rest of string" */
ParseLoc stmt_len pg_node_attr(query_jumble_ignore);
+
+ /* if true, query is explain with stats option */
+ bool isExplain_Stats;
} Query;
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 00c700cc3e7..569d876fdfe 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -1068,6 +1068,11 @@ typedef struct RelOptInfo
List **partexprs pg_node_attr(read_write_ignore);
/* Nullable partition key expressions */
List **nullable_partexprs pg_node_attr(read_write_ignore);
+
+ /* info about applied extended statistics */
+ List *applied_stats; /* list of StatisticExtInfo */
+ List *applied_clauses; /* list of lists of clauses */
+ List *applied_clauses_or; /* are the clauses AND, OR, or Comma */
} RelOptInfo;
/*
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 2a2cf816cb6..39d2d131348 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -177,6 +177,9 @@ typedef struct Plan
*/
Bitmapset *extParam;
Bitmapset *allParam;
+
+ /* info about applied extended statistics */
+ struct Applied_ExtStats *app_extstats;
} Plan;
/* ----------------
@@ -1617,4 +1620,16 @@ typedef enum MonotonicFunction
MONOTONICFUNC_BOTH = MONOTONICFUNC_INCREASING | MONOTONICFUNC_DECREASING,
} MonotonicFunction;
+/*
+ * Applied_ExtStats - Information to show applied Extend Statistics
+ *
+ */
+typedef struct Applied_ExtStats
+{
+ NodeTag type;
+ List *applied_stats;
+ List *applied_clauses;
+ List *applied_clauses_or;
+} Applied_ExtStats;
+
#endif /* PLANNODES_H */
diff --git a/src/include/optimizer/restrictinfo.h b/src/include/optimizer/restrictinfo.h
index ec91fc9c583..0c34af43138 100644
--- a/src/include/optimizer/restrictinfo.h
+++ b/src/include/optimizer/restrictinfo.h
@@ -50,6 +50,8 @@ extern bool restriction_is_securely_promotable(RestrictInfo *restrictinfo,
extern List *get_actual_clauses(List *restrictinfo_list);
extern List *extract_actual_clauses(List *restrictinfo_list,
bool pseudoconstant);
+extern List *maybe_extract_actual_clauses(List *restrictinfo_list,
+ bool pseudoconstant);
extern void extract_actual_join_clauses(List *restrictinfo_list,
Relids joinrelids,
List **joinquals,
diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h
index 6fab7aa6009..f1e73e389c8 100644
--- a/src/include/utils/lsyscache.h
+++ b/src/include/utils/lsyscache.h
@@ -207,6 +207,9 @@ extern char *get_publication_name(Oid pubid, bool missing_ok);
extern Oid get_subscription_oid(const char *subname, bool missing_ok);
extern char *get_subscription_name(Oid subid, bool missing_ok);
+extern char *get_statistics_name(Oid stxid);
+extern Oid get_statistics_namespace(Oid stxid);
+
#define type_is_array(typid) (get_element_type(typid) != InvalidOid)
/* type_is_array_domain accepts both plain arrays and domains over arrays */
#define type_is_array_domain(typid) (get_base_element_type(typid) != InvalidOid)
--
2.43.5
On 8/2/2025 20:50, Tomas Vondra wrote:
On 1/24/25 11:17, Andrei Lepikhov wrote:
On 11/1/24 12:22, Tatsuro Yamada wrote:
I often use SQL Server to compare execution plans generated by
PostgreSQL, and I appreciate how they display the usage of extended
statistics. They clearly identify which statistics were utilized during
query planning and include them in the summary section of query plan.I admit not knowing what exactly SQL Server shows in the explain, but it
seems helpful to know which part of the plan used which statistics, no?
I wonder if you meant clauses, not a part of the plan. Clauses may be
used on different levels of the join tree, but they are estimated once.
I meant we may just refer to the statistic used by its name in the
node's explanation if this node contains estimated clause. I use
multiple extended statistics. Sometimes, they intersect, and it is hard
to say which one was used for a specific clause.
Imagine we only knew an index was used, but not which node used it and
for what keys. That would be a bit ... useless.
Sure, but I am suspicious about the strong necessity to show a specific
clause estimated by the statistic. It is almost obvious because of the
simple choosing algorithm.
Or what info does the SQL server include in the plan, exactly? Can you
share an example?
I wouldn't say SQL Server designed it ideally. In an XML file, you can
find something like this:
<StatisticsInfo LastUpdate="2024-09-09T21:55:04.43"
ModificationCount="0" SamplingPercent="17.9892"
Statistics="[_WA_Sys_00000001_0A55DF1D]" Table="[_InfoRgX]"
Schema="[dbo]" Database="[DB]"></StatisticsInfo>
The main profit here - you see all the stats involved in estimations
(and their state), even if final plan doesn't contain estimated stuff at
all.
I find this method much easier to implement, as it allows us to see any
usage points - remember that `estimate_num_groups` may be applied in
multiple places and may not always correspond to a node clause.I may be wrong, but I don't quite see why would that be significantly
easier to implement. You still need to track the information somewhere,
and it probably needs for individual Path nodes. Because while building
the plan you don't know which paths will get selected.
In my mind, gathering extended statistics usage data may be implemented
like it is already done for many parameters in the stat collector. It
may help identify whether statistics are used frequently or not.
Although, maybe the selectivity estimation is sufficiently independent
from the exact paths? In which case we might track it at the plan level.
Not only that argument. Usually, when I request a DBMS about an EXPLAIN,
my typical desire is to realise which plan we have and why the optimiser
has skipped another, maybe even more optimal, plan. I wonder if people
frequently have the same intention.
That means you will never see clauses (and their estimations) that were
kicked off the plan - ppi_clauses of a parameterised path, for example.
And it may hide the real mechanic that caused skipping the better plan.
Mentioning all statistics involved in the summary may reveal such info.
The point about estimate_num_groups is good - I think there will be more
cases where linking the extended statistics to a clause will be hard.
But maybe let's not block the whole patch because of that?
Sure, if the current code doesn't block further improvements of showing
all statistics involved in the query planning process.
I just wanted to emphasize the key idea: quite frequently we don't see
in explain stuff that triggered suboptimal plan, because it was
overestimated and excluded: an index, join, clause ... .
Anyway, thanks for answers, I will discover the code more.
--
regards, Andrei Lepikhov
On 2/10/25 10:09, Andrei Lepikhov wrote:
On 8/2/2025 20:50, Tomas Vondra wrote:
On 1/24/25 11:17, Andrei Lepikhov wrote:
On 11/1/24 12:22, Tatsuro Yamada wrote:
I often use SQL Server to compare execution plans generated by
PostgreSQL, and I appreciate how they display the usage of extended
statistics. They clearly identify which statistics were utilized during
query planning and include them in the summary section of query plan.I admit not knowing what exactly SQL Server shows in the explain, but it
seems helpful to know which part of the plan used which statistics, no?I wonder if you meant clauses, not a part of the plan. Clauses may be
used on different levels of the join tree, but they are estimated once.I meant we may just refer to the statistic used by its name in the
node's explanation if this node contains estimated clause. I use
multiple extended statistics. Sometimes, they intersect, and it is hard
to say which one was used for a specific clause.
I did mean "part of the plan", i.e. the actual operation. That being
said, maybe you're right the exact node doesn't matter all that much,
and it'd be better to have a separate list of stats for the whole plan.
That'd probably work better for stuff like estimate_num_groups(), which
is hard to assign to a particular operation ...
Imagine we only knew an index was used, but not which node used it and
for what keys. That would be a bit ... useless.Sure, but I am suspicious about the strong necessity to show a specific
clause estimated by the statistic. It is almost obvious because of the
simple choosing algorithm.
You could say the same thing about indexes, The process of picking an
index is fairly straightforward, same as determining keys and filters
for the index scan. Yet we still show all of this in the plan. The
algorithm may be simple, but the outcomes are far from obvious. And we
may also enhance/rework the algorithm in the future ... I don't think we
should require intimate knowledge of these details from users.
Also, a couple paragraphs back you wrote:
Sometimes, they intersect, and it is hard to say which one was used
for a specific clause.
Doesn't that really contradict your claim that it's "almost obvious"?
Or what info does the SQL server include in the plan, exactly? Can you
share an example?I wouldn't say SQL Server designed it ideally. In an XML file, you can
find something like this:<StatisticsInfo LastUpdate="2024-09-09T21:55:04.43"
ModificationCount="0" SamplingPercent="17.9892"
Statistics="[_WA_Sys_00000001_0A55DF1D]" Table="[_InfoRgX]"
Schema="[dbo]" Database="[DB]"></StatisticsInfo>The main profit here - you see all the stats involved in estimations
(and their state), even if final plan doesn't contain estimated stuff at
all.
OK, that seems very underwhelming. I still think we should show which
clauses were estimated using which statistics object.
I find this method much easier to implement, as it allows us to see any
usage points - remember that `estimate_num_groups` may be applied in
multiple places and may not always correspond to a node clause.I may be wrong, but I don't quite see why would that be significantly
easier to implement. You still need to track the information somewhere,
and it probably needs for individual Path nodes. Because while building
the plan you don't know which paths will get selected.In my mind, gathering extended statistics usage data may be implemented
like it is already done for many parameters in the stat collector. It
may help identify whether statistics are used frequently or not.
Isn't the statistics collector dealing with a completely different use
case? Sure, maybe it would be interesting to track how many queries used
which statistics object / for what purpose. A bit like we track counts
for indexes etc.
But that seems very different from what this patch aims to do, which is
to track info about how a particular query used extended stats.
I suppose we could have a "buffer" to remember which statistics objects
were used by a single statement/transaction, just like we do for tables
or indexes. But I still think we should show clauses estimated by each
statistics object in a given query, and this would not help with that.
Although, maybe the selectivity estimation is sufficiently independent
from the exact paths? In which case we might track it at the plan level.Not only that argument. Usually, when I request a DBMS about an EXPLAIN,
my typical desire is to realise which plan we have and why the optimiser
has skipped another, maybe even more optimal, plan. I wonder if people
frequently have the same intention.
That means you will never see clauses (and their estimations) that were
kicked off the plan - ppi_clauses of a parameterised path, for example.
And it may hide the real mechanic that caused skipping the better plan.
Mentioning all statistics involved in the summary may reveal such info.
I don't quite agree with this argument. Explain is meant to describe one
particular plan, not why some other plans were not selected. So I don't
think we should be adding details about estimating clauses from some
other alternative plan. We're not showing info about costing for other
indexes either, for example.
Also, if we started adding more and more statistics, that'd be just an
extra argument against the "almost obvious" claim. Because not only
you'd have to deduce which clauses were estimated by each object, you'd
also first have to figure out which statistics were used for the plan.
The point about estimate_num_groups is good - I think there will be more
cases where linking the extended statistics to a clause will be hard.
But maybe let's not block the whole patch because of that?Sure, if the current code doesn't block further improvements of showing
all statistics involved in the query planning process.I just wanted to emphasize the key idea: quite frequently we don't see
in explain stuff that triggered suboptimal plan, because it was
overestimated and excluded: an index, join, clause ... .Anyway, thanks for answers, I will discover the code more.
Thanks for the discussion.
I wonder what Yamada-san thinks about these suggestions ... He's the one
actually developing the patch, so I'd like to know his opinions.
regards
--
Tomas Vondra
Hi Tomas and ALL,
I wonder what Yamada-san thinks about these suggestions ... He's the one
actually developing the patch, so I'd like to know his opinions.
I will state my thoughts on the two points of discussion.
I often use SQL Server to compare execution plans generated by
PostgreSQL, and I appreciate how they display the usage of extended
statistics. They clearly identify which statistics were utilized during
query planning and include them in the summary section of query plan.I admit not knowing what exactly SQL Server shows in the explain, but it
seems helpful to know which part of the plan used which statistics, no?I wonder if you meant clauses, not a part of the plan. Clauses may be
used on different levels of the join tree, but they are estimated once.I meant we may just refer to the statistic used by its name in the
node's explanation if this node contains estimated clause. I use
multiple extended statistics. Sometimes, they intersect, and it is hard
to say which one was used for a specific clause.I did mean "part of the plan", i.e. the actual operation. That being
said, maybe you're right the exact node doesn't matter all that much,
and it'd be better to have a separate list of stats for the whole plan.That'd probably work better for stuff like estimate_num_groups(), which
is hard to assign to a particular operation ...
The patch currently under development aims to display the actual usage of
extended statistics on a plan-by-plan basis.
I thought that being able to check the extended statistics used by plan
nodes
in a plan would make it easier to tune plans using extended statistics.
On the other hand, I also think that a mechanism to check the usage of
extended statistics on a DB-by-DB basis would be useful.
I talked about this idea in my talk at PGConf.dev 2024.
This is similar to pg_stat_user_indexes for indexes. If you can find unused
or
infrequently used extended statistics, you can delete unnecessary ones.
However, this functionality is not included in this patch. I plan to
provide it as
a separate patch after the current patch is committed.
I may be wrong, but I don't quite see why would that be significantly
easier to implement. You still need to track the information somewhere,
and it probably needs for individual Path nodes. Because while building
the plan you don't know which paths will get selected.In my mind, gathering extended statistics usage data may be implemented
like it is already done for many parameters in the stat collector. It
may help identify whether statistics are used frequently or not.Isn't the statistics collector dealing with a completely different use
case? Sure, maybe it would be interesting to track how many queries used
which statistics object / for what purpose. A bit like we track counts
for indexes etc.But that seems very different from what this patch aims to do, which is
to track info about how a particular query used extended stats.I suppose we could have a "buffer" to remember which statistics objects
were used by a single statement/transaction, just like we do for tables
or indexes. But I still think we should show clauses estimated by each
statistics object in a given query, and this would not help with that.
Ideally, it would be nice to be able to show which extended statistics
were used for which clauses. However, based on the discussion so far,
this seems difficult to implement.
A second best solution would be to add a summary of the extended statistics
used at the end of the plan. This alone would make it easier for users to
use the extended statistics.
The expected output is as follows:
- Do not output "Clauses: ".
- Display only the names of the extended statistics used.
- If displaying the names of multiple extended statistics,
should they be joined by commas? (Or should Ext Stats: be displayed
on multiple lines?)
e.g.,
explain (stats) select 1 from t where a = 1 and b =1;
## Output from the current patch
QUERY PLAN
--------------------------------------------------------
Seq Scan on t (cost=0.00..1943.00 rows=10210 width=4)
Filter: ((a = 1) AND (b = 1))
Ext Stats: s Clauses: ((a = 1) AND (b = 1))
(3 rows)
## Output for displaying summary extended statistics
QUERY PLAN
--------------------------------------------------------
Seq Scan on t (cost=0.00..1943.00 rows=10210 width=4)
Filter: ((a = 1) AND (b = 1))
Planning:
Ext Stats: s <== Is it acceptable?
If we do not display "Clause" information, I believe that there is
no need to deparse clauses, and therefore no need to improve the deparse
function to enable recursive traversal of expression trees.
To further the discussion, I will modify the patch to display the summary
as shown above and send it to -hackers.
P.S.
I have tried to understand the discussion correctly, but please
let me know if I have misunderstood anything.
Regards,
Tatsuro Yamada
On 2/12/25 06:02, Tatsuro Yamada wrote:
Hi Tomas and ALL,
I wonder what Yamada-san thinks about these suggestions ... He's the one
actually developing the patch, so I'd like to know his opinions.I will state my thoughts on the two points of discussion.
I often use SQL Server to compare execution plans generated by
PostgreSQL, and I appreciate how they display the usage of extended
statistics. They clearly identify which statistics were utilized during
query planning and include them in the summary section of query plan.I admit not knowing what exactly SQL Server shows in the explain, but it
seems helpful to know which part of the plan used which statistics, no?I wonder if you meant clauses, not a part of the plan. Clauses may be
used on different levels of the join tree, but they are estimated once.I meant we may just refer to the statistic used by its name in the
node's explanation if this node contains estimated clause. I use
multiple extended statistics. Sometimes, they intersect, and it is hard
to say which one was used for a specific clause.I did mean "part of the plan", i.e. the actual operation. That being
said, maybe you're right the exact node doesn't matter all that much,
and it'd be better to have a separate list of stats for the whole plan.That'd probably work better for stuff like estimate_num_groups(), which
is hard to assign to a particular operation ...The patch currently under development aims to display the actual usage of
extended statistics on a plan-by-plan basis.
I thought that being able to check the extended statistics used by plan
nodes
in a plan would make it easier to tune plans using extended statistics.On the other hand, I also think that a mechanism to check the usage of
extended statistics on a DB-by-DB basis would be useful.
I talked about this idea in my talk at PGConf.dev 2024.
This is similar to pg_stat_user_indexes for indexes. If you can find
unused or
infrequently used extended statistics, you can delete unnecessary ones.
However, this functionality is not included in this patch. I plan to
provide it as
a separate patch after the current patch is committed.
+1 to treating the DB-level stuff (i.e. system catalog showing how often
was each statistics used, etc.) as a separate feature / patch. I did not
mean to suggest this should be included in this patch, my point was that
it's very different from what this patch aimed to do.
I may be wrong, but I don't quite see why would that be significantly
easier to implement. You still need to track the information somewhere,
and it probably needs for individual Path nodes. Because while building
the plan you don't know which paths will get selected.In my mind, gathering extended statistics usage data may be implemented
like it is already done for many parameters in the stat collector. It
may help identify whether statistics are used frequently or not.Isn't the statistics collector dealing with a completely different use
case? Sure, maybe it would be interesting to track how many queries used
which statistics object / for what purpose. A bit like we track counts
for indexes etc.But that seems very different from what this patch aims to do, which is
to track info about how a particular query used extended stats.I suppose we could have a "buffer" to remember which statistics objects
were used by a single statement/transaction, just like we do for tables
or indexes. But I still think we should show clauses estimated by each
statistics object in a given query, and this would not help with that.Ideally, it would be nice to be able to show which extended statistics
were used for which clauses. However, based on the discussion so far,
this seems difficult to implement.
Maybe I'm missing something, but why would this be difficult to
implement? There was a problem with RestrictInfo nodes, but doesn't
Tom's suggestion [1] to use extract_actual_clauses() address this? I
might have done something silly in the PoC patch.
/messages/by-id/1691487.1733180716@sss.pgh.pa.us
A second best solution would be to add a summary of the extended statistics
used at the end of the plan. This alone would make it easier for users to
use the extended statistics.The expected output is as follows:
- Do not output "Clauses: ".
- Display only the names of the extended statistics used.
- If displaying the names of multiple extended statistics,
should they be joined by commas? (Or should Ext Stats: be displayed
on multiple lines?)
The list of stats is better than no information at all, but I don't
quite see why we couldn't show clauses, and to me the clauses seem like
a pretty important piece of information.
e.g.,
explain (stats) select 1 from t where a = 1 and b =1;## Output from the current patch
QUERY PLAN
--------------------------------------------------------
Seq Scan on t (cost=0.00..1943.00 rows=10210 width=4)
Filter: ((a = 1) AND (b = 1))
Ext Stats: s Clauses: ((a = 1) AND (b = 1))
(3 rows)## Output for displaying summary extended statistics
QUERY PLAN
--------------------------------------------------------
Seq Scan on t (cost=0.00..1943.00 rows=10210 width=4)
Filter: ((a = 1) AND (b = 1))
Planning:
Ext Stats: s <== Is it acceptable?If we do not display "Clause" information, I believe that there is
no need to deparse clauses, and therefore no need to improve the deparse
function to enable recursive traversal of expression trees.
What do you mean by the need to improve the deparse function? Is this
about the nested RestrictInfos? Have you tried doing that Tom suggested
in [1], or does it not address the issue?
To further the discussion, I will modify the patch to display the summary
as shown above and send it to -hackers.P.S.
I have tried to understand the discussion correctly, but please
let me know if I have misunderstood anything.
Thanks for working on this!
regards
--
Tomas Vondra
On 2/10/25 16:56, Tomas Vondra wrote:
On 2/10/25 10:09, Andrei Lepikhov wrote:
On 8/2/2025 20:50, Tomas Vondra wrote:
<StatisticsInfo LastUpdate="2024-09-09T21:55:04.43"
ModificationCount="0" SamplingPercent="17.9892"
Statistics="[_WA_Sys_00000001_0A55DF1D]" Table="[_InfoRgX]"
Schema="[dbo]" Database="[DB]"></StatisticsInfo>The main profit here - you see all the stats involved in estimations
(and their state), even if final plan doesn't contain estimated stuff at
all.OK, that seems very underwhelming. I still think we should show which
clauses were estimated using which statistics object.
To understand how it may work, I employed the EXPLAIN extensibility
introduced in PG 18 to show the use of plain statistics [1]https://github.com/danolivo/pg_index_stats. It looks
like the following:
EXPLAIN (COSTS OFF, STAT ON)
SELECT * FROM sc_a WHERE x=1 AND y LIKE 'a';
Seq Scan on sc_a
Filter: ((y ~~ 'a'::text) AND (x = 1))
Statistics:
"sc_a.y: 1 times, stats: { MCV: 10 values, Correlation,
ndistinct: 10.0000, nullfrac: 0.0000, width: 5 }
"sc_a.x: 1 times, stats: { Histogram: 0 values, Correlation,
ndistinct: -1.0000, nullfrac: 0.0000, width: 4 }
As you can see, stat usage is summarised at the end of the EXPLAIN. It
contains information about the column, how many times it was used and
the parameters of statistic slots.
Of course, being an extension it is constrained a lot, but even there is
the profit:
1. You may see types of statistics exist on the column
2. Precision of the histogram or MCV (statistic_target) on a specific
table - some users forget to increase it on large (or partitioned) tables
3. You have basic stat like nullfrac, ndistinct without the necessity to
teach personnel how to gather it on a production instance safely.
Also, using it in real cases, I realised that it would be highly
profitable to specify which statistic type was used to estimate this
specific clause.
Of course, extended statistics have their own specifics, which may
require another output format. Just consider this example too.
[1]: https://github.com/danolivo/pg_index_stats
--
regards, Andrei Lepikhov
I rebased this patch series; here it applies to current master. I
didn't review it or change anything.
--
Álvaro Herrera Breisgau, Deutschland — https://www.EnterpriseDB.com/
"Hay que recordar que la existencia en el cosmos, y particularmente la
elaboración de civilizaciones dentro de él no son, por desgracia,
nada idílicas" (Ijon Tichy)
Attachments:
0001-Add-a-new-option-STATS-to-EXPLAIN-command.patchtext/x-diff; charset=utf-8Download
From ff2cc418d59bd32e92617fca6f8c2f5f8f4951ad Mon Sep 17 00:00:00 2001
From: Tatsuro Yamada <yamatattsu@gmail.com>
Date: Mon, 10 Feb 2025 08:33:23 +0900
Subject: [PATCH 1/3] Add a new option STATS to EXPLAIN command
This patch allows to show applied extended statistics in EXPLAIN command output.
It includes the following points:
- Rebased on 9926f854
- Added a new struct Applied_ExtStats in plannode.h (T4)
- Hopefully this will solve the issue related to "-DCOPY_PARSE_PLAN_TREES"
with cfbot.
- To pass the extended statistics information from path to plan,
it might be more appropriate to define a new structure in primnode.h
rather than plannode.h. Any advice would be appreciated.
- Handled EXPLAIN(STATS, VERBOSE) option (M2)
- Before the fix, schema name was always added to extended statistics name,
but with this patch, schema name is added to the following only when
VERBOSE option is selected:
- Extended statistics name, table name, and column name
- Added Supported extended statistics types in document (M4)
---
doc/src/sgml/ref/explain.sgml | 14 +++
src/backend/commands/explain.c | 138 ++++++++++++++++++++++
src/backend/commands/explain_state.c | 2 +
src/backend/nodes/makefuncs.c | 11 ++
src/backend/optimizer/plan/createplan.c | 17 +++
src/backend/optimizer/util/relnode.c | 12 ++
src/backend/optimizer/util/restrictinfo.c | 35 ++++++
src/backend/statistics/extended_stats.c | 8 ++
src/backend/utils/adt/selfuncs.c | 15 +++
src/backend/utils/cache/lsyscache.c | 49 ++++++++
src/include/commands/explain_state.h | 1 +
src/include/nodes/makefuncs.h | 2 +
src/include/nodes/parsenodes.h | 3 +
src/include/nodes/pathnodes.h | 5 +
src/include/nodes/plannodes.h | 15 +++
src/include/optimizer/restrictinfo.h | 2 +
src/include/utils/lsyscache.h | 3 +
17 files changed, 332 insertions(+)
diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml
index 6dda680aa0d..6bfe694dae8 100644
--- a/doc/src/sgml/ref/explain.sgml
+++ b/doc/src/sgml/ref/explain.sgml
@@ -43,6 +43,7 @@ EXPLAIN [ ( <replaceable class="parameter">option</replaceable> [, ...] ) ] <rep
BUFFERS [ <replaceable class="parameter">boolean</replaceable> ]
SERIALIZE [ { NONE | TEXT | BINARY } ]
WAL [ <replaceable class="parameter">boolean</replaceable> ]
+ STATS [ <replaceable class="parameter">boolean</replaceable> ]
TIMING [ <replaceable class="parameter">boolean</replaceable> ]
SUMMARY [ <replaceable class="parameter">boolean</replaceable> ]
MEMORY [ <replaceable class="parameter">boolean</replaceable> ]
@@ -249,6 +250,19 @@ ROLLBACK;
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><literal>STATS</literal></term>
+ <listitem>
+ <para>
+ Include information on applied <literal>Extended statistics</literal>.
+ Specifically, include the names of extended statistics and clauses.
+ Supported extended statistics types are Dependencies and MCV.
+ See <xref linkend="planner-stats-extended"/> for details about extended
+ statistics. This parameter defaults to <literal>FALSE</literal>.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry>
<term><literal>TIMING</literal></term>
<listitem>
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 8345bc0264b..53593775939 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -87,6 +87,15 @@ static void show_qual(List *qual, const char *qlabel,
static void show_scan_qual(List *qual, const char *qlabel,
PlanState *planstate, List *ancestors,
ExplainState *es);
+static char *deparse_stat_expression(Node *node,
+ PlanState *planstate, List *ancestors,
+ bool useprefix, ExplainState *es);
+static char *show_stat_qual(List *qual, int is_or,
+ PlanState *planstate, List *ancestors,
+ bool useprefix, ExplainState *es);
+static void show_scan_stats(List *stats, List *clauses, List *ors,
+ PlanState *planstate, List *ancestors,
+ ExplainState *es);
static void show_upper_qual(List *qual, const char *qlabel,
PlanState *planstate, List *ancestors,
ExplainState *es);
@@ -347,6 +356,11 @@ standard_ExplainOneQuery(Query *query, int cursorOptions,
if (es->buffers)
bufusage_start = pgBufferUsage;
+
+ /* if this flag is true, applied ext stats are stored */
+ if (es->stats)
+ query->isExplain_Stats = true;
+
INSTR_TIME_SET_CURRENT(planstart);
/* plan the query */
@@ -1966,6 +1980,11 @@ ExplainNode(PlanState *planstate, List *ancestors,
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
show_indexsearches_info(planstate, es);
+ if (es->stats)
+ show_scan_stats(plan->app_extstats->applied_stats,
+ plan->app_extstats->applied_clauses,
+ plan->app_extstats->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_IndexOnlyScan:
show_scan_qual(((IndexOnlyScan *) plan)->indexqual,
@@ -1983,11 +2002,21 @@ ExplainNode(PlanState *planstate, List *ancestors,
ExplainPropertyFloat("Heap Fetches", NULL,
planstate->instrument->ntuples2, 0, es);
show_indexsearches_info(planstate, es);
+ if (es->stats)
+ show_scan_stats(plan->app_extstats->applied_stats,
+ plan->app_extstats->applied_clauses,
+ plan->app_extstats->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_BitmapIndexScan:
show_scan_qual(((BitmapIndexScan *) plan)->indexqualorig,
"Index Cond", planstate, ancestors, es);
show_indexsearches_info(planstate, es);
+ if (es->stats)
+ show_scan_stats(plan->app_extstats->applied_stats,
+ plan->app_extstats->applied_clauses,
+ plan->app_extstats->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_BitmapHeapScan:
show_scan_qual(((BitmapHeapScan *) plan)->bitmapqualorig,
@@ -2018,6 +2047,12 @@ ExplainNode(PlanState *planstate, List *ancestors,
planstate, es);
if (IsA(plan, CteScan))
show_ctescan_info(castNode(CteScanState, planstate), es);
+
+ if (es->stats)
+ show_scan_stats(plan->app_extstats->applied_stats,
+ plan->app_extstats->applied_clauses,
+ plan->app_extstats->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_Gather:
{
@@ -2085,6 +2120,11 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->stats)
+ show_scan_stats(plan->app_extstats->applied_stats,
+ plan->app_extstats->applied_clauses,
+ plan->app_extstats->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_TableFuncScan:
if (es->verbose)
@@ -2199,6 +2239,11 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->stats)
+ show_scan_stats(plan->app_extstats->applied_stats,
+ plan->app_extstats->applied_clauses,
+ plan->app_extstats->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_WindowAgg:
show_window_def(castNode(WindowAggState, planstate), ancestors, es);
@@ -2216,6 +2261,11 @@ ExplainNode(PlanState *planstate, List *ancestors,
if (plan->qual)
show_instrumentation_count("Rows Removed by Filter", 1,
planstate, es);
+ if (es->stats)
+ show_scan_stats(plan->app_extstats->applied_stats,
+ plan->app_extstats->applied_clauses,
+ plan->app_extstats->applied_clauses_or,
+ planstate, ancestors, es);
break;
case T_Sort:
show_sort_keys(castNode(SortState, planstate), ancestors, es);
@@ -2547,6 +2597,94 @@ show_scan_qual(List *qual, const char *qlabel,
show_qual(qual, qlabel, planstate, ancestors, useprefix, es);
}
+/*
+ * Show a generic expression
+ */
+static char *
+deparse_stat_expression(Node *node,
+ PlanState *planstate, List *ancestors,
+ bool useprefix, ExplainState *es)
+{
+ List *context;
+
+ /* Set up deparsing context */
+ context = set_deparse_context_plan(es->deparse_cxt,
+ planstate->plan,
+ ancestors);
+
+ /* Deparse the expression */
+ return deparse_expression(node, context, useprefix, false);
+}
+
+/*
+ * Show a qualifier expression for extended stats
+ */
+static char *
+show_stat_qual(List *qual, int is_or,
+ PlanState *planstate, List *ancestors,
+ bool useprefix, ExplainState *es)
+{
+ Node *node;
+
+ /* No work if empty qual */
+ if (qual == NIL)
+ return NULL;
+
+ /* Convert AND list to explicit AND */
+ switch (is_or)
+ {
+ case 0:
+ node = (Node *) make_ands_explicit(qual);
+ break;
+ case 1:
+ node = (Node *) make_ors_explicit(qual);
+ break;
+ case 2:
+ /* Extended stats for GROUP BY clause should be comma separeted string */
+ node = (Node *) qual;
+ break;
+ default:
+ elog(ERROR, "unexpected value: %d", is_or);
+ break;
+ }
+
+ /* And show it */
+ return deparse_stat_expression(node, planstate, ancestors, useprefix, es);
+}
+
+/*
+ * Show applied statistics for scan/agg/group plan node
+ */
+static void
+show_scan_stats(List *stats, List *clauses, List *ors,
+ PlanState *planstate, List *ancestors, ExplainState *es)
+{
+ ListCell *lc1, *lc2, *lc3;
+ StringInfoData str;
+ bool useprefix;
+
+ useprefix = es->verbose;
+
+ forthree (lc1, stats, lc2, clauses, lc3, ors)
+ {
+ StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(lc1);
+ List *applied_clauses = (List *) lfirst(lc2);
+ int is_or = lfirst_int(lc3);
+
+ initStringInfo(&str);
+
+ if (useprefix)
+ appendStringInfo(&str, "%s.",
+ get_namespace_name(get_statistics_namespace(stat->statOid)));
+
+ appendStringInfo(&str, "%s Clauses: %s",
+ get_statistics_name(stat->statOid),
+ show_stat_qual(applied_clauses, is_or, planstate, ancestors, useprefix, es));
+
+ ExplainPropertyText("Ext Stats", str.data, es);
+ }
+}
+
/*
* Show a qualifier expression for an upper-level plan node
*/
diff --git a/src/backend/commands/explain_state.c b/src/backend/commands/explain_state.c
index 60d98d63a62..5d1ff461a7d 100644
--- a/src/backend/commands/explain_state.c
+++ b/src/backend/commands/explain_state.c
@@ -103,6 +103,8 @@ ParseExplainOptionList(ExplainState *es, List *options, ParseState *pstate)
es->settings = defGetBoolean(opt);
else if (strcmp(opt->defname, "generic_plan") == 0)
es->generic = defGetBoolean(opt);
+ else if (strcmp(opt->defname, "stats") == 0)
+ es->stats = defGetBoolean(opt);
else if (strcmp(opt->defname, "timing") == 0)
{
timing_set = true;
diff --git a/src/backend/nodes/makefuncs.c b/src/backend/nodes/makefuncs.c
index e2d9e9be41a..fe021f35e05 100644
--- a/src/backend/nodes/makefuncs.c
+++ b/src/backend/nodes/makefuncs.c
@@ -806,6 +806,17 @@ make_ands_explicit(List *andclauses)
return make_andclause(andclauses);
}
+Expr *
+make_ors_explicit(List *orclauses)
+{
+ if (orclauses == NIL)
+ return (Expr *) makeBoolConst(true, false);
+ else if (list_length(orclauses) == 1)
+ return (Expr *) linitial(orclauses);
+ else
+ return make_orclause(orclauses);
+}
+
List *
make_ands_implicit(Expr *clause)
{
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index bfefc7dbea1..fbcf64ef264 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -5561,6 +5561,8 @@ order_qual_clauses(PlannerInfo *root, List *clauses)
static void
copy_generic_path_info(Plan *dest, Path *src)
{
+ ListCell *lc;
+
dest->disabled_nodes = src->disabled_nodes;
dest->startup_cost = src->startup_cost;
dest->total_cost = src->total_cost;
@@ -5568,6 +5570,21 @@ copy_generic_path_info(Plan *dest, Path *src)
dest->plan_width = src->pathtarget->width;
dest->parallel_aware = src->parallel_aware;
dest->parallel_safe = src->parallel_safe;
+
+ /* Is this the right place to use makeNode()? */
+ dest->app_extstats = makeNode(Applied_ExtStats);
+ dest->app_extstats->applied_stats = src->parent->applied_stats;
+ dest->app_extstats->applied_clauses_or = src->parent->applied_clauses_or;
+ dest->app_extstats->applied_clauses = NIL;
+
+ foreach (lc, src->parent->applied_clauses)
+ {
+ List *clauses = (List *) lfirst(lc);
+
+ dest->app_extstats->applied_clauses
+ = lappend(dest->app_extstats->applied_clauses,
+ maybe_extract_actual_clauses(clauses, false));
+ }
}
/*
diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c
index ff507331a06..b46ad85cba4 100644
--- a/src/backend/optimizer/util/relnode.c
+++ b/src/backend/optimizer/util/relnode.c
@@ -287,6 +287,10 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent)
rel->partexprs = NULL;
rel->nullable_partexprs = NULL;
+ rel->applied_stats = NIL;
+ rel->applied_clauses = NIL;
+ rel->applied_clauses_or = NIL;
+
/*
* Pass assorted information down the inheritance hierarchy.
*/
@@ -769,6 +773,10 @@ build_join_rel(PlannerInfo *root,
joinrel->partexprs = NULL;
joinrel->nullable_partexprs = NULL;
+ joinrel->applied_stats = NIL;
+ joinrel->applied_clauses = NIL;
+ joinrel->applied_clauses_or = NIL;
+
/* Compute information relevant to the foreign relations. */
set_foreign_rel_properties(joinrel, outer_rel, inner_rel);
@@ -953,6 +961,10 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel,
joinrel->partexprs = NULL;
joinrel->nullable_partexprs = NULL;
+ joinrel->applied_stats = NIL;
+ joinrel->applied_clauses = NIL;
+ joinrel->applied_clauses_or = NIL;
+
/* Compute information relevant to foreign relations. */
set_foreign_rel_properties(joinrel, outer_rel, inner_rel);
diff --git a/src/backend/optimizer/util/restrictinfo.c b/src/backend/optimizer/util/restrictinfo.c
index a80083d2323..b6cc5d7c2e2 100644
--- a/src/backend/optimizer/util/restrictinfo.c
+++ b/src/backend/optimizer/util/restrictinfo.c
@@ -499,6 +499,41 @@ extract_actual_clauses(List *restrictinfo_list,
return result;
}
+/*
+ * maybe_extract_actual_clauses
+ *
+ * Just like extract_actual_clauses, but does not require the clauses to
+ * already be RestrictInfo.
+ *
+ * XXX Does not handle RestrictInfos nested in OR clauses.
+ */
+List *
+maybe_extract_actual_clauses(List *restrictinfo_list,
+ bool pseudoconstant)
+{
+ List *result = NIL;
+ ListCell *l;
+
+ foreach(l, restrictinfo_list)
+ {
+ RestrictInfo *rinfo;
+ Node *node = (Node *) lfirst(l);
+
+ if (!IsA(node, RestrictInfo))
+ {
+ result = lappend(result, node);
+ continue;
+ }
+
+ rinfo = (RestrictInfo *) node;
+
+ if (rinfo->pseudoconstant == pseudoconstant)
+ result = lappend(result, rinfo->clause);
+ }
+
+ return result;
+}
+
/*
* extract_actual_join_clauses
*
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index a8b63ec0884..c7367252aee 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -1854,6 +1854,14 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
list_exprs[listidx] = NULL;
}
+ /* add it to the list of applied stats/clauses, if this flag is true */
+ if (root->parse->isExplain_Stats)
+ {
+ rel->applied_stats = lappend(rel->applied_stats, stat);
+ rel->applied_clauses = lappend(rel->applied_clauses, stat_clauses);
+ rel->applied_clauses_or = lappend_int(rel->applied_clauses_or, (is_or) ? 1 : 0);
+ }
+
if (is_or)
{
bool *or_matches = NULL;
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 17fbfa9b410..e90937bec55 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -4346,6 +4346,7 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
ListCell *lc2;
Bitmapset *matched = NULL;
AttrNumber attnum_offset;
+ List *matched_exprs = NIL;
/*
* How much we need to offset the attnums? If there are no
@@ -4393,6 +4394,9 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
matched = bms_add_member(matched, attnum);
+ /* track expressions matched by this statistics */
+ matched_exprs = lappend(matched_exprs, varinfo->var);
+
found = true;
}
@@ -4421,6 +4425,9 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
matched = bms_add_member(matched, attnum);
+ /* track expressions matched by this statistics */
+ matched_exprs = lappend(matched_exprs, expr);
+
/* there should be just one matching expression */
break;
}
@@ -4429,6 +4436,14 @@ estimate_multivariate_ndistinct(PlannerInfo *root, RelOptInfo *rel,
}
}
+ /* add it to the list of applied stats/clauses, if this flag is true */
+ if (root->parse->isExplain_Stats)
+ {
+ rel->applied_stats = lappend(rel->applied_stats, matched_info);
+ rel->applied_clauses = lappend(rel->applied_clauses, matched_exprs);
+ rel->applied_clauses_or = lappend_int(rel->applied_clauses_or, 2); /* 2: Use comma to deparse */
+ }
+
/* Find the specific item that exactly matches the combination */
for (i = 0; i < stats->nitems; i++)
{
diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c
index c460a72b75d..562ee4a8f9c 100644
--- a/src/backend/utils/cache/lsyscache.c
+++ b/src/backend/utils/cache/lsyscache.c
@@ -36,6 +36,7 @@
#include "catalog/pg_publication.h"
#include "catalog/pg_range.h"
#include "catalog/pg_statistic.h"
+#include "catalog/pg_statistic_ext.h"
#include "catalog/pg_subscription.h"
#include "catalog/pg_transform.h"
#include "catalog/pg_type.h"
@@ -3854,3 +3855,51 @@ get_subscription_name(Oid subid, bool missing_ok)
return subname;
}
+
+/*
+ * get_statistics_name
+ * Returns the name of a given extended statistics
+ *
+ * Returns a palloc'd copy of the string, or NULL if no such name.
+ */
+char *
+get_statistics_name(Oid stxid)
+{
+ HeapTuple tp;
+
+ tp = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(stxid));
+ if (HeapTupleIsValid(tp))
+ {
+ Form_pg_statistic_ext stxtup = (Form_pg_statistic_ext) GETSTRUCT(tp);
+ char *result;
+
+ result = pstrdup(NameStr(stxtup->stxname));
+ ReleaseSysCache(tp);
+ return result;
+ }
+ else
+ return NULL;
+}
+
+/*
+ * get_statistics_namespace
+ * Returns the namespace OID of a given extended statistics
+ */
+Oid
+get_statistics_namespace(Oid stxid)
+{
+ HeapTuple tp;
+
+ tp = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(stxid));
+ if (HeapTupleIsValid(tp))
+ {
+ Form_pg_statistic_ext stxtup = (Form_pg_statistic_ext) GETSTRUCT(tp);
+ Oid result;
+
+ result = stxtup->stxnamespace;
+ ReleaseSysCache(tp);
+ return result;
+ }
+ else
+ return InvalidOid;
+}
diff --git a/src/include/commands/explain_state.h b/src/include/commands/explain_state.h
index 32728f5d1a1..10b4103db83 100644
--- a/src/include/commands/explain_state.h
+++ b/src/include/commands/explain_state.h
@@ -55,6 +55,7 @@ typedef struct ExplainState
bool memory; /* print planner's memory usage information */
bool settings; /* print modified settings */
bool generic; /* generate a generic plan */
+ bool stats; /* print applied extended stats */
ExplainSerializeOption serialize; /* serialize the query's output? */
ExplainFormat format; /* output format */
/* state for output formatting --- not reset for each new plan tree */
diff --git a/src/include/nodes/makefuncs.h b/src/include/nodes/makefuncs.h
index 5473ce9a288..b32aa762c9b 100644
--- a/src/include/nodes/makefuncs.h
+++ b/src/include/nodes/makefuncs.h
@@ -95,6 +95,8 @@ extern Node *make_and_qual(Node *qual1, Node *qual2);
extern Expr *make_ands_explicit(List *andclauses);
extern List *make_ands_implicit(Expr *clause);
+extern Expr *make_ors_explicit(List *orclauses);
+
extern IndexInfo *makeIndexInfo(int numattrs, int numkeyattrs, Oid amoid,
List *expressions, List *predicates,
bool unique, bool nulls_not_distinct,
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index 86a236bd58b..acadd843d69 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -254,6 +254,9 @@ typedef struct Query
ParseLoc stmt_location;
/* length in bytes; 0 means "rest of string" */
ParseLoc stmt_len pg_node_attr(query_jumble_ignore);
+
+ /* if true, query is explain with stats option */
+ bool isExplain_Stats;
} Query;
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index ad2726f026f..081546cc771 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -1075,6 +1075,11 @@ typedef struct RelOptInfo
List **partexprs pg_node_attr(read_write_ignore);
/* Nullable partition key expressions */
List **nullable_partexprs pg_node_attr(read_write_ignore);
+
+ /* info about applied extended statistics */
+ List *applied_stats; /* list of StatisticExtInfo */
+ List *applied_clauses; /* list of lists of clauses */
+ List *applied_clauses_or; /* are the clauses AND, OR, or Comma */
} RelOptInfo;
/*
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 29d7732d6a0..aba7b29eb21 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -239,6 +239,9 @@ typedef struct Plan
*/
Bitmapset *extParam;
Bitmapset *allParam;
+
+ /* info about applied extended statistics */
+ struct Applied_ExtStats *app_extstats;
} Plan;
/* ----------------
@@ -1792,4 +1795,16 @@ typedef enum MonotonicFunction
MONOTONICFUNC_BOTH = MONOTONICFUNC_INCREASING | MONOTONICFUNC_DECREASING,
} MonotonicFunction;
+/*
+ * Applied_ExtStats - Information to show applied Extend Statistics
+ *
+ */
+typedef struct Applied_ExtStats
+{
+ NodeTag type;
+ List *applied_stats;
+ List *applied_clauses;
+ List *applied_clauses_or;
+} Applied_ExtStats;
+
#endif /* PLANNODES_H */
diff --git a/src/include/optimizer/restrictinfo.h b/src/include/optimizer/restrictinfo.h
index ec91fc9c583..0c34af43138 100644
--- a/src/include/optimizer/restrictinfo.h
+++ b/src/include/optimizer/restrictinfo.h
@@ -50,6 +50,8 @@ extern bool restriction_is_securely_promotable(RestrictInfo *restrictinfo,
extern List *get_actual_clauses(List *restrictinfo_list);
extern List *extract_actual_clauses(List *restrictinfo_list,
bool pseudoconstant);
+extern List *maybe_extract_actual_clauses(List *restrictinfo_list,
+ bool pseudoconstant);
extern void extract_actual_join_clauses(List *restrictinfo_list,
Relids joinrelids,
List **joinquals,
diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h
index fa7c7e0323b..88b8f9c1bd6 100644
--- a/src/include/utils/lsyscache.h
+++ b/src/include/utils/lsyscache.h
@@ -211,6 +211,9 @@ extern char *get_publication_name(Oid pubid, bool missing_ok);
extern Oid get_subscription_oid(const char *subname, bool missing_ok);
extern char *get_subscription_name(Oid subid, bool missing_ok);
+extern char *get_statistics_name(Oid stxid);
+extern Oid get_statistics_namespace(Oid stxid);
+
#define type_is_array(typid) (get_element_type(typid) != InvalidOid)
/* type_is_array_domain accepts both plain arrays and domains over arrays */
#define type_is_array_domain(typid) (get_base_element_type(typid) != InvalidOid)
--
2.39.5
0002-Add-a-new-option-auto_explain.log_stats-to-auto_expl.patchtext/x-diff; charset=utf-8Download
From 253a6906eb3ed19d8de2b089fa581710fd9886a5 Mon Sep 17 00:00:00 2001
From: Tatsuro Yamada <yamatattsu@gmail.com>
Date: Mon, 10 Feb 2025 08:40:57 +0900
Subject: [PATCH 2/3] Add a new option auto_explain.log_stats to auto_explain
(T9)
---
contrib/auto_explain/auto_explain.c | 13 +++++++++++++
doc/src/sgml/auto-explain.sgml | 18 ++++++++++++++++++
2 files changed, 31 insertions(+)
diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c
index 1f4badb4928..20fbcc69025 100644
--- a/contrib/auto_explain/auto_explain.c
+++ b/contrib/auto_explain/auto_explain.c
@@ -34,6 +34,7 @@ static bool auto_explain_log_analyze = false;
static bool auto_explain_log_verbose = false;
static bool auto_explain_log_buffers = false;
static bool auto_explain_log_wal = false;
+static bool auto_explain_log_stats = false;
static bool auto_explain_log_triggers = false;
static bool auto_explain_log_timing = true;
static bool auto_explain_log_settings = false;
@@ -175,6 +176,17 @@ _PG_init(void)
NULL,
NULL);
+ DefineCustomBoolVariable("auto_explain.log_stats",
+ "Use EXPLAIN STATS for plan logging.",
+ NULL,
+ &auto_explain_log_stats,
+ false,
+ PGC_SUSET,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
DefineCustomBoolVariable("auto_explain.log_triggers",
"Include trigger statistics in plans.",
"This has no effect unless log_analyze is also set.",
@@ -401,6 +413,7 @@ explain_ExecutorEnd(QueryDesc *queryDesc)
es->summary = es->analyze;
/* No support for MEMORY */
/* es->memory = false; */
+ es->stats = auto_explain_log_stats;
es->format = auto_explain_log_format;
es->settings = auto_explain_log_settings;
diff --git a/doc/src/sgml/auto-explain.sgml b/doc/src/sgml/auto-explain.sgml
index 15c868021e6..96599b12f3b 100644
--- a/doc/src/sgml/auto-explain.sgml
+++ b/doc/src/sgml/auto-explain.sgml
@@ -148,6 +148,24 @@ LOAD 'auto_explain';
</listitem>
</varlistentry>
+ <varlistentry id="auto-explain-configuration-parameters-log-stats">
+ <term>
+ <varname>auto_explain.log_stats</varname> (<type>boolean</type>)
+ <indexterm>
+ <primary><varname>auto_explain.log_stats</varname> configuration parameter</primary>
+ </indexterm>
+ </term>
+ <listitem>
+ <para>
+ <varname>auto_explain.log_stats</varname> controls whether applied
+ Extended Statistic are printed when an execution plan is logged; it's
+ equivalent to the <literal>STATS</literal> option of <command>EXPLAIN</command>.
+ This parameter is off by default.
+ Only superusers can change this setting.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry id="auto-explain-configuration-parameters-log-timing">
<term>
<varname>auto_explain.log_timing</varname> (<type>boolean</type>)
--
2.39.5
0003-Add-a-new-tab-completion-for-EXPLAIN-STATS-on-psql-I.patchtext/x-diff; charset=utf-8Download
From c8cc8de12f1830fd5b6659c98c2764f1a2e405c8 Mon Sep 17 00:00:00 2001
From: Tatsuro Yamada <yamatattsu@gmail.com>
Date: Mon, 10 Feb 2025 08:41:58 +0900
Subject: [PATCH 3/3] Add a new tab completion for EXPLAIN (STATS) on psql (I2)
- When you run "EXPLAIN (<tab>" or "EXPLAIN (S<tab>" on psql,
"STATS" string is displayed.
---
src/bin/psql/tab-complete.in.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/bin/psql/tab-complete.in.c b/src/bin/psql/tab-complete.in.c
index 1f2ca946fc5..c51332baa78 100644
--- a/src/bin/psql/tab-complete.in.c
+++ b/src/bin/psql/tab-complete.in.c
@@ -4338,8 +4338,8 @@ match_previous_words(int pattern_id,
if (ends_with(prev_wd, '(') || ends_with(prev_wd, ','))
COMPLETE_WITH("ANALYZE", "VERBOSE", "COSTS", "SETTINGS", "GENERIC_PLAN",
"BUFFERS", "SERIALIZE", "WAL", "TIMING", "SUMMARY",
- "MEMORY", "FORMAT");
- else if (TailMatches("ANALYZE|VERBOSE|COSTS|SETTINGS|GENERIC_PLAN|BUFFERS|WAL|TIMING|SUMMARY|MEMORY"))
+ "MEMORY", "FORMAT", "STATS");
+ else if (TailMatches("ANALYZE|VERBOSE|COSTS|SETTINGS|GENERIC_PLAN|BUFFERS|WAL|TIMING|SUMMARY|MEMORY|STATS"))
COMPLETE_WITH("ON", "OFF");
else if (TailMatches("SERIALIZE"))
COMPLETE_WITH("TEXT", "NONE", "BINARY");
--
2.39.5