pg_stat_advisor extension
Attachments:
0001-pg_stat_advisor-extension.patchtext/x-diff; name=0001-pg_stat_advisor-extension.patchDownload
From 9485605416030e79843feabf9c101a88703b9779 Mon Sep 17 00:00:00 2001
From: Ilia Evdokimov <ilya.evdokimov@tantorlabs.com>
Date: Tue, 30 Jan 2024 13:49:07 +0300
Subject: [PATCH] 'pg_stat_advisor' extension.
This serves as a hook into the executor for determining total rows and planned rows.
The process starts by checking the `pg_stat_advisor.suggest_statistics_threshold` GUC parameter.
If it's set to 0.0, the extension does not proceed further. When the parameter is greater than 0.0,
the extension evaluates the accuracy of planned rows. A suggestion for creating statistics is made
if the ratio of total rows to planned rows is greater than or equal to this threshold.
Only then does it extract the relation and columns from the query.
The extension checks pg_statistic_ext for existing relevant statistics. If no statistics are found,
it prints a notice suggesting the creation of statistics, using the naming format 'relationName_columns'.
Author: Ilia Evdokimov
---
contrib/Makefile | 1 +
contrib/meson.build | 1 +
contrib/pg_stat_advisor/.gitignore | 3 +
contrib/pg_stat_advisor/Makefile | 20 +
contrib/pg_stat_advisor/README.md | 85 ++++
.../expected/pg_stat_advisor.out | 96 ++++
contrib/pg_stat_advisor/meson.build | 30 ++
contrib/pg_stat_advisor/pg_stat_advisor.c | 477 ++++++++++++++++++
.../pg_stat_advisor/sql/pg_stat_advisor.sql | 50 ++
9 files changed, 763 insertions(+)
create mode 100644 contrib/pg_stat_advisor/.gitignore
create mode 100644 contrib/pg_stat_advisor/Makefile
create mode 100644 contrib/pg_stat_advisor/README.md
create mode 100644 contrib/pg_stat_advisor/expected/pg_stat_advisor.out
create mode 100644 contrib/pg_stat_advisor/meson.build
create mode 100644 contrib/pg_stat_advisor/pg_stat_advisor.c
create mode 100644 contrib/pg_stat_advisor/sql/pg_stat_advisor.sql
diff --git a/contrib/Makefile b/contrib/Makefile
index da4e2316a3..da9a4ceeaa 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -34,6 +34,7 @@ SUBDIRS = \
pg_buffercache \
pg_freespacemap \
pg_prewarm \
+ pg_stat_advisor \
pg_stat_statements \
pg_surgery \
pg_trgm \
diff --git a/contrib/meson.build b/contrib/meson.build
index c12dc906ca..a20d99443b 100644
--- a/contrib/meson.build
+++ b/contrib/meson.build
@@ -49,6 +49,7 @@ subdir('pgcrypto')
subdir('pg_freespacemap')
subdir('pg_prewarm')
subdir('pgrowlocks')
+subdir('pg_stat_advisor')
subdir('pg_stat_statements')
subdir('pgstattuple')
subdir('pg_surgery')
diff --git a/contrib/pg_stat_advisor/.gitignore b/contrib/pg_stat_advisor/.gitignore
new file mode 100644
index 0000000000..913175ff6e
--- /dev/null
+++ b/contrib/pg_stat_advisor/.gitignore
@@ -0,0 +1,3 @@
+/log/
+/results/
+/tmp_check/
diff --git a/contrib/pg_stat_advisor/Makefile b/contrib/pg_stat_advisor/Makefile
new file mode 100644
index 0000000000..f31b939e8a
--- /dev/null
+++ b/contrib/pg_stat_advisor/Makefile
@@ -0,0 +1,20 @@
+# contrib/pg_stat_advisor/Makefile
+
+MODULE_big = pg_stat_advisor
+OBJS = \
+ $(WIN32RES) \
+ pg_stat_advisor.o
+PGFILEDESC = "pg_stat_advisor - analyze query performance and recommend the creation of additional statistics"
+
+REGRESS = pg_stat_advisor
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/pg_stat_advisor
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/pg_stat_advisor/README.md b/contrib/pg_stat_advisor/README.md
new file mode 100644
index 0000000000..f9610f2ed5
--- /dev/null
+++ b/contrib/pg_stat_advisor/README.md
@@ -0,0 +1,85 @@
+## pg_stat_advisor - PostgreSQL advisor to create statistics
+
+pg_stat_advisor is a PostgreSQL extension designed to analyze query performance and recommend the creation of additional statistics to improve query plan.
+
+Append pg_stat_advisor to shared_preload_libraries configuration parameter in your postgresql.conf file then restart the PostgreSQL database to apply the changes. Or you can use "LOAD 'pg_stat_advisor';"command
+```
+LOAD 'pg_stat_advisor';
+```
+
+There is the pg_stat_advisor.suggest_statistics_threshold GUC that can be used to set a suggest_statistics_threshold. It is the the ratio of total tuples produced compared to the planned rows. If parameter is set by 0, the printing switches off.
+
+For example:
+```
+SET pg_stat_advisor.suggest_statistics_threshold = 1.0;
+```
+
+Examples:
+
+
+```
+postgres=# create table t (i int, j int);
+CREATE TABLE
+postgres=# insert into t select i/10, i/100 from generate_series(1, 1000000) i;
+INSERT 0 1000000
+postgres=# analyze t;
+ANALYZE
+postgres=# explain analyze select * from t where i = 100 and j = 10;
+ QUERY PLAN
+
+----------------------------------------------------------------------------------------------
+------------------
+ Gather (cost=1000.00..11675.10 rows=1 width=8) (actual time=0.526..61.564 rows=10 loops=1)
+ Workers Planned: 2
+ Workers Launched: 2
+ -> Parallel Seq Scan on t (cost=0.00..10675.00 rows=1 width=8) (actual time=35.369..54.44
+7 rows=3 loops=3)
+ Filter: ((i = 100) AND (j = 10))
+ Rows Removed by Filter: 333330
+ Planning Time: 0.148 ms
+ Execution Time: 61.589 ms
+(8 rows)
+
+
+postgres=# load 'pg_stat_advisor';
+LOAD
+postgres=# set pg_stat_advisor.suggest_statistics_threshold = 0.1;
+SET
+postgres=# explain analyze select * from t where i = 100 and j = 10;
+NOTICE: pg_stat_advisor suggestion: CREATE STATISTICS t_i_j ON i, j FROM t
+ QUERY PLAN
+
+----------------------------------------------------------------------------------------------
+------------------
+ Gather (cost=1000.00..11675.10 rows=1 width=8) (actual time=0.400..59.292 rows=10 loops=1)
+ Workers Planned: 2
+ Workers Launched: 2
+ -> Parallel Seq Scan on t (cost=0.00..10675.00 rows=1 width=8) (actual time=35.614..54.29
+1 rows=3 loops=3)
+ Filter: ((i = 100) AND (j = 10))
+ Rows Removed by Filter: 333330
+ Planning Time: 0.081 ms
+ Execution Time: 59.413 ms
+(8 rows)
+
+
+postgres=# create statistics stat_t_i_j on i, j from t;
+CREATE STATISTICS
+postgres=# analyze t;
+ANALYZE
+postgres=# explain analyze select * from t where i = 100 and j = 10;
+ QUERY PLAN
+
+----------------------------------------------------------------------------------------------
+------------------
+ Gather (cost=1000.00..11675.10 rows=10 width=8) (actual time=0.400..59.292 rows=10 loops=1)
+ Workers Planned: 2
+ Workers Launched: 2
+ -> Parallel Seq Scan on t (cost=0.00..10675.00 rows=1 width=8) (actual time=35.614..54.29
+1 rows=3 loops=3)
+ Filter: ((i = 100) AND (j = 10))
+ Rows Removed by Filter: 333330
+ Planning Time: 0.081 ms
+ Execution Time: 59.413 ms
+(8 rows)
+```
diff --git a/contrib/pg_stat_advisor/expected/pg_stat_advisor.out b/contrib/pg_stat_advisor/expected/pg_stat_advisor.out
new file mode 100644
index 0000000000..b45e7153ae
--- /dev/null
+++ b/contrib/pg_stat_advisor/expected/pg_stat_advisor.out
@@ -0,0 +1,96 @@
+LOAD 'pg_stat_advisor';
+SET pg_stat_advisor.suggest_statistics_threshold = 0.1;
+-------------------------------Check printing suggestion-----------------------------------------------------------
+CREATE TABLE t (i INT, j INT);
+INSERT INTO t SELECT i/10, i/100 FROM GENERATE_SERIES(1,1000000) i;
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+NOTICE: pg_stat_advisor suggestion: CREATE STATISTICS t_i_j ON i, j FROM t
+ QUERY PLAN
+------------------------------------------------------
+ Gather (actual rows=10 loops=1)
+ Workers Planned: 2
+ Workers Launched: 2
+ -> Parallel Seq Scan on t (actual rows=3 loops=3)
+ Filter: ((i = 100) AND (j = 10))
+ Rows Removed by Filter: 333330
+(6 rows)
+
+CREATE STATISTICS t_i_j ON i, j FROM t;
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+ QUERY PLAN
+------------------------------------------------------
+ Gather (actual rows=10 loops=1)
+ Workers Planned: 2
+ Workers Launched: 2
+ -> Parallel Seq Scan on t (actual rows=3 loops=3)
+ Filter: ((i = 100) AND (j = 10))
+ Rows Removed by Filter: 333330
+(6 rows)
+
+DROP STATISTICS t_i_j;
+-------------------------------Check existing statistics except suggested name-------------------------------------
+CREATE STATISTICS statistics_t_i_j ON i, j FROM t;
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+ QUERY PLAN
+------------------------------------------------------
+ Gather (actual rows=10 loops=1)
+ Workers Planned: 2
+ Workers Launched: 2
+ -> Parallel Seq Scan on t (actual rows=3 loops=3)
+ Filter: ((i = 100) AND (j = 10))
+ Rows Removed by Filter: 333330
+(6 rows)
+
+DROP STATISTICS statistics_t_i_j;
+DROP TABLE t;
+-------------------------------Check suggest_statistics_threshold parameter----------------------------------------
+SET pg_stat_advisor.suggest_statistics_threshold = 10.0;
+CREATE TABLE inner_tab(x INT, y INT);
+CREATE TABLE outer_tab(pk INT PRIMARY KEY, x INT, y INT);
+CREATE INDEX ON inner_tab(x,y);
+INSERT INTO outer_tab VALUES (GENERATE_SERIES(1,100000), GENERATE_SERIES(1,100000), GENERATE_SERIES(1,100000)*10);
+INSERT INTO inner_tab VALUES (GENERATE_SERIES(1,1000000)/10, GENERATE_SERIES(1,1000000)/10*10);
+ANALYZE inner_tab;
+ANALYZE outer_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
+NOTICE: pg_stat_advisor suggestion: CREATE STATISTICS inner_tab_x_y ON x, y FROM inner_tab
+ QUERY PLAN
+-------------------------------------------------------------------------------------
+ Nested Loop (actual rows=10 loops=1)
+ -> Index Scan using outer_tab_pkey on outer_tab (actual rows=1 loops=1)
+ Index Cond: (pk = 1)
+ -> Index Only Scan using inner_tab_x_y_idx on inner_tab (actual rows=10 loops=1)
+ Index Cond: ((x = outer_tab.x) AND (y = outer_tab.y))
+ Heap Fetches: 10
+(6 rows)
+
+CREATE STATISTICS inner_tab_x_y ON x, y FROM inner_tab;
+ANALYZE inner_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
+ QUERY PLAN
+-------------------------------------------------------------------------------------
+ Nested Loop (actual rows=10 loops=1)
+ -> Index Scan using outer_tab_pkey on outer_tab (actual rows=1 loops=1)
+ Index Cond: (pk = 1)
+ -> Index Only Scan using inner_tab_x_y_idx on inner_tab (actual rows=10 loops=1)
+ Index Cond: ((x = outer_tab.x) AND (y = outer_tab.y))
+ Heap Fetches: 10
+(6 rows)
+
+DROP STATISTICS inner_tab_x_y;
+SET pg_stat_advisor.suggest_statistics_threshold = 10.1;
+ANALYZE inner_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
+ QUERY PLAN
+-------------------------------------------------------------------------------------
+ Nested Loop (actual rows=10 loops=1)
+ -> Index Scan using outer_tab_pkey on outer_tab (actual rows=1 loops=1)
+ Index Cond: (pk = 1)
+ -> Index Only Scan using inner_tab_x_y_idx on inner_tab (actual rows=10 loops=1)
+ Index Cond: ((x = outer_tab.x) AND (y = outer_tab.y))
+ Heap Fetches: 10
+(6 rows)
+
diff --git a/contrib/pg_stat_advisor/meson.build b/contrib/pg_stat_advisor/meson.build
new file mode 100644
index 0000000000..292917c025
--- /dev/null
+++ b/contrib/pg_stat_advisor/meson.build
@@ -0,0 +1,30 @@
+# Copyright (c) 2022-2024, PostgreSQL Global Development Group
+
+pg_stat_advisor_sources = files(
+ 'pg_stat_advisor.c',
+)
+
+if host_system == 'windows'
+ pg_stat_advisor_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+ '--NAME', 'pg_stat_advsior',
+ '--FILEDESC', 'pg_stat_advisor - suggestion of creating statistics',])
+endif
+
+pg_stat_advisor = shared_module('pg_stat_advisor',
+ pg_stat_advisor_sources,
+ kwargs: contrib_mod_args + {
+ 'dependencies': contrib_mod_args['dependencies'],
+ },
+)
+contrib_targets += pg_stat_advisor
+
+tests += {
+ 'name': 'pg_stat_advisor',
+ 'sd': meson.current_source_dir(),
+ 'bd': meson.current_build_dir(),
+ 'regress': {
+ 'sql': [
+ 'pg_stat_advisor',
+ ],
+ },
+}
diff --git a/contrib/pg_stat_advisor/pg_stat_advisor.c b/contrib/pg_stat_advisor/pg_stat_advisor.c
new file mode 100644
index 0000000000..498a41d4bf
--- /dev/null
+++ b/contrib/pg_stat_advisor/pg_stat_advisor.c
@@ -0,0 +1,477 @@
+#include "postgres.h"
+
+#include "access/hash.h"
+#include "access/table.h"
+#include "catalog/pg_statistic_ext.h"
+#include "commands/explain.h"
+#include "optimizer/optimizer.h"
+#include "parser/parsetree.h"
+#include "statistics/statistics.h"
+#include "utils/guc.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+
+PG_MODULE_MAGIC;
+
+/* GUC variables */
+static double pg_stat_advisor_suggest_statistics_threshold = 0.0;
+
+/* Current nesting depth of ExecutorRun calls */
+static int nesting_level = 0;
+
+#define pg_stat_advisor_enabled() (nesting_level == 0)
+
+void _PG_init(void);
+
+/* Saved hook values in case of unload */
+static ExecutorStart_hook_type prev_ExecutorStart = NULL;
+static ExecutorRun_hook_type prev_ExecutorRun = NULL;
+static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
+static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
+
+static void explain_ExecutorStart(QueryDesc *queryDesc, int eflags);
+static void explain_ExecutorRun(QueryDesc *queryDesc,
+ ScanDirection direction,
+ uint64 count, bool execute_once);
+static void explain_ExecutorFinish(QueryDesc *queryDesc);
+static void explain_ExecutorEnd(QueryDesc *queryDesc);
+
+static void SuggestMultiColumnStatisticsForNode(PlanState *planstate, ExplainState *es);
+
+/*
+ * Module load callback
+ */
+void
+_PG_init(void)
+{
+ /* Define custom GUC variables. */
+ DefineCustomRealVariable("pg_stat_advisor.suggest_statistics_threshold",
+ "Set the threshold for actual/estimated rows",
+ "Zero disables suggestion of creating statistics",
+ &pg_stat_advisor_suggest_statistics_threshold,
+ 0.0,
+ 0.0,
+ INT_MAX,
+ PGC_SUSET,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
+ MarkGUCPrefixReserved("pg_stat_advisor");
+
+ /* Install hooks. */
+ prev_ExecutorStart = ExecutorStart_hook;
+ ExecutorStart_hook = explain_ExecutorStart;
+ prev_ExecutorRun = ExecutorRun_hook;
+ ExecutorRun_hook = explain_ExecutorRun;
+ prev_ExecutorFinish = ExecutorFinish_hook;
+ ExecutorFinish_hook = explain_ExecutorFinish;
+ prev_ExecutorEnd = ExecutorEnd_hook;
+ ExecutorEnd_hook = explain_ExecutorEnd;
+}
+
+/*
+ * ExecutorStart hook: start up logging if needed
+ */
+static void
+explain_ExecutorStart(QueryDesc *queryDesc, int eflags)
+{
+ if (prev_ExecutorStart)
+ prev_ExecutorStart(queryDesc, eflags);
+ else
+ standard_ExecutorStart(queryDesc, eflags);
+
+ if (pg_stat_advisor_enabled() && queryDesc->totaltime == NULL)
+ {
+ MemoryContext oldcxt;
+
+ oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
+ queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
+ MemoryContextSwitchTo(oldcxt);
+ }
+}
+
+/*
+ * ExecutorRun hook: all we need do is track nesting depth
+ */
+static void
+explain_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction,
+ uint64 count, bool execute_once)
+{
+ nesting_level++;
+ PG_TRY();
+ {
+ if (prev_ExecutorRun)
+ prev_ExecutorRun(queryDesc, direction, count, execute_once);
+ else
+ standard_ExecutorRun(queryDesc, direction, count, execute_once);
+ }
+ PG_FINALLY();
+ {
+ nesting_level--;
+ }
+ PG_END_TRY();
+}
+
+/*
+ * ExecutorFinish hook: all we need do is track nesting depth
+ */
+static void
+explain_ExecutorFinish(QueryDesc *queryDesc)
+{
+ nesting_level++;
+ PG_TRY();
+ {
+ if (prev_ExecutorFinish)
+ prev_ExecutorFinish(queryDesc);
+ else
+ standard_ExecutorFinish(queryDesc);
+ }
+ PG_FINALLY();
+ {
+ nesting_level--;
+ }
+ PG_END_TRY();
+}
+
+/*
+ * ExecutorEnd hook: log results if needed
+ */
+static void
+explain_ExecutorEnd(QueryDesc *queryDesc)
+{
+ if (queryDesc->totaltime && pg_stat_advisor_enabled())
+ {
+ MemoryContext oldcxt;
+ ExplainState *es;
+
+ /*
+ * Make sure we operate in the per-query context, so any cruft will be
+ * discarded later during ExecutorEnd.
+ */
+ oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
+
+ /*
+ * Make sure stats accumulation is done. (Note: it's okay if several
+ * levels of hook all do this.)
+ */
+ InstrEndLoop(queryDesc->totaltime);
+
+ es = NewExplainState();
+
+ es->analyze = queryDesc->instrument_options;
+
+ ExplainBeginOutput(es);
+ ExplainQueryText(es, queryDesc);
+
+ ExplainPrintPlan(es, queryDesc);
+ if (es->analyze)
+ ExplainPrintTriggers(es, queryDesc);
+ if (es->costs)
+ ExplainPrintJITSummary(es, queryDesc);
+ ExplainEndOutput(es);
+
+ if (pg_stat_advisor_suggest_statistics_threshold && !IsParallelWorker())
+ SuggestMultiColumnStatisticsForNode(queryDesc->planstate, es);
+
+ MemoryContextSwitchTo(oldcxt);
+ }
+
+ if (prev_ExecutorEnd)
+ prev_ExecutorEnd(queryDesc);
+ else
+ standard_ExecutorEnd(queryDesc);
+}
+
+/**
+ * Comparator used to sort Vars by name
+ */
+static int
+vars_list_comparator(const ListCell *a, const ListCell *b)
+{
+ char *va = strVal(linitial(((ColumnRef *) lfirst(a))->fields));
+ char *vb = strVal(linitial(((ColumnRef *) lfirst(b))->fields));
+
+ return strcmp(va, vb);
+}
+
+/**
+ * Suggest statistics for specified subplans.
+ */
+static void
+SuggestMultiColumnStatisticsForSubPlans(List *plans, ExplainState *es)
+{
+ ListCell *lst;
+
+ foreach(lst, plans)
+ {
+ SubPlanState *sps = (SubPlanState *) lfirst(lst);
+
+ SuggestMultiColumnStatisticsForNode(sps->planstate, es);
+ }
+}
+
+/**
+ * Suggest statistics for plan subnodes.
+ */
+static void
+SuggestMultiColumnStatisticsForMemberNodes(PlanState **planstates, int nsubnodes,
+ ExplainState *es)
+{
+ int j;
+
+ for (j = 0; j < nsubnodes; j++)
+ SuggestMultiColumnStatisticsForNode(planstates[j], es);
+}
+
+/**
+ * Find extended statistics from 'relation_oid' relation on colmap
+ */
+static bool
+FindExtendedStatisticsOnVars(Oid *relation_oid, Bitmapset *colmap)
+{
+ bool isExistExtendedStatistics = false;
+ ListCell *oid;
+
+ /* Receive all extended statistics for current relation */
+ Relation pg_stextdata = table_open(*relation_oid, RowExclusiveLock);
+ List *statistics = RelationGetStatExtList(pg_stextdata);
+
+ foreach(oid, statistics)
+ {
+ Oid statOid = lfirst_oid(oid);
+ Form_pg_statistic_ext staForm;
+ HeapTuple htup;
+ Bitmapset *keys = NULL;
+ int i;
+
+ /* Read from pg_statistic_ext */
+ htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid));
+ if (!HeapTupleIsValid(htup))
+ elog(ERROR, "cache lookup failed for statistics object %u", statOid);
+ staForm = (Form_pg_statistic_ext) GETSTRUCT(htup);
+
+ for (i = 0; i < staForm->stxkeys.dim1; i++)
+ keys = bms_add_member(keys, staForm->stxkeys.values[i]);
+
+ /* If we have statistics for current columns */
+ if (!bms_compare(keys, colmap))
+ {
+ isExistExtendedStatistics = true;
+
+ ReleaseSysCache(htup);
+ bms_free(keys);
+
+ break;
+ }
+
+ ReleaseSysCache(htup);
+ bms_free(keys);
+ }
+
+ list_free(statistics);
+ table_close(pg_stextdata, RowExclusiveLock);
+
+ return isExistExtendedStatistics;
+}
+
+/**
+ * Suggest statistics for qual
+ */
+static void
+SuggestMultiColumnStatisticsForQual(List *qual, ExplainState *es)
+{
+ List *vars = NULL;
+ ListCell *lc;
+
+ /* Extract vars from all quals */
+ foreach(lc, qual)
+ {
+ Node *node = (Node *) lfirst(lc);
+
+ if (IsA(node, RestrictInfo))
+ node = (Node *) ((RestrictInfo *) node)->clause;
+ vars = list_concat(vars, pull_vars_of_level(node, 0));
+ }
+
+ /* Loop until we considered all vars */
+ while (vars != NULL)
+ {
+ ListCell *cell;
+ List *cols = NULL;
+ Index relno = 0;
+ Bitmapset *colmap = NULL;
+
+ /* Contruct list of unique vars */
+ foreach(cell, vars)
+ {
+ Node *node = (Node *) lfirst(cell);
+
+ if (IsA(node, Var))
+ {
+ Var *var = (Var *) node;
+ int varno = IS_SPECIAL_VARNO(var->varno) ? var->varnosyn : var->varno;
+
+ if (cols == NULL || varno == relno)
+ {
+ int varattno = IS_SPECIAL_VARNO(var->varno) ? var->varattnosyn : var->varattno;
+
+ relno = varno;
+ if (var->varattno > 0 &&
+ !bms_is_member(varattno, colmap) &&
+ varno >= 1 && /* not synthetic var */
+ varno <= list_length(es->rtable) &&
+ list_length(cols) < STATS_MAX_DIMENSIONS)
+ {
+ RangeTblEntry *rte = rt_fetch(varno, es->rtable);
+
+ if (rte->rtekind == RTE_RELATION)
+ {
+ ColumnRef *col = makeNode(ColumnRef);
+ char *colname = get_rte_attribute_name(rte, varattno);
+
+ col->fields = list_make1(makeString(colname));
+ cols = lappend(cols, col);
+ colmap = bms_add_member(colmap, varattno);
+ }
+ }
+ }
+ else
+ {
+ continue;
+ }
+ }
+ vars = foreach_delete_current(vars, cell);
+ }
+ /* To suggest statitics we need to have at least 2 columns */
+ if (list_length(cols) >= 2)
+ {
+ RangeTblEntry *rte = rt_fetch(relno, es->rtable);
+ char *rel_name = get_rel_name(rte->relid);
+ char *stat_name = rel_name;
+ char *create_stat_stmt = (char *) "";
+ char const *sep = "ON";
+ size_t name_len;
+
+ /* Sort variables by name */
+ list_sort(cols, vars_list_comparator);
+
+ /*
+ * Construct name for statistic by concatenating relation name
+ * with all columns
+ */
+ foreach(cell, cols)
+ {
+ char *col_name = strVal(linitial(((ColumnRef *) lfirst(cell))->fields));
+
+ stat_name = psprintf("%s_%s", stat_name, col_name);
+ create_stat_stmt = psprintf("%s%s %s", create_stat_stmt, sep, col_name);
+ sep = ",";
+ }
+
+ name_len = strlen(stat_name);
+ /* Truncate name if it doesn't fit in NameData */
+ if (name_len >= NAMEDATALEN)
+ stat_name = psprintf("%.*s_%08x", NAMEDATALEN - 10, stat_name, (unsigned) hash_any((uint8 *) stat_name, name_len));
+
+ if (!FindExtendedStatisticsOnVars(&rte->relid, colmap))
+ {
+ ereport(NOTICE, (errmsg("pg_stat_advisor suggestion: CREATE STATISTICS %s %s FROM %s",
+ stat_name, create_stat_stmt, rel_name),
+ errhidestmt(true)));
+ }
+ }
+
+ bms_free(colmap);
+ }
+}
+
+/**
+ * Suggest statistics for node
+ */
+static void
+SuggestMultiColumnStatisticsForNode(PlanState *planstate, ExplainState *es)
+{
+ Plan *plan = planstate->plan;
+
+ if (planstate->instrument && plan->plan_rows != 0 &&
+ pg_stat_advisor_suggest_statistics_threshold > 0.0 &&
+ planstate->instrument->ntuples / plan->plan_rows >= pg_stat_advisor_suggest_statistics_threshold)
+ {
+ elog(DEBUG1, "Estimated=%f, actual=%f, error=%f: plan=%s",
+ plan->plan_rows,
+ planstate->instrument->ntuples,
+ planstate->instrument->ntuples / plan->plan_rows,
+ nodeToString(plan));
+ /* quals, sort keys, etc */
+ switch (nodeTag(plan))
+ {
+ case T_IndexScan:
+ SuggestMultiColumnStatisticsForQual(((IndexScan *) plan)->indexqualorig, es);
+ break;
+ case T_IndexOnlyScan:
+ SuggestMultiColumnStatisticsForQual(((IndexOnlyScan *) plan)->indexqual, es);
+ break;
+ case T_BitmapIndexScan:
+ SuggestMultiColumnStatisticsForQual(((BitmapIndexScan *) plan)->indexqualorig, es);
+ break;
+ case T_NestLoop:
+ SuggestMultiColumnStatisticsForQual(((NestLoop *) plan)->join.joinqual, es);
+ break;
+ case T_MergeJoin:
+ SuggestMultiColumnStatisticsForQual(((MergeJoin *) plan)->mergeclauses, es);
+ SuggestMultiColumnStatisticsForQual(((MergeJoin *) plan)->join.joinqual, es);
+ break;
+ case T_HashJoin:
+ SuggestMultiColumnStatisticsForQual(((HashJoin *) plan)->hashclauses, es);
+ SuggestMultiColumnStatisticsForQual(((HashJoin *) plan)->join.joinqual, es);
+ break;
+ default:
+ break;
+ }
+ SuggestMultiColumnStatisticsForQual(plan->qual, es);
+ }
+
+ /* initPlan-s */
+ if (planstate->initPlan)
+ SuggestMultiColumnStatisticsForSubPlans(planstate->initPlan, es);
+
+ /* lefttree */
+ if (outerPlanState(planstate))
+ SuggestMultiColumnStatisticsForNode(outerPlanState(planstate), es);
+
+ /* righttree */
+ if (innerPlanState(planstate))
+ SuggestMultiColumnStatisticsForNode(innerPlanState(planstate), es);
+
+ /* special child plans */
+ switch (nodeTag(plan))
+ {
+ case T_Append:
+ SuggestMultiColumnStatisticsForMemberNodes(((AppendState *) planstate)->appendplans,
+ ((AppendState *) planstate)->as_nplans,
+ es);
+ break;
+ case T_MergeAppend:
+ SuggestMultiColumnStatisticsForMemberNodes(((MergeAppendState *) planstate)->mergeplans,
+ ((MergeAppendState *) planstate)->ms_nplans,
+ es);
+ break;
+ case T_BitmapAnd:
+ SuggestMultiColumnStatisticsForMemberNodes(((BitmapAndState *) planstate)->bitmapplans,
+ ((BitmapAndState *) planstate)->nplans,
+ es);
+ break;
+ case T_BitmapOr:
+ SuggestMultiColumnStatisticsForMemberNodes(((BitmapOrState *) planstate)->bitmapplans,
+ ((BitmapOrState *) planstate)->nplans,
+ es);
+ break;
+ case T_SubqueryScan:
+ SuggestMultiColumnStatisticsForNode(((SubqueryScanState *) planstate)->subplan, es);
+ break;
+ default:
+ break;
+ }
+}
diff --git a/contrib/pg_stat_advisor/sql/pg_stat_advisor.sql b/contrib/pg_stat_advisor/sql/pg_stat_advisor.sql
new file mode 100644
index 0000000000..3fd8b24416
--- /dev/null
+++ b/contrib/pg_stat_advisor/sql/pg_stat_advisor.sql
@@ -0,0 +1,50 @@
+LOAD 'pg_stat_advisor';
+SET pg_stat_advisor.suggest_statistics_threshold = 0.1;
+
+-------------------------------Check printing suggestion-----------------------------------------------------------
+
+CREATE TABLE t (i INT, j INT);
+INSERT INTO t SELECT i/10, i/100 FROM GENERATE_SERIES(1,1000000) i;
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+
+CREATE STATISTICS t_i_j ON i, j FROM t;
+
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+
+DROP STATISTICS t_i_j;
+
+-------------------------------Check existing statistics except suggested name-------------------------------------
+
+CREATE STATISTICS statistics_t_i_j ON i, j FROM t;
+
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+
+DROP STATISTICS statistics_t_i_j;
+DROP TABLE t;
+
+-------------------------------Check suggest_statistics_threshold parameter----------------------------------------
+
+SET pg_stat_advisor.suggest_statistics_threshold = 10.0;
+
+CREATE TABLE inner_tab(x INT, y INT);
+CREATE TABLE outer_tab(pk INT PRIMARY KEY, x INT, y INT);
+CREATE INDEX ON inner_tab(x,y);
+INSERT INTO outer_tab VALUES (GENERATE_SERIES(1,100000), GENERATE_SERIES(1,100000), GENERATE_SERIES(1,100000)*10);
+INSERT INTO inner_tab VALUES (GENERATE_SERIES(1,1000000)/10, GENERATE_SERIES(1,1000000)/10*10);
+ANALYZE inner_tab;
+ANALYZE outer_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
+
+CREATE STATISTICS inner_tab_x_y ON x, y FROM inner_tab;
+
+ANALYZE inner_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
+
+DROP STATISTICS inner_tab_x_y;
+SET pg_stat_advisor.suggest_statistics_threshold = 10.1;
+
+ANALYZE inner_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
\ No newline at end of file
--
2.34.1
<div>Hi hackers,</div><div> </div><div>I'm reaching out again regarding the patch with new extension 'pg_stat_advisor' aimed at enhancing query plan efficiency through the suggestion of creating statistics.</div><div> </div><div>I understand the community is busy, but I would greatly value any feedback or thoughts on this extension.</div><div> </div><div>Thank you for your time and consideration.</div><div> </div><div>Best regards,</div><div>Ilia Evdokimov,</div><div>Tantor Labs LLC.</div>
Hi hackers,
I've encountered and addressed errors in the
"0001-pg_stat_advisor-extension.patch" when applying it to the main
branch, specifically trailing whitespace issues at lines 117 and 118:
```
0001-pg_stat_advisor-extension.patch:117: trailing whitespace.
QUERY PLAN
0001-pg_stat_advisor-extension.patch:118: trailing whitespace.
warning: 2 lines add whitespace errors.
```
An updated patch is attached for review
I welcome your insights, feedback, and evaluations regarding the
necessity of integrating this new extension into PostgreSQL.
Kind regards,
Ilia Evdokimov,
Tantor Labs LLC.
Attachments:
0002-pg_stat_advisor-extension.patchtext/x-patch; charset=UTF-8; name=0002-pg_stat_advisor-extension.patchDownload
From 6316706c42996219e507bb6ded9dd1e872180e38 Mon Sep 17 00:00:00 2001
From: Ilia Evdokimov <ilya.evdokimov@tantorlabs.ru>
Date: Tue, 6 Feb 2024 18:11:04 +0300
Subject: [PATCH] pg_stat_advisor extension
---
contrib/Makefile | 1 +
contrib/meson.build | 1 +
contrib/pg_stat_advisor/.gitignore | 3 +
contrib/pg_stat_advisor/Makefile | 20 +
contrib/pg_stat_advisor/README.md | 85 ++++
.../expected/pg_stat_advisor.out | 96 ++++
contrib/pg_stat_advisor/meson.build | 30 ++
contrib/pg_stat_advisor/pg_stat_advisor.c | 477 ++++++++++++++++++
.../pg_stat_advisor/sql/pg_stat_advisor.sql | 50 ++
9 files changed, 763 insertions(+)
create mode 100644 contrib/pg_stat_advisor/.gitignore
create mode 100644 contrib/pg_stat_advisor/Makefile
create mode 100644 contrib/pg_stat_advisor/README.md
create mode 100644 contrib/pg_stat_advisor/expected/pg_stat_advisor.out
create mode 100644 contrib/pg_stat_advisor/meson.build
create mode 100644 contrib/pg_stat_advisor/pg_stat_advisor.c
create mode 100644 contrib/pg_stat_advisor/sql/pg_stat_advisor.sql
diff --git a/contrib/Makefile b/contrib/Makefile
index da4e2316a3..da9a4ceeaa 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -34,6 +34,7 @@ SUBDIRS = \
pg_buffercache \
pg_freespacemap \
pg_prewarm \
+ pg_stat_advisor \
pg_stat_statements \
pg_surgery \
pg_trgm \
diff --git a/contrib/meson.build b/contrib/meson.build
index c12dc906ca..a20d99443b 100644
--- a/contrib/meson.build
+++ b/contrib/meson.build
@@ -49,6 +49,7 @@ subdir('pgcrypto')
subdir('pg_freespacemap')
subdir('pg_prewarm')
subdir('pgrowlocks')
+subdir('pg_stat_advisor')
subdir('pg_stat_statements')
subdir('pgstattuple')
subdir('pg_surgery')
diff --git a/contrib/pg_stat_advisor/.gitignore b/contrib/pg_stat_advisor/.gitignore
new file mode 100644
index 0000000000..913175ff6e
--- /dev/null
+++ b/contrib/pg_stat_advisor/.gitignore
@@ -0,0 +1,3 @@
+/log/
+/results/
+/tmp_check/
diff --git a/contrib/pg_stat_advisor/Makefile b/contrib/pg_stat_advisor/Makefile
new file mode 100644
index 0000000000..f31b939e8a
--- /dev/null
+++ b/contrib/pg_stat_advisor/Makefile
@@ -0,0 +1,20 @@
+# contrib/pg_stat_advisor/Makefile
+
+MODULE_big = pg_stat_advisor
+OBJS = \
+ $(WIN32RES) \
+ pg_stat_advisor.o
+PGFILEDESC = "pg_stat_advisor - analyze query performance and recommend the creation of additional statistics"
+
+REGRESS = pg_stat_advisor
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/pg_stat_advisor
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/pg_stat_advisor/README.md b/contrib/pg_stat_advisor/README.md
new file mode 100644
index 0000000000..f9610f2ed5
--- /dev/null
+++ b/contrib/pg_stat_advisor/README.md
@@ -0,0 +1,85 @@
+## pg_stat_advisor - PostgreSQL advisor to create statistics
+
+pg_stat_advisor is a PostgreSQL extension designed to analyze query performance and recommend the creation of additional statistics to improve query plan.
+
+Append pg_stat_advisor to shared_preload_libraries configuration parameter in your postgresql.conf file then restart the PostgreSQL database to apply the changes. Or you can use "LOAD 'pg_stat_advisor';"command
+```
+LOAD 'pg_stat_advisor';
+```
+
+There is the pg_stat_advisor.suggest_statistics_threshold GUC that can be used to set a suggest_statistics_threshold. It is the the ratio of total tuples produced compared to the planned rows. If parameter is set by 0, the printing switches off.
+
+For example:
+```
+SET pg_stat_advisor.suggest_statistics_threshold = 1.0;
+```
+
+Examples:
+
+
+```
+postgres=# create table t (i int, j int);
+CREATE TABLE
+postgres=# insert into t select i/10, i/100 from generate_series(1, 1000000) i;
+INSERT 0 1000000
+postgres=# analyze t;
+ANALYZE
+postgres=# explain analyze select * from t where i = 100 and j = 10;
+ QUERY PLAN
+
+----------------------------------------------------------------------------------------------
+------------------
+ Gather (cost=1000.00..11675.10 rows=1 width=8) (actual time=0.526..61.564 rows=10 loops=1)
+ Workers Planned: 2
+ Workers Launched: 2
+ -> Parallel Seq Scan on t (cost=0.00..10675.00 rows=1 width=8) (actual time=35.369..54.44
+7 rows=3 loops=3)
+ Filter: ((i = 100) AND (j = 10))
+ Rows Removed by Filter: 333330
+ Planning Time: 0.148 ms
+ Execution Time: 61.589 ms
+(8 rows)
+
+
+postgres=# load 'pg_stat_advisor';
+LOAD
+postgres=# set pg_stat_advisor.suggest_statistics_threshold = 0.1;
+SET
+postgres=# explain analyze select * from t where i = 100 and j = 10;
+NOTICE: pg_stat_advisor suggestion: CREATE STATISTICS t_i_j ON i, j FROM t
+ QUERY PLAN
+
+----------------------------------------------------------------------------------------------
+------------------
+ Gather (cost=1000.00..11675.10 rows=1 width=8) (actual time=0.400..59.292 rows=10 loops=1)
+ Workers Planned: 2
+ Workers Launched: 2
+ -> Parallel Seq Scan on t (cost=0.00..10675.00 rows=1 width=8) (actual time=35.614..54.29
+1 rows=3 loops=3)
+ Filter: ((i = 100) AND (j = 10))
+ Rows Removed by Filter: 333330
+ Planning Time: 0.081 ms
+ Execution Time: 59.413 ms
+(8 rows)
+
+
+postgres=# create statistics stat_t_i_j on i, j from t;
+CREATE STATISTICS
+postgres=# analyze t;
+ANALYZE
+postgres=# explain analyze select * from t where i = 100 and j = 10;
+ QUERY PLAN
+
+----------------------------------------------------------------------------------------------
+------------------
+ Gather (cost=1000.00..11675.10 rows=10 width=8) (actual time=0.400..59.292 rows=10 loops=1)
+ Workers Planned: 2
+ Workers Launched: 2
+ -> Parallel Seq Scan on t (cost=0.00..10675.00 rows=1 width=8) (actual time=35.614..54.29
+1 rows=3 loops=3)
+ Filter: ((i = 100) AND (j = 10))
+ Rows Removed by Filter: 333330
+ Planning Time: 0.081 ms
+ Execution Time: 59.413 ms
+(8 rows)
+```
diff --git a/contrib/pg_stat_advisor/expected/pg_stat_advisor.out b/contrib/pg_stat_advisor/expected/pg_stat_advisor.out
new file mode 100644
index 0000000000..b45e7153ae
--- /dev/null
+++ b/contrib/pg_stat_advisor/expected/pg_stat_advisor.out
@@ -0,0 +1,96 @@
+LOAD 'pg_stat_advisor';
+SET pg_stat_advisor.suggest_statistics_threshold = 0.1;
+-------------------------------Check printing suggestion-----------------------------------------------------------
+CREATE TABLE t (i INT, j INT);
+INSERT INTO t SELECT i/10, i/100 FROM GENERATE_SERIES(1,1000000) i;
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+NOTICE: pg_stat_advisor suggestion: CREATE STATISTICS t_i_j ON i, j FROM t
+ QUERY PLAN
+------------------------------------------------------
+ Gather (actual rows=10 loops=1)
+ Workers Planned: 2
+ Workers Launched: 2
+ -> Parallel Seq Scan on t (actual rows=3 loops=3)
+ Filter: ((i = 100) AND (j = 10))
+ Rows Removed by Filter: 333330
+(6 rows)
+
+CREATE STATISTICS t_i_j ON i, j FROM t;
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+ QUERY PLAN
+------------------------------------------------------
+ Gather (actual rows=10 loops=1)
+ Workers Planned: 2
+ Workers Launched: 2
+ -> Parallel Seq Scan on t (actual rows=3 loops=3)
+ Filter: ((i = 100) AND (j = 10))
+ Rows Removed by Filter: 333330
+(6 rows)
+
+DROP STATISTICS t_i_j;
+-------------------------------Check existing statistics except suggested name-------------------------------------
+CREATE STATISTICS statistics_t_i_j ON i, j FROM t;
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+ QUERY PLAN
+------------------------------------------------------
+ Gather (actual rows=10 loops=1)
+ Workers Planned: 2
+ Workers Launched: 2
+ -> Parallel Seq Scan on t (actual rows=3 loops=3)
+ Filter: ((i = 100) AND (j = 10))
+ Rows Removed by Filter: 333330
+(6 rows)
+
+DROP STATISTICS statistics_t_i_j;
+DROP TABLE t;
+-------------------------------Check suggest_statistics_threshold parameter----------------------------------------
+SET pg_stat_advisor.suggest_statistics_threshold = 10.0;
+CREATE TABLE inner_tab(x INT, y INT);
+CREATE TABLE outer_tab(pk INT PRIMARY KEY, x INT, y INT);
+CREATE INDEX ON inner_tab(x,y);
+INSERT INTO outer_tab VALUES (GENERATE_SERIES(1,100000), GENERATE_SERIES(1,100000), GENERATE_SERIES(1,100000)*10);
+INSERT INTO inner_tab VALUES (GENERATE_SERIES(1,1000000)/10, GENERATE_SERIES(1,1000000)/10*10);
+ANALYZE inner_tab;
+ANALYZE outer_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
+NOTICE: pg_stat_advisor suggestion: CREATE STATISTICS inner_tab_x_y ON x, y FROM inner_tab
+ QUERY PLAN
+-------------------------------------------------------------------------------------
+ Nested Loop (actual rows=10 loops=1)
+ -> Index Scan using outer_tab_pkey on outer_tab (actual rows=1 loops=1)
+ Index Cond: (pk = 1)
+ -> Index Only Scan using inner_tab_x_y_idx on inner_tab (actual rows=10 loops=1)
+ Index Cond: ((x = outer_tab.x) AND (y = outer_tab.y))
+ Heap Fetches: 10
+(6 rows)
+
+CREATE STATISTICS inner_tab_x_y ON x, y FROM inner_tab;
+ANALYZE inner_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
+ QUERY PLAN
+-------------------------------------------------------------------------------------
+ Nested Loop (actual rows=10 loops=1)
+ -> Index Scan using outer_tab_pkey on outer_tab (actual rows=1 loops=1)
+ Index Cond: (pk = 1)
+ -> Index Only Scan using inner_tab_x_y_idx on inner_tab (actual rows=10 loops=1)
+ Index Cond: ((x = outer_tab.x) AND (y = outer_tab.y))
+ Heap Fetches: 10
+(6 rows)
+
+DROP STATISTICS inner_tab_x_y;
+SET pg_stat_advisor.suggest_statistics_threshold = 10.1;
+ANALYZE inner_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
+ QUERY PLAN
+-------------------------------------------------------------------------------------
+ Nested Loop (actual rows=10 loops=1)
+ -> Index Scan using outer_tab_pkey on outer_tab (actual rows=1 loops=1)
+ Index Cond: (pk = 1)
+ -> Index Only Scan using inner_tab_x_y_idx on inner_tab (actual rows=10 loops=1)
+ Index Cond: ((x = outer_tab.x) AND (y = outer_tab.y))
+ Heap Fetches: 10
+(6 rows)
+
diff --git a/contrib/pg_stat_advisor/meson.build b/contrib/pg_stat_advisor/meson.build
new file mode 100644
index 0000000000..292917c025
--- /dev/null
+++ b/contrib/pg_stat_advisor/meson.build
@@ -0,0 +1,30 @@
+# Copyright (c) 2022-2024, PostgreSQL Global Development Group
+
+pg_stat_advisor_sources = files(
+ 'pg_stat_advisor.c',
+)
+
+if host_system == 'windows'
+ pg_stat_advisor_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+ '--NAME', 'pg_stat_advsior',
+ '--FILEDESC', 'pg_stat_advisor - suggestion of creating statistics',])
+endif
+
+pg_stat_advisor = shared_module('pg_stat_advisor',
+ pg_stat_advisor_sources,
+ kwargs: contrib_mod_args + {
+ 'dependencies': contrib_mod_args['dependencies'],
+ },
+)
+contrib_targets += pg_stat_advisor
+
+tests += {
+ 'name': 'pg_stat_advisor',
+ 'sd': meson.current_source_dir(),
+ 'bd': meson.current_build_dir(),
+ 'regress': {
+ 'sql': [
+ 'pg_stat_advisor',
+ ],
+ },
+}
diff --git a/contrib/pg_stat_advisor/pg_stat_advisor.c b/contrib/pg_stat_advisor/pg_stat_advisor.c
new file mode 100644
index 0000000000..498a41d4bf
--- /dev/null
+++ b/contrib/pg_stat_advisor/pg_stat_advisor.c
@@ -0,0 +1,477 @@
+#include "postgres.h"
+
+#include "access/hash.h"
+#include "access/table.h"
+#include "catalog/pg_statistic_ext.h"
+#include "commands/explain.h"
+#include "optimizer/optimizer.h"
+#include "parser/parsetree.h"
+#include "statistics/statistics.h"
+#include "utils/guc.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+
+PG_MODULE_MAGIC;
+
+/* GUC variables */
+static double pg_stat_advisor_suggest_statistics_threshold = 0.0;
+
+/* Current nesting depth of ExecutorRun calls */
+static int nesting_level = 0;
+
+#define pg_stat_advisor_enabled() (nesting_level == 0)
+
+void _PG_init(void);
+
+/* Saved hook values in case of unload */
+static ExecutorStart_hook_type prev_ExecutorStart = NULL;
+static ExecutorRun_hook_type prev_ExecutorRun = NULL;
+static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
+static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
+
+static void explain_ExecutorStart(QueryDesc *queryDesc, int eflags);
+static void explain_ExecutorRun(QueryDesc *queryDesc,
+ ScanDirection direction,
+ uint64 count, bool execute_once);
+static void explain_ExecutorFinish(QueryDesc *queryDesc);
+static void explain_ExecutorEnd(QueryDesc *queryDesc);
+
+static void SuggestMultiColumnStatisticsForNode(PlanState *planstate, ExplainState *es);
+
+/*
+ * Module load callback
+ */
+void
+_PG_init(void)
+{
+ /* Define custom GUC variables. */
+ DefineCustomRealVariable("pg_stat_advisor.suggest_statistics_threshold",
+ "Set the threshold for actual/estimated rows",
+ "Zero disables suggestion of creating statistics",
+ &pg_stat_advisor_suggest_statistics_threshold,
+ 0.0,
+ 0.0,
+ INT_MAX,
+ PGC_SUSET,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
+ MarkGUCPrefixReserved("pg_stat_advisor");
+
+ /* Install hooks. */
+ prev_ExecutorStart = ExecutorStart_hook;
+ ExecutorStart_hook = explain_ExecutorStart;
+ prev_ExecutorRun = ExecutorRun_hook;
+ ExecutorRun_hook = explain_ExecutorRun;
+ prev_ExecutorFinish = ExecutorFinish_hook;
+ ExecutorFinish_hook = explain_ExecutorFinish;
+ prev_ExecutorEnd = ExecutorEnd_hook;
+ ExecutorEnd_hook = explain_ExecutorEnd;
+}
+
+/*
+ * ExecutorStart hook: start up logging if needed
+ */
+static void
+explain_ExecutorStart(QueryDesc *queryDesc, int eflags)
+{
+ if (prev_ExecutorStart)
+ prev_ExecutorStart(queryDesc, eflags);
+ else
+ standard_ExecutorStart(queryDesc, eflags);
+
+ if (pg_stat_advisor_enabled() && queryDesc->totaltime == NULL)
+ {
+ MemoryContext oldcxt;
+
+ oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
+ queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
+ MemoryContextSwitchTo(oldcxt);
+ }
+}
+
+/*
+ * ExecutorRun hook: all we need do is track nesting depth
+ */
+static void
+explain_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction,
+ uint64 count, bool execute_once)
+{
+ nesting_level++;
+ PG_TRY();
+ {
+ if (prev_ExecutorRun)
+ prev_ExecutorRun(queryDesc, direction, count, execute_once);
+ else
+ standard_ExecutorRun(queryDesc, direction, count, execute_once);
+ }
+ PG_FINALLY();
+ {
+ nesting_level--;
+ }
+ PG_END_TRY();
+}
+
+/*
+ * ExecutorFinish hook: all we need do is track nesting depth
+ */
+static void
+explain_ExecutorFinish(QueryDesc *queryDesc)
+{
+ nesting_level++;
+ PG_TRY();
+ {
+ if (prev_ExecutorFinish)
+ prev_ExecutorFinish(queryDesc);
+ else
+ standard_ExecutorFinish(queryDesc);
+ }
+ PG_FINALLY();
+ {
+ nesting_level--;
+ }
+ PG_END_TRY();
+}
+
+/*
+ * ExecutorEnd hook: log results if needed
+ */
+static void
+explain_ExecutorEnd(QueryDesc *queryDesc)
+{
+ if (queryDesc->totaltime && pg_stat_advisor_enabled())
+ {
+ MemoryContext oldcxt;
+ ExplainState *es;
+
+ /*
+ * Make sure we operate in the per-query context, so any cruft will be
+ * discarded later during ExecutorEnd.
+ */
+ oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
+
+ /*
+ * Make sure stats accumulation is done. (Note: it's okay if several
+ * levels of hook all do this.)
+ */
+ InstrEndLoop(queryDesc->totaltime);
+
+ es = NewExplainState();
+
+ es->analyze = queryDesc->instrument_options;
+
+ ExplainBeginOutput(es);
+ ExplainQueryText(es, queryDesc);
+
+ ExplainPrintPlan(es, queryDesc);
+ if (es->analyze)
+ ExplainPrintTriggers(es, queryDesc);
+ if (es->costs)
+ ExplainPrintJITSummary(es, queryDesc);
+ ExplainEndOutput(es);
+
+ if (pg_stat_advisor_suggest_statistics_threshold && !IsParallelWorker())
+ SuggestMultiColumnStatisticsForNode(queryDesc->planstate, es);
+
+ MemoryContextSwitchTo(oldcxt);
+ }
+
+ if (prev_ExecutorEnd)
+ prev_ExecutorEnd(queryDesc);
+ else
+ standard_ExecutorEnd(queryDesc);
+}
+
+/**
+ * Comparator used to sort Vars by name
+ */
+static int
+vars_list_comparator(const ListCell *a, const ListCell *b)
+{
+ char *va = strVal(linitial(((ColumnRef *) lfirst(a))->fields));
+ char *vb = strVal(linitial(((ColumnRef *) lfirst(b))->fields));
+
+ return strcmp(va, vb);
+}
+
+/**
+ * Suggest statistics for specified subplans.
+ */
+static void
+SuggestMultiColumnStatisticsForSubPlans(List *plans, ExplainState *es)
+{
+ ListCell *lst;
+
+ foreach(lst, plans)
+ {
+ SubPlanState *sps = (SubPlanState *) lfirst(lst);
+
+ SuggestMultiColumnStatisticsForNode(sps->planstate, es);
+ }
+}
+
+/**
+ * Suggest statistics for plan subnodes.
+ */
+static void
+SuggestMultiColumnStatisticsForMemberNodes(PlanState **planstates, int nsubnodes,
+ ExplainState *es)
+{
+ int j;
+
+ for (j = 0; j < nsubnodes; j++)
+ SuggestMultiColumnStatisticsForNode(planstates[j], es);
+}
+
+/**
+ * Find extended statistics from 'relation_oid' relation on colmap
+ */
+static bool
+FindExtendedStatisticsOnVars(Oid *relation_oid, Bitmapset *colmap)
+{
+ bool isExistExtendedStatistics = false;
+ ListCell *oid;
+
+ /* Receive all extended statistics for current relation */
+ Relation pg_stextdata = table_open(*relation_oid, RowExclusiveLock);
+ List *statistics = RelationGetStatExtList(pg_stextdata);
+
+ foreach(oid, statistics)
+ {
+ Oid statOid = lfirst_oid(oid);
+ Form_pg_statistic_ext staForm;
+ HeapTuple htup;
+ Bitmapset *keys = NULL;
+ int i;
+
+ /* Read from pg_statistic_ext */
+ htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid));
+ if (!HeapTupleIsValid(htup))
+ elog(ERROR, "cache lookup failed for statistics object %u", statOid);
+ staForm = (Form_pg_statistic_ext) GETSTRUCT(htup);
+
+ for (i = 0; i < staForm->stxkeys.dim1; i++)
+ keys = bms_add_member(keys, staForm->stxkeys.values[i]);
+
+ /* If we have statistics for current columns */
+ if (!bms_compare(keys, colmap))
+ {
+ isExistExtendedStatistics = true;
+
+ ReleaseSysCache(htup);
+ bms_free(keys);
+
+ break;
+ }
+
+ ReleaseSysCache(htup);
+ bms_free(keys);
+ }
+
+ list_free(statistics);
+ table_close(pg_stextdata, RowExclusiveLock);
+
+ return isExistExtendedStatistics;
+}
+
+/**
+ * Suggest statistics for qual
+ */
+static void
+SuggestMultiColumnStatisticsForQual(List *qual, ExplainState *es)
+{
+ List *vars = NULL;
+ ListCell *lc;
+
+ /* Extract vars from all quals */
+ foreach(lc, qual)
+ {
+ Node *node = (Node *) lfirst(lc);
+
+ if (IsA(node, RestrictInfo))
+ node = (Node *) ((RestrictInfo *) node)->clause;
+ vars = list_concat(vars, pull_vars_of_level(node, 0));
+ }
+
+ /* Loop until we considered all vars */
+ while (vars != NULL)
+ {
+ ListCell *cell;
+ List *cols = NULL;
+ Index relno = 0;
+ Bitmapset *colmap = NULL;
+
+ /* Contruct list of unique vars */
+ foreach(cell, vars)
+ {
+ Node *node = (Node *) lfirst(cell);
+
+ if (IsA(node, Var))
+ {
+ Var *var = (Var *) node;
+ int varno = IS_SPECIAL_VARNO(var->varno) ? var->varnosyn : var->varno;
+
+ if (cols == NULL || varno == relno)
+ {
+ int varattno = IS_SPECIAL_VARNO(var->varno) ? var->varattnosyn : var->varattno;
+
+ relno = varno;
+ if (var->varattno > 0 &&
+ !bms_is_member(varattno, colmap) &&
+ varno >= 1 && /* not synthetic var */
+ varno <= list_length(es->rtable) &&
+ list_length(cols) < STATS_MAX_DIMENSIONS)
+ {
+ RangeTblEntry *rte = rt_fetch(varno, es->rtable);
+
+ if (rte->rtekind == RTE_RELATION)
+ {
+ ColumnRef *col = makeNode(ColumnRef);
+ char *colname = get_rte_attribute_name(rte, varattno);
+
+ col->fields = list_make1(makeString(colname));
+ cols = lappend(cols, col);
+ colmap = bms_add_member(colmap, varattno);
+ }
+ }
+ }
+ else
+ {
+ continue;
+ }
+ }
+ vars = foreach_delete_current(vars, cell);
+ }
+ /* To suggest statitics we need to have at least 2 columns */
+ if (list_length(cols) >= 2)
+ {
+ RangeTblEntry *rte = rt_fetch(relno, es->rtable);
+ char *rel_name = get_rel_name(rte->relid);
+ char *stat_name = rel_name;
+ char *create_stat_stmt = (char *) "";
+ char const *sep = "ON";
+ size_t name_len;
+
+ /* Sort variables by name */
+ list_sort(cols, vars_list_comparator);
+
+ /*
+ * Construct name for statistic by concatenating relation name
+ * with all columns
+ */
+ foreach(cell, cols)
+ {
+ char *col_name = strVal(linitial(((ColumnRef *) lfirst(cell))->fields));
+
+ stat_name = psprintf("%s_%s", stat_name, col_name);
+ create_stat_stmt = psprintf("%s%s %s", create_stat_stmt, sep, col_name);
+ sep = ",";
+ }
+
+ name_len = strlen(stat_name);
+ /* Truncate name if it doesn't fit in NameData */
+ if (name_len >= NAMEDATALEN)
+ stat_name = psprintf("%.*s_%08x", NAMEDATALEN - 10, stat_name, (unsigned) hash_any((uint8 *) stat_name, name_len));
+
+ if (!FindExtendedStatisticsOnVars(&rte->relid, colmap))
+ {
+ ereport(NOTICE, (errmsg("pg_stat_advisor suggestion: CREATE STATISTICS %s %s FROM %s",
+ stat_name, create_stat_stmt, rel_name),
+ errhidestmt(true)));
+ }
+ }
+
+ bms_free(colmap);
+ }
+}
+
+/**
+ * Suggest statistics for node
+ */
+static void
+SuggestMultiColumnStatisticsForNode(PlanState *planstate, ExplainState *es)
+{
+ Plan *plan = planstate->plan;
+
+ if (planstate->instrument && plan->plan_rows != 0 &&
+ pg_stat_advisor_suggest_statistics_threshold > 0.0 &&
+ planstate->instrument->ntuples / plan->plan_rows >= pg_stat_advisor_suggest_statistics_threshold)
+ {
+ elog(DEBUG1, "Estimated=%f, actual=%f, error=%f: plan=%s",
+ plan->plan_rows,
+ planstate->instrument->ntuples,
+ planstate->instrument->ntuples / plan->plan_rows,
+ nodeToString(plan));
+ /* quals, sort keys, etc */
+ switch (nodeTag(plan))
+ {
+ case T_IndexScan:
+ SuggestMultiColumnStatisticsForQual(((IndexScan *) plan)->indexqualorig, es);
+ break;
+ case T_IndexOnlyScan:
+ SuggestMultiColumnStatisticsForQual(((IndexOnlyScan *) plan)->indexqual, es);
+ break;
+ case T_BitmapIndexScan:
+ SuggestMultiColumnStatisticsForQual(((BitmapIndexScan *) plan)->indexqualorig, es);
+ break;
+ case T_NestLoop:
+ SuggestMultiColumnStatisticsForQual(((NestLoop *) plan)->join.joinqual, es);
+ break;
+ case T_MergeJoin:
+ SuggestMultiColumnStatisticsForQual(((MergeJoin *) plan)->mergeclauses, es);
+ SuggestMultiColumnStatisticsForQual(((MergeJoin *) plan)->join.joinqual, es);
+ break;
+ case T_HashJoin:
+ SuggestMultiColumnStatisticsForQual(((HashJoin *) plan)->hashclauses, es);
+ SuggestMultiColumnStatisticsForQual(((HashJoin *) plan)->join.joinqual, es);
+ break;
+ default:
+ break;
+ }
+ SuggestMultiColumnStatisticsForQual(plan->qual, es);
+ }
+
+ /* initPlan-s */
+ if (planstate->initPlan)
+ SuggestMultiColumnStatisticsForSubPlans(planstate->initPlan, es);
+
+ /* lefttree */
+ if (outerPlanState(planstate))
+ SuggestMultiColumnStatisticsForNode(outerPlanState(planstate), es);
+
+ /* righttree */
+ if (innerPlanState(planstate))
+ SuggestMultiColumnStatisticsForNode(innerPlanState(planstate), es);
+
+ /* special child plans */
+ switch (nodeTag(plan))
+ {
+ case T_Append:
+ SuggestMultiColumnStatisticsForMemberNodes(((AppendState *) planstate)->appendplans,
+ ((AppendState *) planstate)->as_nplans,
+ es);
+ break;
+ case T_MergeAppend:
+ SuggestMultiColumnStatisticsForMemberNodes(((MergeAppendState *) planstate)->mergeplans,
+ ((MergeAppendState *) planstate)->ms_nplans,
+ es);
+ break;
+ case T_BitmapAnd:
+ SuggestMultiColumnStatisticsForMemberNodes(((BitmapAndState *) planstate)->bitmapplans,
+ ((BitmapAndState *) planstate)->nplans,
+ es);
+ break;
+ case T_BitmapOr:
+ SuggestMultiColumnStatisticsForMemberNodes(((BitmapOrState *) planstate)->bitmapplans,
+ ((BitmapOrState *) planstate)->nplans,
+ es);
+ break;
+ case T_SubqueryScan:
+ SuggestMultiColumnStatisticsForNode(((SubqueryScanState *) planstate)->subplan, es);
+ break;
+ default:
+ break;
+ }
+}
diff --git a/contrib/pg_stat_advisor/sql/pg_stat_advisor.sql b/contrib/pg_stat_advisor/sql/pg_stat_advisor.sql
new file mode 100644
index 0000000000..3fd8b24416
--- /dev/null
+++ b/contrib/pg_stat_advisor/sql/pg_stat_advisor.sql
@@ -0,0 +1,50 @@
+LOAD 'pg_stat_advisor';
+SET pg_stat_advisor.suggest_statistics_threshold = 0.1;
+
+-------------------------------Check printing suggestion-----------------------------------------------------------
+
+CREATE TABLE t (i INT, j INT);
+INSERT INTO t SELECT i/10, i/100 FROM GENERATE_SERIES(1,1000000) i;
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+
+CREATE STATISTICS t_i_j ON i, j FROM t;
+
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+
+DROP STATISTICS t_i_j;
+
+-------------------------------Check existing statistics except suggested name-------------------------------------
+
+CREATE STATISTICS statistics_t_i_j ON i, j FROM t;
+
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+
+DROP STATISTICS statistics_t_i_j;
+DROP TABLE t;
+
+-------------------------------Check suggest_statistics_threshold parameter----------------------------------------
+
+SET pg_stat_advisor.suggest_statistics_threshold = 10.0;
+
+CREATE TABLE inner_tab(x INT, y INT);
+CREATE TABLE outer_tab(pk INT PRIMARY KEY, x INT, y INT);
+CREATE INDEX ON inner_tab(x,y);
+INSERT INTO outer_tab VALUES (GENERATE_SERIES(1,100000), GENERATE_SERIES(1,100000), GENERATE_SERIES(1,100000)*10);
+INSERT INTO inner_tab VALUES (GENERATE_SERIES(1,1000000)/10, GENERATE_SERIES(1,1000000)/10*10);
+ANALYZE inner_tab;
+ANALYZE outer_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
+
+CREATE STATISTICS inner_tab_x_y ON x, y FROM inner_tab;
+
+ANALYZE inner_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
+
+DROP STATISTICS inner_tab_x_y;
+SET pg_stat_advisor.suggest_statistics_threshold = 10.1;
+
+ANALYZE inner_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
\ No newline at end of file
--
2.34.1
Import Notes
Resolved by subject fallback
Dear Team,
Firstly, I would like to extend my sincere apologies for the confusion
and technical oversights in our previous discussions regarding the
'pg_stat_advisor extension'. To address this and facilitate a clearer,
more focused dialogue, I have initiated a new thread to consolidate our
discussions on this matter.
For context, our previous conversation can be found here:
/messages/by-id/4681151706615977@mail.yandex.ru.
The extension 'pg_stat_advisor' extension is architected to optimize
query plan. It operates by suggesting when to create extended
statistics, particularly in queries where current selectivity estimates
fall short. This is achieved through the GUC parameter
'pg_stat_advisor.suggest_statistics_threshold', which assesses the ratio
of total tuples compared to the planned rows. This feature is
instrumental in identifying scenarios where the planner's estimates
could be optimized.
You can install the extension by:
```
LOAD 'pg_stat_advisor'
SET pg_stat_advisor.suggest_statistics_threshold = 1.0;
```
Example:
```
EXPLAIN ANALYZE SELECT * FROM t WHERE i = 100 AND j = 10;
NOTICE: pg_stat_advisor suggestion: CREATE STATISTICS t_i_j ON i, j FROM t
QUERY PLAN
--------------------------------------------------------------------------------------------------------
```
After EXPLAIN ANALYZE command you can see the message of suggestion
creating statistics with name 't_i_j' on 'i', 'j' columns from 't' table.
Thank you for your understanding, patience, and continued support.
Best regards,
Ilia Evdokimov,
Tantor Labs LLC.
On Tue, Feb 6, 2024 at 12:06 AM Ilia Evdokimov
<ilya.evdokimov@tantorlabs.com> wrote:
Hi hackers,
I'm reaching out again regarding the patch with new extension 'pg_stat_advisor' aimed at enhancing query plan efficiency through the suggestion of creating statistics.
I understand the community is busy, but I would greatly value any feedback or thoughts on this extension.
+ /* Define custom GUC variables. */
+ DefineCustomRealVariable("pg_stat_advisor.suggest_statistics_threshold",
+ "Set the threshold for actual/estimated rows",
+ "Zero disables suggestion of creating statistics",
+ &pg_stat_advisor_suggest_statistics_threshold,
+ 0.0,
+ 0.0,
+ INT_MAX,
+ PGC_SUSET,
+ 0,
+ NULL,
+ NULL,
+ NULL);
INT_MAX
should be 1.0?
+ if (!FindExtendedStatisticsOnVars(&rte->relid, colmap))
+ {
+ ereport(NOTICE, (errmsg("pg_stat_advisor suggestion: CREATE
STATISTICS %s %s FROM %s",
+ stat_name, create_stat_stmt, rel_name),
+ errhidestmt(true)));
+ }
now CREATE STATISTICS, the statistics name is optional.
but here you explicitly mention the statistics kind would be great.
+ elog(DEBUG1, "Estimated=%f, actual=%f, error=%f: plan=%s",
+ plan->plan_rows,
+ planstate->instrument->ntuples,
+ planstate->instrument->ntuples / plan->plan_rows,
+ nodeToString(plan));
` error=%f` seems not that right.
Also since the documentation is limited, more comments explaining
SuggestMultiColumnStatisticsForNode would be great.
overall the comments are very little, it should be more (that's my opinion).
On 6/2/2024 22:27, Ilia Evdokimov wrote:
I welcome your insights, feedback, and evaluations regarding the
necessity of integrating this new extension into PostgreSQL.
Besides other issues that were immediately raised during the discovery
of the extension, Let me emphasize two issues:
1. In the case of parallel workers the plan_rows value has a different
semantics than the number of rows predicted. Just explore
get_parallel_divisor().
2. The extension recommends new statistics immediately upon an error
finding. But what if the reason for the error is stale statistics? Or
this error may be raised for only one specific set of constants, and
estimation will be done well in another 99.9999% of cases for the same
expression.
According to No.2, it might make sense to collect and track clause
combinations and cardinality errors found and let the DBA make decisions
on their own.
--
regards,
Andrei Lepikhov
Postgres Professional
On Feb 8 2024 at 00:00:00 jian he
INT MAX
should be 1.0?
I don’t know why Konstantin Knizhnik used the ratio of actual tuples to
the planned ones, but most who start testing my extension expect that it
will be a coefficient from 0 to 1, which will be the ratio of the
estimated tuples to the actual ones. Therefore, I changed the value of
this coefficient the other way around and now the value can be from 0 to
1. The patch with changes has been attached.
now CREATE STATISTICS, the statistics name is optional
I constructed the name of the statistics so that the user could copy the
line with 'CREATE STATISTICS' with the mouse and execute this command
faster. But if the user wants ITS name, he can do it manually.
here you can explicitly mention the statistics kind would be great
I agree with you. That would be my next step. That's why I'm doing it now.
Also since the documentation is limited, more comments
explainingSuggestMultiColumnStatisticsForNode would be great.
overall the comments are very little, it should be more (that's my opinion).
Yes, certainly. I'll do it in the next patch.
I'm looking forward to your thoughts and feedback.
Regards,
Ilia Evdokimov,
Tantor Labs LLC.
Attachments:
0003-pg_stat_advisor-extension.patchtext/x-patch; charset=UTF-8; name=0003-pg_stat_advisor-extension.patchDownload
From f87f4a57e532d57f43dab4764d08ddf83d9f3d8f Mon Sep 17 00:00:00 2001
From: Ilia Evdokimov <ilya.evdokimov@tantorlabs.ru>
Date: Thu, 8 Feb 2024 16:00:57 +0300
Subject: [PATCH] 'pg_stat_advisor' extension.
This serves as a hook into the executor for determining total rows and planned rows.
The process starts by checking the `pg_stat_advisor.suggest_statistics_threshold` GUC parameter.
If it's set to 0.0, the extension does not proceed further. When the parameter is greater than 0.0,
the extension evaluates the accuracy of planned rows. A suggestion for creating statistics is made
if the ratio of total rows to planned rows is greater than or equal to this threshold.
Only then does it extract the relation and columns from the query.
The extension checks pg_statistic_ext for existing relevant statistics. If no statistics are found,
it prints a notice suggesting the creation of statistics, using the naming format 'relationName_columns'.
Author: Ilia Evdokimov
---
contrib/Makefile | 1 +
contrib/meson.build | 1 +
contrib/pg_stat_advisor/.gitignore | 3 +
contrib/pg_stat_advisor/Makefile | 20 +
contrib/pg_stat_advisor/README.md | 85 +++
.../expected/pg_stat_advisor.out | 96 ++++
contrib/pg_stat_advisor/meson.build | 30 ++
contrib/pg_stat_advisor/pg_stat_advisor.c | 482 ++++++++++++++++++
.../pg_stat_advisor/sql/pg_stat_advisor.sql | 50 ++
9 files changed, 768 insertions(+)
create mode 100644 contrib/pg_stat_advisor/.gitignore
create mode 100644 contrib/pg_stat_advisor/Makefile
create mode 100644 contrib/pg_stat_advisor/README.md
create mode 100644 contrib/pg_stat_advisor/expected/pg_stat_advisor.out
create mode 100644 contrib/pg_stat_advisor/meson.build
create mode 100644 contrib/pg_stat_advisor/pg_stat_advisor.c
create mode 100644 contrib/pg_stat_advisor/sql/pg_stat_advisor.sql
diff --git a/contrib/Makefile b/contrib/Makefile
index da4e2316a3..da9a4ceeaa 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -34,6 +34,7 @@ SUBDIRS = \
pg_buffercache \
pg_freespacemap \
pg_prewarm \
+ pg_stat_advisor \
pg_stat_statements \
pg_surgery \
pg_trgm \
diff --git a/contrib/meson.build b/contrib/meson.build
index c12dc906ca..a20d99443b 100644
--- a/contrib/meson.build
+++ b/contrib/meson.build
@@ -49,6 +49,7 @@ subdir('pgcrypto')
subdir('pg_freespacemap')
subdir('pg_prewarm')
subdir('pgrowlocks')
+subdir('pg_stat_advisor')
subdir('pg_stat_statements')
subdir('pgstattuple')
subdir('pg_surgery')
diff --git a/contrib/pg_stat_advisor/.gitignore b/contrib/pg_stat_advisor/.gitignore
new file mode 100644
index 0000000000..913175ff6e
--- /dev/null
+++ b/contrib/pg_stat_advisor/.gitignore
@@ -0,0 +1,3 @@
+/log/
+/results/
+/tmp_check/
diff --git a/contrib/pg_stat_advisor/Makefile b/contrib/pg_stat_advisor/Makefile
new file mode 100644
index 0000000000..f31b939e8a
--- /dev/null
+++ b/contrib/pg_stat_advisor/Makefile
@@ -0,0 +1,20 @@
+# contrib/pg_stat_advisor/Makefile
+
+MODULE_big = pg_stat_advisor
+OBJS = \
+ $(WIN32RES) \
+ pg_stat_advisor.o
+PGFILEDESC = "pg_stat_advisor - analyze query performance and recommend the creation of additional statistics"
+
+REGRESS = pg_stat_advisor
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/pg_stat_advisor
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/pg_stat_advisor/README.md b/contrib/pg_stat_advisor/README.md
new file mode 100644
index 0000000000..3f4e97f195
--- /dev/null
+++ b/contrib/pg_stat_advisor/README.md
@@ -0,0 +1,85 @@
+## pg_stat_advisor - PostgreSQL advisor to create statistics
+
+pg_stat_advisor is a PostgreSQL extension designed to analyze query performance and recommend the creation of additional statistics to improve query plan.
+
+Append pg_stat_advisor to shared_preload_libraries configuration parameter in your postgresql.conf file then restart the PostgreSQL database to apply the changes. Or you can use "LOAD 'pg_stat_advisor';"command
+```
+LOAD 'pg_stat_advisor';
+```
+
+There is the pg_stat_advisor.suggest_statistics_threshold GUC that can be used to set a suggest_statistics_threshold. It is the the ratio of the planned rows compared to the total tuples produced. If parameter is set by 0, the printing switches off.
+
+For example:
+```
+SET pg_stat_advisor.suggest_statistics_threshold = 0.1;
+```
+
+Examples:
+
+
+```
+postgres=# create table t (i int, j int);
+CREATE TABLE
+postgres=# insert into t select i/10, i/100 from generate_series(1, 1000000) i;
+INSERT 0 1000000
+postgres=# analyze t;
+ANALYZE
+postgres=# explain analyze select * from t where i = 100 and j = 10;
+ QUERY PLAN
+
+----------------------------------------------------------------------------------------------
+------------------
+ Gather (cost=1000.00..11675.10 rows=1 width=8) (actual time=0.526..61.564 rows=10 loops=1)
+ Workers Planned: 2
+ Workers Launched: 2
+ -> Parallel Seq Scan on t (cost=0.00..10675.00 rows=1 width=8) (actual time=35.369..54.44
+7 rows=3 loops=3)
+ Filter: ((i = 100) AND (j = 10))
+ Rows Removed by Filter: 333330
+ Planning Time: 0.148 ms
+ Execution Time: 61.589 ms
+(8 rows)
+
+
+postgres=# load 'pg_stat_advisor';
+LOAD
+postgres=# set pg_stat_advisor.suggest_statistics_threshold = 0.1;
+SET
+postgres=# explain analyze select * from t where i = 100 and j = 10;
+NOTICE: pg_stat_advisor suggestion: CREATE STATISTICS t_i_j ON i, j FROM t
+ QUERY PLAN
+
+----------------------------------------------------------------------------------------------
+------------------
+ Gather (cost=1000.00..11675.10 rows=1 width=8) (actual time=0.400..59.292 rows=10 loops=1)
+ Workers Planned: 2
+ Workers Launched: 2
+ -> Parallel Seq Scan on t (cost=0.00..10675.00 rows=1 width=8) (actual time=35.614..54.29
+1 rows=3 loops=3)
+ Filter: ((i = 100) AND (j = 10))
+ Rows Removed by Filter: 333330
+ Planning Time: 0.081 ms
+ Execution Time: 59.413 ms
+(8 rows)
+
+
+postgres=# create statistics stat_t_i_j on i, j from t;
+CREATE STATISTICS
+postgres=# analyze t;
+ANALYZE
+postgres=# explain analyze select * from t where i = 100 and j = 10;
+ QUERY PLAN
+
+----------------------------------------------------------------------------------------------
+------------------
+ Gather (cost=1000.00..11675.10 rows=10 width=8) (actual time=0.400..59.292 rows=10 loops=1)
+ Workers Planned: 2
+ Workers Launched: 2
+ -> Parallel Seq Scan on t (cost=0.00..10675.00 rows=1 width=8) (actual time=35.614..54.29
+1 rows=3 loops=3)
+ Filter: ((i = 100) AND (j = 10))
+ Rows Removed by Filter: 333330
+ Planning Time: 0.081 ms
+ Execution Time: 59.413 ms
+(8 rows)
+```
diff --git a/contrib/pg_stat_advisor/expected/pg_stat_advisor.out b/contrib/pg_stat_advisor/expected/pg_stat_advisor.out
new file mode 100644
index 0000000000..c41c410857
--- /dev/null
+++ b/contrib/pg_stat_advisor/expected/pg_stat_advisor.out
@@ -0,0 +1,96 @@
+LOAD 'pg_stat_advisor';
+SET pg_stat_advisor.suggest_statistics_threshold = 0.1;
+-------------------------------Check printing suggestion-----------------------------------------------------------
+CREATE TABLE t (i INT, j INT);
+INSERT INTO t SELECT i/10, i/100 FROM GENERATE_SERIES(1,1000000) i;
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+NOTICE: pg_stat_advisor suggestion: CREATE STATISTICS t_i_j ON i, j FROM t
+ QUERY PLAN
+------------------------------------------------------
+ Gather (actual rows=10 loops=1)
+ Workers Planned: 2
+ Workers Launched: 2
+ -> Parallel Seq Scan on t (actual rows=3 loops=3)
+ Filter: ((i = 100) AND (j = 10))
+ Rows Removed by Filter: 333330
+(6 rows)
+
+CREATE STATISTICS t_i_j ON i, j FROM t;
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+ QUERY PLAN
+------------------------------------------------------
+ Gather (actual rows=10 loops=1)
+ Workers Planned: 2
+ Workers Launched: 2
+ -> Parallel Seq Scan on t (actual rows=3 loops=3)
+ Filter: ((i = 100) AND (j = 10))
+ Rows Removed by Filter: 333330
+(6 rows)
+
+DROP STATISTICS t_i_j;
+-------------------------------Check existing statistics except suggested name-------------------------------------
+CREATE STATISTICS statistics_t_i_j ON i, j FROM t;
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+ QUERY PLAN
+------------------------------------------------------
+ Gather (actual rows=10 loops=1)
+ Workers Planned: 2
+ Workers Launched: 2
+ -> Parallel Seq Scan on t (actual rows=3 loops=3)
+ Filter: ((i = 100) AND (j = 10))
+ Rows Removed by Filter: 333330
+(6 rows)
+
+DROP STATISTICS statistics_t_i_j;
+DROP TABLE t;
+-------------------------------Check suggest_statistics_threshold parameter----------------------------------------
+SET pg_stat_advisor.suggest_statistics_threshold = 0.1;
+CREATE TABLE inner_tab(x INT, y INT);
+CREATE TABLE outer_tab(pk INT PRIMARY KEY, x INT, y INT);
+CREATE INDEX ON inner_tab(x,y);
+INSERT INTO outer_tab VALUES (GENERATE_SERIES(1,100000), GENERATE_SERIES(1,100000), GENERATE_SERIES(1,100000)*10);
+INSERT INTO inner_tab VALUES (GENERATE_SERIES(1,1000000)/10, GENERATE_SERIES(1,1000000)/10*10);
+ANALYZE inner_tab;
+ANALYZE outer_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
+NOTICE: pg_stat_advisor suggestion: CREATE STATISTICS inner_tab_x_y ON x, y FROM inner_tab
+ QUERY PLAN
+-------------------------------------------------------------------------------------
+ Nested Loop (actual rows=10 loops=1)
+ -> Index Scan using outer_tab_pkey on outer_tab (actual rows=1 loops=1)
+ Index Cond: (pk = 1)
+ -> Index Only Scan using inner_tab_x_y_idx on inner_tab (actual rows=10 loops=1)
+ Index Cond: ((x = outer_tab.x) AND (y = outer_tab.y))
+ Heap Fetches: 10
+(6 rows)
+
+CREATE STATISTICS inner_tab_x_y ON x, y FROM inner_tab;
+ANALYZE inner_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
+ QUERY PLAN
+-------------------------------------------------------------------------------------
+ Nested Loop (actual rows=10 loops=1)
+ -> Index Scan using outer_tab_pkey on outer_tab (actual rows=1 loops=1)
+ Index Cond: (pk = 1)
+ -> Index Only Scan using inner_tab_x_y_idx on inner_tab (actual rows=10 loops=1)
+ Index Cond: ((x = outer_tab.x) AND (y = outer_tab.y))
+ Heap Fetches: 10
+(6 rows)
+
+DROP STATISTICS inner_tab_x_y;
+SET pg_stat_advisor.suggest_statistics_threshold = 0.09;
+ANALYZE inner_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
+ QUERY PLAN
+-------------------------------------------------------------------------------------
+ Nested Loop (actual rows=10 loops=1)
+ -> Index Scan using outer_tab_pkey on outer_tab (actual rows=1 loops=1)
+ Index Cond: (pk = 1)
+ -> Index Only Scan using inner_tab_x_y_idx on inner_tab (actual rows=10 loops=1)
+ Index Cond: ((x = outer_tab.x) AND (y = outer_tab.y))
+ Heap Fetches: 10
+(6 rows)
+
diff --git a/contrib/pg_stat_advisor/meson.build b/contrib/pg_stat_advisor/meson.build
new file mode 100644
index 0000000000..292917c025
--- /dev/null
+++ b/contrib/pg_stat_advisor/meson.build
@@ -0,0 +1,30 @@
+# Copyright (c) 2022-2024, PostgreSQL Global Development Group
+
+pg_stat_advisor_sources = files(
+ 'pg_stat_advisor.c',
+)
+
+if host_system == 'windows'
+ pg_stat_advisor_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+ '--NAME', 'pg_stat_advsior',
+ '--FILEDESC', 'pg_stat_advisor - suggestion of creating statistics',])
+endif
+
+pg_stat_advisor = shared_module('pg_stat_advisor',
+ pg_stat_advisor_sources,
+ kwargs: contrib_mod_args + {
+ 'dependencies': contrib_mod_args['dependencies'],
+ },
+)
+contrib_targets += pg_stat_advisor
+
+tests += {
+ 'name': 'pg_stat_advisor',
+ 'sd': meson.current_source_dir(),
+ 'bd': meson.current_build_dir(),
+ 'regress': {
+ 'sql': [
+ 'pg_stat_advisor',
+ ],
+ },
+}
diff --git a/contrib/pg_stat_advisor/pg_stat_advisor.c b/contrib/pg_stat_advisor/pg_stat_advisor.c
new file mode 100644
index 0000000000..817ab36253
--- /dev/null
+++ b/contrib/pg_stat_advisor/pg_stat_advisor.c
@@ -0,0 +1,482 @@
+#include "postgres.h"
+
+#include "access/hash.h"
+#include "access/table.h"
+#include "catalog/pg_statistic_ext.h"
+#include "commands/explain.h"
+#include "optimizer/optimizer.h"
+#include "parser/parsetree.h"
+#include "statistics/statistics.h"
+#include "utils/guc.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+
+PG_MODULE_MAGIC;
+
+/* GUC variables */
+static double pg_stat_advisor_suggest_statistics_threshold = 0.0;
+
+/* Current nesting depth of ExecutorRun calls */
+static int nesting_level = 0;
+
+#define pg_stat_advisor_enabled() (nesting_level == 0)
+
+void _PG_init(void);
+
+/* Saved hook values in case of unload */
+static ExecutorStart_hook_type prev_ExecutorStart = NULL;
+static ExecutorRun_hook_type prev_ExecutorRun = NULL;
+static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
+static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
+
+static void explain_ExecutorStart(QueryDesc *queryDesc, int eflags);
+static void explain_ExecutorRun(QueryDesc *queryDesc,
+ ScanDirection direction,
+ uint64 count, bool execute_once);
+static void explain_ExecutorFinish(QueryDesc *queryDesc);
+static void explain_ExecutorEnd(QueryDesc *queryDesc);
+
+static void SuggestMultiColumnStatisticsForNode(PlanState *planstate, ExplainState *es);
+
+/*
+ * Module load callback
+ */
+void
+_PG_init(void)
+{
+ /* Define custom GUC variables. */
+ DefineCustomRealVariable("pg_stat_advisor.suggest_statistics_threshold",
+ "Set the threshold for estimated/actual rows",
+ "Zero disables suggestion of creating statistics",
+ &pg_stat_advisor_suggest_statistics_threshold,
+ 0.0,
+ 0.0,
+ 1.0,
+ PGC_SUSET,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
+ MarkGUCPrefixReserved("pg_stat_advisor");
+
+ /* Install hooks. */
+ prev_ExecutorStart = ExecutorStart_hook;
+ ExecutorStart_hook = explain_ExecutorStart;
+ prev_ExecutorRun = ExecutorRun_hook;
+ ExecutorRun_hook = explain_ExecutorRun;
+ prev_ExecutorFinish = ExecutorFinish_hook;
+ ExecutorFinish_hook = explain_ExecutorFinish;
+ prev_ExecutorEnd = ExecutorEnd_hook;
+ ExecutorEnd_hook = explain_ExecutorEnd;
+}
+
+/*
+ * ExecutorStart hook: start up logging if needed
+ */
+static void
+explain_ExecutorStart(QueryDesc *queryDesc, int eflags)
+{
+ if (prev_ExecutorStart)
+ prev_ExecutorStart(queryDesc, eflags);
+ else
+ standard_ExecutorStart(queryDesc, eflags);
+
+ if (pg_stat_advisor_enabled() && queryDesc->totaltime == NULL)
+ {
+ MemoryContext oldcxt;
+
+ oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
+ queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
+ MemoryContextSwitchTo(oldcxt);
+ }
+}
+
+/*
+ * ExecutorRun hook: all we need do is track nesting depth
+ */
+static void
+explain_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction,
+ uint64 count, bool execute_once)
+{
+ nesting_level++;
+ PG_TRY();
+ {
+ if (prev_ExecutorRun)
+ prev_ExecutorRun(queryDesc, direction, count, execute_once);
+ else
+ standard_ExecutorRun(queryDesc, direction, count, execute_once);
+ }
+ PG_FINALLY();
+ {
+ nesting_level--;
+ }
+ PG_END_TRY();
+}
+
+/*
+ * ExecutorFinish hook: all we need do is track nesting depth
+ */
+static void
+explain_ExecutorFinish(QueryDesc *queryDesc)
+{
+ nesting_level++;
+ PG_TRY();
+ {
+ if (prev_ExecutorFinish)
+ prev_ExecutorFinish(queryDesc);
+ else
+ standard_ExecutorFinish(queryDesc);
+ }
+ PG_FINALLY();
+ {
+ nesting_level--;
+ }
+ PG_END_TRY();
+}
+
+/*
+ * ExecutorEnd hook: log results if needed
+ */
+static void
+explain_ExecutorEnd(QueryDesc *queryDesc)
+{
+ if (queryDesc->totaltime && pg_stat_advisor_enabled())
+ {
+ MemoryContext oldcxt;
+ ExplainState *es;
+
+ /*
+ * Make sure we operate in the per-query context, so any cruft will be
+ * discarded later during ExecutorEnd.
+ */
+ oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
+
+ /*
+ * Make sure stats accumulation is done. (Note: it's okay if several
+ * levels of hook all do this.)
+ */
+ InstrEndLoop(queryDesc->totaltime);
+
+ es = NewExplainState();
+
+ es->analyze = queryDesc->instrument_options;
+
+ ExplainBeginOutput(es);
+ ExplainQueryText(es, queryDesc);
+
+ ExplainPrintPlan(es, queryDesc);
+ if (es->analyze)
+ ExplainPrintTriggers(es, queryDesc);
+ if (es->costs)
+ ExplainPrintJITSummary(es, queryDesc);
+ ExplainEndOutput(es);
+
+ if (pg_stat_advisor_suggest_statistics_threshold && !IsParallelWorker())
+ SuggestMultiColumnStatisticsForNode(queryDesc->planstate, es);
+
+ MemoryContextSwitchTo(oldcxt);
+ }
+
+ if (prev_ExecutorEnd)
+ prev_ExecutorEnd(queryDesc);
+ else
+ standard_ExecutorEnd(queryDesc);
+}
+
+/**
+ * Comparator used to sort Vars by name
+ */
+static int
+vars_list_comparator(const ListCell *a, const ListCell *b)
+{
+ char *va = strVal(linitial(((ColumnRef *) lfirst(a))->fields));
+ char *vb = strVal(linitial(((ColumnRef *) lfirst(b))->fields));
+
+ return strcmp(va, vb);
+}
+
+/**
+ * Suggest statistics for specified subplans.
+ */
+static void
+SuggestMultiColumnStatisticsForSubPlans(List *plans, ExplainState *es)
+{
+ ListCell *lst;
+
+ foreach(lst, plans)
+ {
+ SubPlanState *sps = (SubPlanState *) lfirst(lst);
+
+ SuggestMultiColumnStatisticsForNode(sps->planstate, es);
+ }
+}
+
+/**
+ * Suggest statistics for plan subnodes.
+ */
+static void
+SuggestMultiColumnStatisticsForMemberNodes(PlanState **planstates, int nsubnodes,
+ ExplainState *es)
+{
+ int j;
+
+ for (j = 0; j < nsubnodes; j++)
+ SuggestMultiColumnStatisticsForNode(planstates[j], es);
+}
+
+/**
+ * Find extended statistics from 'relation_oid' relation on colmap
+ */
+static bool
+FindExtendedStatisticsOnVars(Oid *relation_oid, Bitmapset *colmap)
+{
+ bool isExistExtendedStatistics = false;
+ ListCell *oid;
+
+ /* Receive all extended statistics for current relation */
+ Relation pg_stextdata = table_open(*relation_oid, RowExclusiveLock);
+ List *statistics = RelationGetStatExtList(pg_stextdata);
+
+ foreach(oid, statistics)
+ {
+ Oid statOid = lfirst_oid(oid);
+ Form_pg_statistic_ext staForm;
+ HeapTuple htup;
+ Bitmapset *keys = NULL;
+ int i;
+
+ /* Read from pg_statistic_ext */
+ htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid));
+ if (!HeapTupleIsValid(htup))
+ elog(ERROR, "cache lookup failed for statistics object %u", statOid);
+ staForm = (Form_pg_statistic_ext) GETSTRUCT(htup);
+
+ for (i = 0; i < staForm->stxkeys.dim1; i++)
+ keys = bms_add_member(keys, staForm->stxkeys.values[i]);
+
+ /* If we have statistics for current columns */
+ if (!bms_compare(keys, colmap))
+ {
+ isExistExtendedStatistics = true;
+
+ elog(DEBUG1, "pg_stat_advisor: found statistics for %s relation - %s",
+ get_rel_name(*relation_oid),
+ get_rel_name(statOid));
+
+ ReleaseSysCache(htup);
+ bms_free(keys);
+
+ break;
+ }
+
+ ReleaseSysCache(htup);
+ bms_free(keys);
+ }
+
+ list_free(statistics);
+ table_close(pg_stextdata, RowExclusiveLock);
+
+ return isExistExtendedStatistics;
+}
+
+/**
+ * Suggest statistics for qual
+ */
+static void
+SuggestMultiColumnStatisticsForQual(List *qual, ExplainState *es)
+{
+ List *vars = NULL;
+ ListCell *lc;
+
+ /* Extract vars from all quals */
+ foreach(lc, qual)
+ {
+ Node *node = (Node *) lfirst(lc);
+
+ if (IsA(node, RestrictInfo))
+ node = (Node *) ((RestrictInfo *) node)->clause;
+ vars = list_concat(vars, pull_vars_of_level(node, 0));
+ }
+
+ /* Loop until we considered all vars */
+ while (vars != NULL)
+ {
+ ListCell *cell;
+ List *cols = NULL;
+ Index relno = 0;
+ Bitmapset *colmap = NULL;
+
+ /* Contruct list of unique vars */
+ foreach(cell, vars)
+ {
+ Node *node = (Node *) lfirst(cell);
+
+ if (IsA(node, Var))
+ {
+ Var *var = (Var *) node;
+ int varno = IS_SPECIAL_VARNO(var->varno) ? var->varnosyn : var->varno;
+
+ if (cols == NULL || varno == relno)
+ {
+ int varattno = IS_SPECIAL_VARNO(var->varno) ? var->varattnosyn : var->varattno;
+
+ relno = varno;
+ if (var->varattno > 0 &&
+ !bms_is_member(varattno, colmap) &&
+ varno >= 1 && /* not synthetic var */
+ varno <= list_length(es->rtable) &&
+ list_length(cols) < STATS_MAX_DIMENSIONS)
+ {
+ RangeTblEntry *rte = rt_fetch(varno, es->rtable);
+
+ if (rte->rtekind == RTE_RELATION)
+ {
+ ColumnRef *col = makeNode(ColumnRef);
+ char *colname = get_rte_attribute_name(rte, varattno);
+
+ col->fields = list_make1(makeString(colname));
+ cols = lappend(cols, col);
+ colmap = bms_add_member(colmap, varattno);
+ }
+ }
+ }
+ else
+ {
+ continue;
+ }
+ }
+ vars = foreach_delete_current(vars, cell);
+ }
+ /* To suggest statitics we need to have at least 2 columns */
+ if (list_length(cols) >= 2)
+ {
+ RangeTblEntry *rte = rt_fetch(relno, es->rtable);
+ char *rel_name = get_rel_name(rte->relid);
+ char *stat_name = rel_name;
+ char *create_stat_stmt = (char *) "";
+ char const *sep = "ON";
+ size_t name_len;
+
+ /* Sort variables by name */
+ list_sort(cols, vars_list_comparator);
+
+ /*
+ * Construct name for statistic by concatenating relation name
+ * with all columns
+ */
+ foreach(cell, cols)
+ {
+ char *col_name = strVal(linitial(((ColumnRef *) lfirst(cell))->fields));
+
+ stat_name = psprintf("%s_%s", stat_name, col_name);
+ create_stat_stmt = psprintf("%s%s %s", create_stat_stmt, sep, col_name);
+ sep = ",";
+ }
+
+ name_len = strlen(stat_name);
+ /* Truncate name if it doesn't fit in NameData */
+ if (name_len >= NAMEDATALEN)
+ stat_name = psprintf("%.*s_%08x", NAMEDATALEN - 10, stat_name, (unsigned) hash_any((uint8 *) stat_name, name_len));
+
+ if (!FindExtendedStatisticsOnVars(&rte->relid, colmap))
+ {
+ ereport(NOTICE, (errmsg("pg_stat_advisor suggestion: CREATE STATISTICS %s %s FROM %s",
+ stat_name, create_stat_stmt, rel_name),
+ errhidestmt(true)));
+ }
+ }
+
+ bms_free(colmap);
+ }
+}
+
+/**
+ * Suggest statistics for node
+ */
+static void
+SuggestMultiColumnStatisticsForNode(PlanState *planstate, ExplainState *es)
+{
+ Plan *plan = planstate->plan;
+
+ if (planstate->instrument && plan->plan_rows != 0 &&
+ pg_stat_advisor_suggest_statistics_threshold > 0.0 &&
+ plan->plan_rows / planstate->instrument->ntuples <= pg_stat_advisor_suggest_statistics_threshold)
+ {
+ elog(DEBUG1, "pg_stat_advisor: suggest statistics due to following parameters \
+ estimated=%f, actual=%f, ratio=%f: plan=%s",
+ plan->plan_rows,
+ planstate->instrument->ntuples,
+ plan->plan_rows / planstate->instrument->ntuples,
+ nodeToString(plan));
+ /* quals, sort keys, etc */
+ switch (nodeTag(plan))
+ {
+ case T_IndexScan:
+ SuggestMultiColumnStatisticsForQual(((IndexScan *) plan)->indexqualorig, es);
+ break;
+ case T_IndexOnlyScan:
+ SuggestMultiColumnStatisticsForQual(((IndexOnlyScan *) plan)->indexqual, es);
+ break;
+ case T_BitmapIndexScan:
+ SuggestMultiColumnStatisticsForQual(((BitmapIndexScan *) plan)->indexqualorig, es);
+ break;
+ case T_NestLoop:
+ SuggestMultiColumnStatisticsForQual(((NestLoop *) plan)->join.joinqual, es);
+ break;
+ case T_MergeJoin:
+ SuggestMultiColumnStatisticsForQual(((MergeJoin *) plan)->mergeclauses, es);
+ SuggestMultiColumnStatisticsForQual(((MergeJoin *) plan)->join.joinqual, es);
+ break;
+ case T_HashJoin:
+ SuggestMultiColumnStatisticsForQual(((HashJoin *) plan)->hashclauses, es);
+ SuggestMultiColumnStatisticsForQual(((HashJoin *) plan)->join.joinqual, es);
+ break;
+ default:
+ break;
+ }
+ SuggestMultiColumnStatisticsForQual(plan->qual, es);
+ }
+
+ /* initPlan-s */
+ if (planstate->initPlan)
+ SuggestMultiColumnStatisticsForSubPlans(planstate->initPlan, es);
+
+ /* lefttree */
+ if (outerPlanState(planstate))
+ SuggestMultiColumnStatisticsForNode(outerPlanState(planstate), es);
+
+ /* righttree */
+ if (innerPlanState(planstate))
+ SuggestMultiColumnStatisticsForNode(innerPlanState(planstate), es);
+
+ /* special child plans */
+ switch (nodeTag(plan))
+ {
+ case T_Append:
+ SuggestMultiColumnStatisticsForMemberNodes(((AppendState *) planstate)->appendplans,
+ ((AppendState *) planstate)->as_nplans,
+ es);
+ break;
+ case T_MergeAppend:
+ SuggestMultiColumnStatisticsForMemberNodes(((MergeAppendState *) planstate)->mergeplans,
+ ((MergeAppendState *) planstate)->ms_nplans,
+ es);
+ break;
+ case T_BitmapAnd:
+ SuggestMultiColumnStatisticsForMemberNodes(((BitmapAndState *) planstate)->bitmapplans,
+ ((BitmapAndState *) planstate)->nplans,
+ es);
+ break;
+ case T_BitmapOr:
+ SuggestMultiColumnStatisticsForMemberNodes(((BitmapOrState *) planstate)->bitmapplans,
+ ((BitmapOrState *) planstate)->nplans,
+ es);
+ break;
+ case T_SubqueryScan:
+ SuggestMultiColumnStatisticsForNode(((SubqueryScanState *) planstate)->subplan, es);
+ break;
+ default:
+ break;
+ }
+}
diff --git a/contrib/pg_stat_advisor/sql/pg_stat_advisor.sql b/contrib/pg_stat_advisor/sql/pg_stat_advisor.sql
new file mode 100644
index 0000000000..81c8aaab2b
--- /dev/null
+++ b/contrib/pg_stat_advisor/sql/pg_stat_advisor.sql
@@ -0,0 +1,50 @@
+LOAD 'pg_stat_advisor';
+SET pg_stat_advisor.suggest_statistics_threshold = 0.1;
+
+-------------------------------Check printing suggestion-----------------------------------------------------------
+
+CREATE TABLE t (i INT, j INT);
+INSERT INTO t SELECT i/10, i/100 FROM GENERATE_SERIES(1,1000000) i;
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+
+CREATE STATISTICS t_i_j ON i, j FROM t;
+
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+
+DROP STATISTICS t_i_j;
+
+-------------------------------Check existing statistics except suggested name-------------------------------------
+
+CREATE STATISTICS statistics_t_i_j ON i, j FROM t;
+
+ANALYZE t;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM t WHERE i = 100 AND j = 10;
+
+DROP STATISTICS statistics_t_i_j;
+DROP TABLE t;
+
+-------------------------------Check suggest_statistics_threshold parameter----------------------------------------
+
+SET pg_stat_advisor.suggest_statistics_threshold = 0.1;
+
+CREATE TABLE inner_tab(x INT, y INT);
+CREATE TABLE outer_tab(pk INT PRIMARY KEY, x INT, y INT);
+CREATE INDEX ON inner_tab(x,y);
+INSERT INTO outer_tab VALUES (GENERATE_SERIES(1,100000), GENERATE_SERIES(1,100000), GENERATE_SERIES(1,100000)*10);
+INSERT INTO inner_tab VALUES (GENERATE_SERIES(1,1000000)/10, GENERATE_SERIES(1,1000000)/10*10);
+ANALYZE inner_tab;
+ANALYZE outer_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
+
+CREATE STATISTICS inner_tab_x_y ON x, y FROM inner_tab;
+
+ANALYZE inner_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
+
+DROP STATISTICS inner_tab_x_y;
+SET pg_stat_advisor.suggest_statistics_threshold = 0.09;
+
+ANALYZE inner_tab;
+EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) SELECT * FROM outer_tab JOIN inner_tab USING(x,y) WHERE pk=1;
\ No newline at end of file
--
2.34.1
<div><div>Our further discussion of this new extension takes place in this thread: /messages/by-id/f822b674-9697-43b9-931b-4d69729a26ff@tantorlabs.com .Due to technical difficulties in the current thread, I will not be able to conduct a dialogue except in HTML format. And this will make it inconvenient for everyone to read the messages. I apologize for the inconvenience caused. Regards, Ilia Evdokimov, TantorLabs LLC.</div></div>
On Feb 08 2024 at 07:14:18, Andrei Lepikhov wrote:
1. In the case of parallel workers the plan_rows value has a different
semantics than the number of rows predicted. Just explore
get_parallel_divisor().
Yes, this is a very weighty and important issue. I need to think about
this very carefully.
2. The extension recommends new statistics immediately upon an error
finding. But what if the reason for the error is stale statistics? Or
this error may be raised for only one specific set of constants, and
estimation will be done well in another 99.9999% of cases for the same
expression.
According to No.2, it might make sense to collect and track clause
combinations and cardinality errors found and let the DBA make decisions
on their own.
Your proposal is very interesting. In my opinion, it is worth
considering updating the extended statistics if they are truly stale.
And write about this in a separate message with suggestion updating
statistics.
If I succeed, then in the next patch I will add the kind of extended
statistics to the message, deal with the parallel workers and update
statistics if necessary.
If you have additional suggestions and thoughts, feel free to write them
in this thread.
Regards,
Ilia Evdokimov,
Tantor Labs LLC.
1. In the case of parallel workers the plan_rows value has a different semantics than the number of rows predicted. Just explore
get_parallel_divisor().
2. The extension recommends new statistics immediately upon an error finding. But what if the reason for the error is stale statistics? Or
this error may be raised for only one specific set of constants, and
estimation will be done well in another 99.9999% of cases for the same
expression.
The new parameter, `pg_stat_advisor.analyze_scale_factor`, can suggest
the execution of the ANALYZE command on specific tables. The extension
now evaluates the ratio of `n_live_tup` (number of live tuples) to
`n_mod_since_analyze` (number of modifications since last analyze) in
the `pg_stat_all_tables` catalog. If this ratio exceeds the value
specified in `analyze_scale_factor`, the extension will suggest an
update to the table's statistics.
There are a lot of parameters that influences on estimated rows.
Statistics might not help improve estimated rows. This feature is
designed to provide users with data-driven insights to decide whether
updating statistics via the ANALYZE command could potentially improve
query performance. By suggesting rather than automatically executing
statistics updates, we empower you to make informed decisions based on
the specific needs and conditions of your database environment.
I've developed an extension that provides suggestions on whether to
update or create statistics for your PostgreSQL database, without
executing any changes. This approach allows you to consider various
parameters that influence row estimates and make informed decisions
about optimizing your database's performance.
Your feedback is invaluable, and we look forward to hearing about your
experiences and any improvements you might suggest. Best regards, Ilia
Evdokimov Tantor Labs LLC.
Attachments:
0001-pg_stat_advisor-extension.patchtext/x-patch; charset=UTF-8; name=0001-pg_stat_advisor-extension.patchDownload
From eb998bea96a3640d240afa63e08cc8cf98925bf7 Mon Sep 17 00:00:00 2001
From: Ilia Evdokimov <ilya.evdokimov@tantorlabs.ru>
Date: Mon, 13 May 2024 14:41:59 +0300
Subject: [PATCH] 'pg_stat_advisor' extension
This service as a hook into executor. It has two GUC-parameters.
pg_stat_advisor.analyze_scale_factor: if ratio of pg_stat_all_tables.n_live_tup to pg_stat_all_tables.n_mod_since_analyze is greater than pg_stat_advisor.analyze_scale_factor extension prints suggestion executing ANALYZE command.
pg_stat_advisor.suggest_statistics_threshold: the ratio of total rows to planned rows is greater than or equal to this threshold the extension prints suggestion executing the creation of statistics, using the naming format 'relationName_columns'
---
contrib/Makefile | 1 +
contrib/pg_stat_advisor/Makefile | 20 +
.../expected/pg_stat_advisor.out | 52 ++
contrib/pg_stat_advisor/meson.build | 30 +
contrib/pg_stat_advisor/pg_stat_advisor.c | 560 ++++++++++++++++++
.../pg_stat_advisor/sql/pg_stat_advisor.sql | 24 +
6 files changed, 687 insertions(+)
create mode 100644 contrib/pg_stat_advisor/Makefile
create mode 100644 contrib/pg_stat_advisor/expected/pg_stat_advisor.out
create mode 100644 contrib/pg_stat_advisor/meson.build
create mode 100644 contrib/pg_stat_advisor/pg_stat_advisor.c
create mode 100644 contrib/pg_stat_advisor/sql/pg_stat_advisor.sql
diff --git a/contrib/Makefile b/contrib/Makefile
index abd780f277..d6ce2fe562 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -33,6 +33,7 @@ SUBDIRS = \
pg_buffercache \
pg_freespacemap \
pg_prewarm \
+ pg_stat_advisor \
pg_stat_statements \
pg_surgery \
pg_trgm \
diff --git a/contrib/pg_stat_advisor/Makefile b/contrib/pg_stat_advisor/Makefile
new file mode 100644
index 0000000000..f31b939e8a
--- /dev/null
+++ b/contrib/pg_stat_advisor/Makefile
@@ -0,0 +1,20 @@
+# contrib/pg_stat_advisor/Makefile
+
+MODULE_big = pg_stat_advisor
+OBJS = \
+ $(WIN32RES) \
+ pg_stat_advisor.o
+PGFILEDESC = "pg_stat_advisor - analyze query performance and recommend the creation of additional statistics"
+
+REGRESS = pg_stat_advisor
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/pg_stat_advisor
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/pg_stat_advisor/expected/pg_stat_advisor.out b/contrib/pg_stat_advisor/expected/pg_stat_advisor.out
new file mode 100644
index 0000000000..8f3dab2c2f
--- /dev/null
+++ b/contrib/pg_stat_advisor/expected/pg_stat_advisor.out
@@ -0,0 +1,52 @@
+LOAD 'pg_stat_advisor';
+SET pg_stat_advisor.analyze_scale_factor = 0.4;
+SET pg_stat_advisor.suggest_statistics_threshold = 0.11;
+CREATE TABLE my_tbl(fld_1 INTEGER, fld_2 BIGINT) WITH (autovacuum_enabled = false);
+INSERT INTO my_tbl (fld_1, fld_2)
+SELECT
+ i/100 as fld_1,
+ i/500 as fld_2
+FROM generate_series(1, 10000000) s(i);
+ANALYZE my_tbl;
+INSERT INTO my_tbl (fld_1, fld_2)
+SELECT
+ i/100 as fld_1,
+ i/500 as fld_2
+FROM generate_series(1, 10000000) s(i);
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF) SELECT * FROM my_tbl WHERE fld_1 = 1 AND fld_2 = 0;
+NOTICE: pg_stat_advisor suggestion: 'ANALYZE my_tbl'
+ QUERY PLAN
+------------------------------------------------------------
+ Gather (actual rows=200 loops=1)
+ Workers Planned: 2
+ Workers Launched: 2
+ -> Parallel Seq Scan on my_tbl (actual rows=67 loops=3)
+ Filter: ((fld_1 = 1) AND (fld_2 = 0))
+ Rows Removed by Filter: 6666600
+(6 rows)
+
+ANALYZE my_tbl;
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF) SELECT * FROM my_tbl WHERE fld_1 = 1 AND fld_2 = 0;
+NOTICE: pg_stat_advisor suggestion: CREATE STATISTICS my_tbl_fld_1_fld_2 ON fld_1, fld_2 FROM my_tbl
+ QUERY PLAN
+------------------------------------------------------------
+ Gather (actual rows=200 loops=1)
+ Workers Planned: 2
+ Workers Launched: 2
+ -> Parallel Seq Scan on my_tbl (actual rows=67 loops=3)
+ Filter: ((fld_1 = 1) AND (fld_2 = 0))
+ Rows Removed by Filter: 6666600
+(6 rows)
+
+CREATE STATISTICS my_tbl_fld_1_fld_2 ON fld_1, fld_2 FROM my_tbl;
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF) SELECT * FROM my_tbl WHERE fld_1 = 1 AND fld_2 = 0;
+ QUERY PLAN
+------------------------------------------------------------
+ Gather (actual rows=200 loops=1)
+ Workers Planned: 2
+ Workers Launched: 2
+ -> Parallel Seq Scan on my_tbl (actual rows=67 loops=3)
+ Filter: ((fld_1 = 1) AND (fld_2 = 0))
+ Rows Removed by Filter: 6666600
+(6 rows)
+
diff --git a/contrib/pg_stat_advisor/meson.build b/contrib/pg_stat_advisor/meson.build
new file mode 100644
index 0000000000..292917c025
--- /dev/null
+++ b/contrib/pg_stat_advisor/meson.build
@@ -0,0 +1,30 @@
+# Copyright (c) 2022-2024, PostgreSQL Global Development Group
+
+pg_stat_advisor_sources = files(
+ 'pg_stat_advisor.c',
+)
+
+if host_system == 'windows'
+ pg_stat_advisor_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+ '--NAME', 'pg_stat_advsior',
+ '--FILEDESC', 'pg_stat_advisor - suggestion of creating statistics',])
+endif
+
+pg_stat_advisor = shared_module('pg_stat_advisor',
+ pg_stat_advisor_sources,
+ kwargs: contrib_mod_args + {
+ 'dependencies': contrib_mod_args['dependencies'],
+ },
+)
+contrib_targets += pg_stat_advisor
+
+tests += {
+ 'name': 'pg_stat_advisor',
+ 'sd': meson.current_source_dir(),
+ 'bd': meson.current_build_dir(),
+ 'regress': {
+ 'sql': [
+ 'pg_stat_advisor',
+ ],
+ },
+}
diff --git a/contrib/pg_stat_advisor/pg_stat_advisor.c b/contrib/pg_stat_advisor/pg_stat_advisor.c
new file mode 100644
index 0000000000..ea391ec886
--- /dev/null
+++ b/contrib/pg_stat_advisor/pg_stat_advisor.c
@@ -0,0 +1,560 @@
+#include "postgres.h"
+
+#include "access/hash.h"
+#include "access/table.h"
+#include "catalog/pg_statistic_ext.h"
+#include "commands/explain.h"
+#include "executor/spi.h"
+#include "optimizer/optimizer.h"
+#include "parser/parsetree.h"
+#include "statistics/statistics.h"
+#include "utils/guc.h"
+#include "utils/syscache.h"
+#include "utils/lsyscache.h"
+
+PG_MODULE_MAGIC;
+
+/* GUC variables */
+static double pg_stat_advisor_suggest_statistics_threshold = 1.0;
+static double pg_stat_advisor_analyze_scale_factor = 0.0;
+
+/* Current nesting depth of ExecutorRun calls */
+static int nesting_level = 0;
+
+/* Is the current top-level query to be sampled? */
+#define pg_stat_advisor_enabled() (nesting_level == 0)
+
+void _PG_init(void);
+
+/* Saved hook values in case of unload */
+static ExecutorStart_hook_type prev_ExecutorStart = NULL;
+static ExecutorRun_hook_type prev_ExecutorRun = NULL;
+static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
+static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
+
+static void explain_ExecutorStart(QueryDesc *queryDesc, int eflags);
+static void explain_ExecutorRun(QueryDesc *queryDesc,
+ ScanDirection direction,
+ uint64 count, bool execute_once);
+static void explain_ExecutorFinish(QueryDesc *queryDesc);
+static void explain_ExecutorEnd(QueryDesc *queryDesc);
+
+static void SuggestMultiColumnStatisticsForNode(PlanState *planstate, ExplainState *es);
+bool UpdateStatistics(char *rel_name);
+
+/*
+ * Module load callback
+ */
+void
+_PG_init(void)
+{
+ /* Define custom GUC variables. */
+ DefineCustomRealVariable("pg_stat_advisor.suggest_statistics_threshold",
+ "Set the threshold for estimated/actual rows",
+ "Zero disables suggestion of creating statistics",
+ &pg_stat_advisor_suggest_statistics_threshold,
+ 1.0,
+ 0.0,
+ 1.0,
+ PGC_SUSET,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
+ DefineCustomRealVariable("pg_stat_advisor.analyze_scale_factor",
+ "Set the threshold for estimated/actual rows",
+ "Zero disables suggestion of updatinging statistics",
+ &pg_stat_advisor_analyze_scale_factor,
+ 0.0,
+ 0.0,
+ 1.0,
+ PGC_SUSET,
+ 0,
+ NULL,
+ NULL,
+ NULL);
+
+ MarkGUCPrefixReserved("pg_stat_advisor");
+
+ /* Install hooks. */
+ prev_ExecutorStart = ExecutorStart_hook;
+ ExecutorStart_hook = explain_ExecutorStart;
+ prev_ExecutorRun = ExecutorRun_hook;
+ ExecutorRun_hook = explain_ExecutorRun;
+ prev_ExecutorFinish = ExecutorFinish_hook;
+ ExecutorFinish_hook = explain_ExecutorFinish;
+ prev_ExecutorEnd = ExecutorEnd_hook;
+ ExecutorEnd_hook = explain_ExecutorEnd;
+}
+
+
+/*
+ * ExecutorStart hook: start up logging if needed
+ */
+static void
+explain_ExecutorStart(QueryDesc *queryDesc, int eflags)
+{
+ if (prev_ExecutorStart)
+ prev_ExecutorStart(queryDesc, eflags);
+ else
+ standard_ExecutorStart(queryDesc, eflags);
+
+ /*
+ * Set up to track total elapsed time in ExecutorRun. Make sure the
+ * space is allocated in the per-query context so it will go away at
+ * ExecutorEnd.
+ */
+ if (pg_stat_advisor_enabled() && queryDesc->totaltime == NULL)
+ {
+ MemoryContext oldcxt;
+
+ oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
+ queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
+ MemoryContextSwitchTo(oldcxt);
+ }
+}
+
+/*
+ * ExecutorRun hook: all we need do is track nesting depth
+ */
+static void
+explain_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction,
+ uint64 count, bool execute_once)
+{
+ nesting_level++;
+ PG_TRY();
+ {
+ if (prev_ExecutorRun)
+ prev_ExecutorRun(queryDesc, direction, count, execute_once);
+ else
+ standard_ExecutorRun(queryDesc, direction, count, execute_once);
+ }
+ PG_FINALLY();
+ {
+ nesting_level--;
+ }
+ PG_END_TRY();
+}
+
+/*
+ * ExecutorFinish hook: all we need do is track nesting depth
+ */
+static void
+explain_ExecutorFinish(QueryDesc *queryDesc)
+{
+ nesting_level++;
+ PG_TRY();
+ {
+ if (prev_ExecutorFinish)
+ prev_ExecutorFinish(queryDesc);
+ else
+ standard_ExecutorFinish(queryDesc);
+ }
+ PG_FINALLY();
+ {
+ nesting_level--;
+ }
+ PG_END_TRY();
+}
+
+/*
+ * ExecutorEnd hook: log results if needed
+ */
+static void
+explain_ExecutorEnd(QueryDesc *queryDesc)
+{
+ if (queryDesc->totaltime && pg_stat_advisor_enabled())
+ {
+ MemoryContext oldcxt;
+ ExplainState *es;
+
+ /*
+ * Make sure we operate in the per-query context, so any cruft will be
+ * discarded later during ExecutorEnd.
+ */
+ oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
+
+ /*
+ * Make sure stats accumulation is done. (Note: it's okay if several
+ * levels of hook all do this.)
+ */
+ InstrEndLoop(queryDesc->totaltime);
+
+ es = NewExplainState();
+
+ es->analyze = queryDesc->instrument_options;
+
+ ExplainBeginOutput(es);
+ ExplainQueryText(es, queryDesc);
+ ExplainPrintPlan(es, queryDesc);
+ if (es->analyze)
+ ExplainPrintTriggers(es, queryDesc);
+ if (es->costs)
+ ExplainPrintJITSummary(es, queryDesc);
+ ExplainEndOutput(es);
+
+ /* Entry to check */
+ if (!IsParallelWorker())
+ SuggestMultiColumnStatisticsForNode(queryDesc->planstate, es);
+
+ MemoryContextSwitchTo(oldcxt);
+ }
+
+ if (prev_ExecutorEnd)
+ prev_ExecutorEnd(queryDesc);
+ else
+ standard_ExecutorEnd(queryDesc);
+}
+
+/**
+ * Comparator used to sort Vars by name
+ */
+static int
+vars_list_comparator(const ListCell *a, const ListCell *b)
+{
+ char *va = strVal(linitial(((ColumnRef *) lfirst(a))->fields));
+ char *vb = strVal(linitial(((ColumnRef *) lfirst(b))->fields));
+
+ return strcmp(va, vb);
+}
+
+/**
+ * Suggest statistics for specified subplans.
+ */
+static void
+SuggestMultiColumnStatisticsForSubPlans(List *plans, ExplainState *es)
+{
+ ListCell *lst;
+
+ foreach(lst, plans)
+ {
+ SubPlanState *sps = (SubPlanState *) lfirst(lst);
+
+ SuggestMultiColumnStatisticsForNode(sps->planstate, es);
+ }
+}
+
+/**
+ * Suggest statistics for plan subnodes.
+ */
+static void
+SuggestMultiColumnStatisticsForMemberNodes(PlanState **planstates, int nsubnodes,
+ ExplainState *es)
+{
+ int j;
+
+ for (j = 0; j < nsubnodes; j++)
+ SuggestMultiColumnStatisticsForNode(planstates[j], es);
+}
+
+/*
+ * UpdateStatistics --- check entries in pg_statistic of rel_name
+ */
+bool
+UpdateStatistics(char *rel_name)
+{
+ int ret, ntuples;
+ bool result = false;
+ char sqlcmd[256];
+
+ if ((ret = SPI_connect()) < 0)
+ {
+ elog(ERROR, "SPI_connect failed: error code %d", ret);
+ }
+
+ snprintf(sqlcmd, sizeof(sqlcmd), "SELECT n_live_tup, n_mod_since_analyze FROM pg_stat_all_tables WHERE relname = '%s';", rel_name);
+
+ ret = SPI_execute(sqlcmd, true, 0);
+ if (ret != SPI_OK_SELECT)
+ elog(ERROR, "SPI_execute failed: error code %d", ret);
+
+ ntuples = SPI_processed;
+
+ if (ntuples > 0)
+ {
+ SPITupleTable *tuptable = SPI_tuptable;
+ TupleDesc tupdesc = SPI_tuptable->tupdesc;
+ HeapTuple tuple = tuptable->vals[0];
+ bool is_null;
+
+ double n_live_tup = (double)DatumGetInt64(SPI_getbinval(tuple, tupdesc, 1, &is_null));
+ double n_mod_since_analyze = (double)DatumGetInt64(SPI_getbinval(tuple, tupdesc, 2, &is_null));
+
+ /* Criteria of suggestion's message about ANALYZE */
+ if (n_mod_since_analyze / n_live_tup > pg_stat_advisor_analyze_scale_factor)
+ result = true;
+ }
+
+ SPI_finish();
+
+ return result;
+}
+
+/**
+ * Find extended statistics from 'relation_oid' relation on colmap
+ */
+static bool
+FindExtendedStatisticsOnVars(Oid *relation_oid, Bitmapset *colmap)
+{
+ bool isExistExtendedStatistics = false;
+ ListCell *oid;
+
+ /* Receive all extended statistics for current relation */
+ Relation pg_stextdata = table_open(*relation_oid, RowExclusiveLock);
+ List *statistics = RelationGetStatExtList(pg_stextdata);
+
+ foreach(oid, statistics)
+ {
+ Oid statOid = lfirst_oid(oid);
+ Form_pg_statistic_ext staForm;
+ HeapTuple htup;
+ Bitmapset *keys = NULL;
+ int i;
+
+ /* Read from pg_statistic_ext */
+ htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid));
+ if (!HeapTupleIsValid(htup))
+ elog(ERROR, "cache lookup failed for statistics object %u", statOid);
+ staForm = (Form_pg_statistic_ext) GETSTRUCT(htup);
+
+ for (i = 0; i < staForm->stxkeys.dim1; i++)
+ keys = bms_add_member(keys, staForm->stxkeys.values[i]);
+
+ /* If we have statistics for current columns */
+ if (!bms_compare(keys, colmap))
+ {
+ isExistExtendedStatistics = true;
+
+ elog(DEBUG1, "pg_stat_advisor: found statistics for '%s' relation - '%s'",
+ get_rel_name(*relation_oid),
+ get_rel_name(statOid));
+
+ ReleaseSysCache(htup);
+ bms_free(keys);
+
+ break;
+ }
+
+ ReleaseSysCache(htup);
+ bms_free(keys);
+ }
+
+ list_free(statistics);
+ table_close(pg_stextdata, RowExclusiveLock);
+
+ return isExistExtendedStatistics;
+}
+
+/**
+ * Suggest statistics for qual
+ */
+static void
+SuggestMultiColumnStatisticsForQual(List *qual, ExplainState *es)
+{
+ List *vars = NULL;
+ ListCell *lc;
+
+ /* Extract vars from all quals */
+ foreach(lc, qual)
+ {
+ Node *node = (Node *) lfirst(lc);
+
+ if (IsA(node, RestrictInfo))
+ node = (Node *) ((RestrictInfo *) node)->clause;
+ vars = list_concat(vars, pull_vars_of_level(node, 0));
+ }
+
+ /* Loop until we considered all vars */
+ while (vars != NULL)
+ {
+ ListCell *cell;
+ List *cols = NULL;
+ Index relno = 0;
+ Bitmapset *colmap = NULL;
+ RangeTblEntry *rte;
+ char *rel_name;
+
+ /* Contruct list of unique vars */
+ foreach(cell, vars)
+ {
+ Node *node = (Node *) lfirst(cell);
+
+ if (IsA(node, Var))
+ {
+ Var *var = (Var *) node;
+ int varno = IS_SPECIAL_VARNO(var->varno) ? var->varnosyn : var->varno;
+
+ if (cols == NULL || varno == relno)
+ {
+ int varattno = IS_SPECIAL_VARNO(var->varno) ? var->varattnosyn : var->varattno;
+
+ relno = varno;
+ if (var->varattno > 0 &&
+ !bms_is_member(varattno, colmap) &&
+ varno >= 1 && /* not synthetic var */
+ varno <= list_length(es->rtable) &&
+ list_length(cols) < STATS_MAX_DIMENSIONS)
+ {
+ rte = rt_fetch(varno, es->rtable);
+
+ if (rte->rtekind == RTE_RELATION)
+ {
+ ColumnRef *col = makeNode(ColumnRef);
+ char *colname = get_rte_attribute_name(rte, varattno);
+
+ col->fields = list_make1(makeString(colname));
+ cols = lappend(cols, col);
+ colmap = bms_add_member(colmap, varattno);
+ }
+ }
+ }
+ else
+ {
+ continue;
+ }
+ }
+ vars = foreach_delete_current(vars, cell);
+ }
+
+ rte = rt_fetch(relno, es->rtable);
+ rel_name = get_rel_name(rte->relid);
+ if (UpdateStatistics(rel_name))
+ {
+ ereport(NOTICE, (errmsg("pg_stat_advisor suggestion: 'ANALYZE %s'", rel_name),
+ errhidestmt(true)));
+
+ bms_free(colmap);
+ return;
+ }
+
+ /* To suggest statitics we need to have at least 2 columns */
+ if (list_length(cols) >= 2)
+ {
+ char *stat_name = rel_name;
+ char *create_stat_stmt = (char *) "";
+ char const *sep = "ON";
+ size_t name_len;
+
+ /* Sort variables by name */
+ list_sort(cols, vars_list_comparator);
+
+ /*
+ * Construct name for statistic by concatenating relation name
+ * with all columns
+ */
+ foreach(cell, cols)
+ {
+ char *col_name = strVal(linitial(((ColumnRef *) lfirst(cell))->fields));
+
+ stat_name = psprintf("%s_%s", stat_name, col_name);
+ create_stat_stmt = psprintf("%s%s %s", create_stat_stmt, sep, col_name);
+ sep = ",";
+ }
+
+ name_len = strlen(stat_name);
+ /* Truncate name if it doesn't fit in NameData */
+ if (name_len >= NAMEDATALEN)
+ stat_name = psprintf("%.*s_%08x", NAMEDATALEN - 10, stat_name, (unsigned) hash_any((uint8 *) stat_name, name_len));
+
+ if (!FindExtendedStatisticsOnVars(&rte->relid, colmap))
+ {
+ ereport(NOTICE, (errmsg("pg_stat_advisor suggestion: CREATE STATISTICS %s %s FROM %s",
+ stat_name, create_stat_stmt, rel_name),
+ errhidestmt(true)));
+ }
+ }
+
+ bms_free(colmap);
+ }
+}
+
+/**
+ * Suggest statistics for node
+ */
+static void
+SuggestMultiColumnStatisticsForNode(PlanState *planstate, ExplainState *es)
+{
+ Plan *plan = planstate->plan;
+
+ /* NOTE: Criteria of suggestion's message about CREATE STATISTICS */
+ if (planstate->instrument && plan->plan_rows != 0 &&
+ plan->plan_rows / planstate->instrument->ntuples < pg_stat_advisor_suggest_statistics_threshold)
+ {
+ elog(DEBUG1, "pg_stat_advisor: suggest statistics due to following parameters \
+ estimated=%f, actual=%f, ratio=%f: plan=%s",
+ plan->plan_rows,
+ planstate->instrument->ntuples,
+ plan->plan_rows / planstate->instrument->ntuples,
+ nodeToString(plan));
+ /* quals, sort keys, etc */
+ switch (nodeTag(plan))
+ {
+ case T_IndexScan:
+ SuggestMultiColumnStatisticsForQual(((IndexScan *) plan)->indexqualorig, es);
+ break;
+ case T_IndexOnlyScan:
+ SuggestMultiColumnStatisticsForQual(((IndexOnlyScan *) plan)->indexqual, es);
+ break;
+ case T_BitmapIndexScan:
+ SuggestMultiColumnStatisticsForQual(((BitmapIndexScan *) plan)->indexqualorig, es);
+ break;
+ case T_NestLoop:
+ SuggestMultiColumnStatisticsForQual(((NestLoop *) plan)->join.joinqual, es);
+ break;
+ case T_MergeJoin:
+ SuggestMultiColumnStatisticsForQual(((MergeJoin *) plan)->mergeclauses, es);
+ SuggestMultiColumnStatisticsForQual(((MergeJoin *) plan)->join.joinqual, es);
+ break;
+ case T_HashJoin:
+ SuggestMultiColumnStatisticsForQual(((HashJoin *) plan)->hashclauses, es);
+ SuggestMultiColumnStatisticsForQual(((HashJoin *) plan)->join.joinqual, es);
+ break;
+ default:
+ break;
+ }
+ SuggestMultiColumnStatisticsForQual(plan->qual, es);
+ }
+
+ /* initPlan-s */
+ if (planstate->initPlan)
+ SuggestMultiColumnStatisticsForSubPlans(planstate->initPlan, es);
+
+ /* lefttree */
+ if (outerPlanState(planstate))
+ SuggestMultiColumnStatisticsForNode(outerPlanState(planstate), es);
+
+ /* righttree */
+ if (innerPlanState(planstate))
+ SuggestMultiColumnStatisticsForNode(innerPlanState(planstate), es);
+
+ /* special child plans */
+ switch (nodeTag(plan))
+ {
+ case T_Append:
+ SuggestMultiColumnStatisticsForMemberNodes(((AppendState *) planstate)->appendplans,
+ ((AppendState *) planstate)->as_nplans,
+ es);
+ break;
+ case T_MergeAppend:
+ SuggestMultiColumnStatisticsForMemberNodes(((MergeAppendState *) planstate)->mergeplans,
+ ((MergeAppendState *) planstate)->ms_nplans,
+ es);
+ break;
+ case T_BitmapAnd:
+ SuggestMultiColumnStatisticsForMemberNodes(((BitmapAndState *) planstate)->bitmapplans,
+ ((BitmapAndState *) planstate)->nplans,
+ es);
+ break;
+ case T_BitmapOr:
+ SuggestMultiColumnStatisticsForMemberNodes(((BitmapOrState *) planstate)->bitmapplans,
+ ((BitmapOrState *) planstate)->nplans,
+ es);
+ break;
+ case T_SubqueryScan:
+ SuggestMultiColumnStatisticsForNode(((SubqueryScanState *) planstate)->subplan, es);
+ break;
+ default:
+ break;
+ }
+}
diff --git a/contrib/pg_stat_advisor/sql/pg_stat_advisor.sql b/contrib/pg_stat_advisor/sql/pg_stat_advisor.sql
new file mode 100644
index 0000000000..a1ac29329d
--- /dev/null
+++ b/contrib/pg_stat_advisor/sql/pg_stat_advisor.sql
@@ -0,0 +1,24 @@
+LOAD 'pg_stat_advisor';
+SET pg_stat_advisor.analyze_scale_factor = 0.4;
+SET pg_stat_advisor.suggest_statistics_threshold = 0.11;
+
+CREATE TABLE my_tbl(fld_1 INTEGER, fld_2 BIGINT) WITH (autovacuum_enabled = false);
+INSERT INTO my_tbl (fld_1, fld_2)
+SELECT
+ i/100 as fld_1,
+ i/500 as fld_2
+FROM generate_series(1, 10000000) s(i);
+ANALYZE my_tbl;
+INSERT INTO my_tbl (fld_1, fld_2)
+SELECT
+ i/100 as fld_1,
+ i/500 as fld_2
+FROM generate_series(1, 10000000) s(i);
+
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF) SELECT * FROM my_tbl WHERE fld_1 = 1 AND fld_2 = 0;
+
+ANALYZE my_tbl;
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF) SELECT * FROM my_tbl WHERE fld_1 = 1 AND fld_2 = 0;
+
+CREATE STATISTICS my_tbl_fld_1_fld_2 ON fld_1, fld_2 FROM my_tbl;
+EXPLAIN (ANALYZE, COSTS OFF, SUMMARY OFF, TIMING OFF) SELECT * FROM my_tbl WHERE fld_1 = 1 AND fld_2 = 0;
\ No newline at end of file
--
2.34.1