From 223037eaff1a5b008be3230f713875a4b05f0453 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas.vondra@postgresql.org>
Date: Sat, 22 Oct 2022 00:06:28 +0200
Subject: [PATCH 5/6] wip: adjust watermark step

Look at available statistics - number of possible watermark values,
number of rows, work_mem, etc. and pick a good watermark_step value.

To calculate step using statistics, set the GUC to 0:

   SET brinsort_watermark_step = 0;
---
 src/backend/commands/explain.c          |  4 ++
 src/backend/executor/nodeBrinSort.c     | 20 +++----
 src/backend/optimizer/plan/createplan.c | 70 +++++++++++++++++++++++++
 src/backend/utils/misc/guc_tables.c     |  2 +-
 src/include/nodes/execnodes.h           |  1 +
 src/include/nodes/plannodes.h           |  3 ++
 6 files changed, 86 insertions(+), 14 deletions(-)

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index c5ace02a10d..114846ebe0b 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -2439,6 +2439,10 @@ static void
 show_brinsort_stats(BrinSortState *sortstate, List *ancestors, ExplainState *es)
 {
 	BrinSortStats  *stats = &sortstate->bs_stats;
+	BrinSort   *plan = (BrinSort *) sortstate->ss.ps.plan;
+
+	ExplainPropertyInteger("Step", NULL, (int64)
+						   plan->watermark_step, es);
 
 	if (stats->sort_count > 0)
 	{
diff --git a/src/backend/executor/nodeBrinSort.c b/src/backend/executor/nodeBrinSort.c
index 3563bf3c1ad..2f8e92753cd 100644
--- a/src/backend/executor/nodeBrinSort.c
+++ b/src/backend/executor/nodeBrinSort.c
@@ -257,14 +257,6 @@ static void ExecInitBrinSortRanges(BrinSort *node, BrinSortState *planstate);
 
 #define BRINSORT_DEBUG
 
-/*
- * How many distinct minval values to look forward for the next watermark?
- *
- * The smallest step we can do is 1, which means the immediately following
- * (while distinct) minval.
- */
-int brinsort_watermark_step = 1;
-
 /* do various consistency checks */
 static void
 AssertCheckRanges(BrinSortState *node)
@@ -365,9 +357,11 @@ brinsort_end_tidscan(BrinSortState *node)
  * heuristics (measure past sorts and extrapolate).
  */
 static void
-brinsort_update_watermark(BrinSortState *node, bool first, bool asc, int steps)
+brinsort_update_watermark(BrinSortState *node, bool first, bool asc)
 {
 	int		cmp;
+	BrinSort   *plan = (BrinSort *) node->ss.ps.plan;
+	int			steps = plan->watermark_step;
 
 	/* assume we haven't found a watermark */
 	bool	found = false;
@@ -918,7 +912,7 @@ IndexNext(BrinSortState *node)
 					node->bs_phase = BRINSORT_LOAD_RANGE;
 
 					/* set the first watermark */
-					brinsort_update_watermark(node, true, asc, brinsort_watermark_step);
+					brinsort_update_watermark(node, true, asc);
 				}
 
 				break;
@@ -978,7 +972,7 @@ IndexNext(BrinSortState *node)
 																			  stats.spaceUsed);
 							}
 
-							elog(DEBUG1, "method: %s  space: %ld kB (%s)",
+							elog(WARNING, "method: %s  space: %ld kB (%s)",
 								 tuplesort_method_name(stats.sortMethod),
 								 stats.spaceUsed,
 								 tuplesort_space_type_name(stats.spaceType));
@@ -1028,7 +1022,7 @@ IndexNext(BrinSortState *node)
 				{
 					/* updte the watermark and try reading more ranges */
 					node->bs_phase = BRINSORT_LOAD_RANGE;
-					brinsort_update_watermark(node, false, asc, brinsort_watermark_step);
+					brinsort_update_watermark(node, false, asc);
 				}
 
 				break;
@@ -1051,7 +1045,7 @@ IndexNext(BrinSortState *node)
 							{
 								brinsort_rescan(node);
 								node->bs_phase = BRINSORT_LOAD_RANGE;
-								brinsort_update_watermark(node, true, asc, brinsort_watermark_step);
+								brinsort_update_watermark(node, true, asc);
 							}
 							else
 								node->bs_phase = BRINSORT_FINISHED;
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 395c632f430..997c272dec0 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -18,6 +18,7 @@
 
 #include <math.h>
 
+#include "access/brin.h"
 #include "access/sysattr.h"
 #include "catalog/pg_class.h"
 #include "foreign/fdwapi.h"
@@ -321,6 +322,14 @@ static GatherMerge *create_gather_merge_plan(PlannerInfo *root,
 											 GatherMergePath *best_path);
 
 
+/*
+ * How many distinct minval values to look forward for the next watermark?
+ *
+ * The smallest step we can do is 1, which means the immediately following
+ * (while distinct) minval.
+ */
+int brinsort_watermark_step = 0;
+
 /*
  * create_plan
  *	  Creates the access plan for a query by recursively processing the
@@ -3340,6 +3349,67 @@ create_brinsort_plan(PlannerInfo *root,
 
 	copy_generic_path_info(&brinsort_plan->scan.plan, &best_path->ipath.path);
 
+	/*
+	 * determine watermark step (how fast to advance)
+	 *
+	 * If the brinsort_watermark_step is set to a non-zero value, we just use
+	 * that value directly. Otherwise we pick a value using some simple
+	 * heuristics heuristics - we don't want the rows to exceed work_mem, and
+	 * we leave a bit slack (because we're adding batches of rows, not row
+	 * by row).
+	 *
+	 * This has a weakness, because it assumes we incrementally add the same
+	 * number of rows into the "sort" set - but imagine very wide overlapping
+	 * ranges (e.g. random data on the same domain). Most of them will have
+	 * about the same minval, so the sort grows only very slowly. Until the
+	 * very last range, that removes the watermark and only then do most of
+	 * the rows get to the tuplesort.
+	 *
+	 * XXX But maybe we can look at the other statistics we have, like number
+	 * of overlaps and average range selectivity (% of tuples matching), and
+	 * deduce something from that?
+	 *
+	 * XXX Could we maybe adjust the watermark step adaptively at runtime?
+	 * That is, when we get to the "sort" step, maybe check how many rows
+	 * are there, and if there are only few then try increasing the step?
+	 */
+	brinsort_plan->watermark_step = brinsort_watermark_step;
+
+	if (brinsort_plan->watermark_step == 0)
+	{
+		BrinMinmaxStats *amstats;
+
+		/**/
+		Cardinality		rows = brinsort_plan->scan.plan.plan_rows;
+
+		/* estimate rowsize in the tuplesort */
+		int				width = brinsort_plan->scan.plan.plan_width;
+		int				tupwidth = (MAXALIGN(width) + MAXALIGN(SizeofHeapTupleHeader));
+
+		/* Don't overflow work_mem (use only half to absorb variations. */
+		int				maxrows = (work_mem * 1024L / tupwidth / 2);
+
+		/* If this is a LIMIT query, aim only for the required number of rows. */
+		if (root->limit_tuples > 0)
+			maxrows = Min(maxrows, root->limit_tuples);
+
+		/* FIXME hard-coded attnum */
+		amstats = (BrinMinmaxStats *) get_attindexam(brinsort_plan->indexid, 1);
+
+		if (amstats)
+		{
+			double	rows_per_step = rows / amstats->minval_ndistinct;
+			elog(WARNING, "rows_per_step = %f", rows_per_step);
+
+			brinsort_plan->watermark_step = (int) (maxrows / rows_per_step);
+
+			elog(WARNING, "calculated step = %d", brinsort_plan->watermark_step);
+		}
+
+		brinsort_plan->watermark_step = Max(brinsort_plan->watermark_step, 1);
+		brinsort_plan->watermark_step = Min(brinsort_plan->watermark_step, 1024);
+	}
+
 	return brinsort_plan;
 }
 
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index c7abdade496..9ab51a22db7 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -3432,7 +3432,7 @@ struct config_int ConfigureNamesInt[] =
 			NULL
 		},
 		&brinsort_watermark_step,
-		1, 1, INT_MAX,
+		0, 0, INT_MAX,
 		NULL, NULL, NULL
 	},
 
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index e8f7b25549f..86879bed2f4 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1671,6 +1671,7 @@ typedef struct BrinSortState
 	BrinRangeScanDesc *bs_scan;
 	BrinRange	   *bs_range;
 	ExprState	   *bs_qual;
+	int				bs_watermark_step;
 	Datum			bs_watermark;
 	bool			bs_watermark_set;
 	BrinSortPhase	bs_phase;
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index c4ef5362acc..9f9ad97ac2d 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -519,6 +519,9 @@ typedef struct BrinSort
 	/* NULLS FIRST/LAST directions */
 	bool	   *nullsFirst pg_node_attr(array_size(numCols));
 
+	/* number of watermark steps to make */
+	int			watermark_step;
+
 } BrinSort;
 
 /* ----------------
-- 
2.37.3

