From 3afa1148fe17a5c47267853e4cb3cac03bd595b0 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas.vondra@postgresql.org>
Date: Sat, 22 Oct 2022 01:39:39 +0200
Subject: [PATCH 6/6] wip: adaptive watermark step

Another option it to adjust the watermark step based on past tuplesort
executions, and either increase or decrease the step, based on whether
the sort was in-memory or on-disk, etc.

To do this, set the GUC to -1:

  SET brinsort_watermark_step = -1;
---
 src/backend/executor/nodeBrinSort.c     | 51 +++++++++++++++++++++++--
 src/backend/optimizer/plan/createplan.c |  7 +---
 src/backend/utils/misc/guc_tables.c     |  2 +-
 src/backend/utils/sort/tuplesort.c      | 12 ++++++
 src/include/nodes/execnodes.h           |  2 +-
 src/include/utils/tuplesort.h           |  1 +
 6 files changed, 64 insertions(+), 11 deletions(-)

diff --git a/src/backend/executor/nodeBrinSort.c b/src/backend/executor/nodeBrinSort.c
index 2f8e92753cd..2bf75fd603a 100644
--- a/src/backend/executor/nodeBrinSort.c
+++ b/src/backend/executor/nodeBrinSort.c
@@ -255,6 +255,8 @@ static TupleTableSlot *IndexNext(BrinSortState *node);
 static bool IndexRecheck(BrinSortState *node, TupleTableSlot *slot);
 static void ExecInitBrinSortRanges(BrinSort *node, BrinSortState *planstate);
 
+extern int brinsort_watermark_step;
+
 #define BRINSORT_DEBUG
 
 /* do various consistency checks */
@@ -814,6 +816,42 @@ brinsort_rescan(BrinSortState *node)
 	tuplesort_rescan(node->bs_scan->ranges);
 }
 
+/*
+ * Look at the tuplesort statistics, and maybe increase or decrease the
+ * watermark step. If the last sort was in-memory, we decrease the step.
+ * If the sort was in-memory, but we used less than work_mem/3, increment
+ * the step value.
+ *
+ * XXX This should probably behave differently for LIMIT queries, so that
+ * we don't load too many rows unnecessarily. We already consider that in
+ * create_brinsort_plan, but maybe we should limit increments to the ste
+ * value here too - say, by tracking how many rows are we supposed to
+ * produce, and limiting the watermark so that we don't process too many
+ * rows in future steps.
+ *
+ * XXX We might also track the number of rows in the sort and space used,
+ * to calculate more accurate estimate of row width. And then use that to
+ * calculate number of rows that fit into work_mem. But the number of rows
+ * that go into tuplesort (per range) added would still remain fairly
+ * inaccurate, so not sure how good this woud be.
+ */
+static void
+brinsort_adjust_watermark_step(BrinSortState *node, TuplesortInstrumentation *stats)
+{
+	BrinSort   *plan = (BrinSort *) node->ss.ps.plan;
+
+	if (brinsort_watermark_step != -1)
+		return;
+
+	if (stats->spaceType == SORT_SPACE_TYPE_DISK)
+		plan->watermark_step--;
+	else if (stats->spaceUsed < work_mem / 3)
+		plan->watermark_step++;
+
+	plan->watermark_step = Max(1, plan->watermark_step);
+	plan->watermark_step = Min(1024, plan->watermark_step);
+}
+
 /* ----------------------------------------------------------------
  *		IndexNext
  *
@@ -948,15 +986,20 @@ IndexNext(BrinSortState *node)
 					 */
 					if (node->bs_tuplesortstate)
 					{
+						TuplesortInstrumentation stats;
+
+						tuplesort_reset_stats(node->bs_tuplesortstate);
+
 						tuplesort_performsort(node->bs_tuplesortstate);
 						node->bs_stats.sort_count++;
 
-#ifdef BRINSORT_DEBUG
-						{
-							TuplesortInstrumentation stats;
+						memset(&stats, 0, sizeof(TuplesortInstrumentation));
+						tuplesort_get_stats(node->bs_tuplesortstate, &stats);
 
-							tuplesort_get_stats(node->bs_tuplesortstate, &stats);
+						brinsort_adjust_watermark_step(node, &stats);
 
+#ifdef BRINSORT_DEBUG
+						{
 							if (stats.spaceType == SORT_SPACE_TYPE_DISK)
 							{
 								node->bs_stats.sort_count_on_disk++;
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 997c272dec0..dc0a3669df2 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -3375,7 +3375,7 @@ create_brinsort_plan(PlannerInfo *root,
 	 */
 	brinsort_plan->watermark_step = brinsort_watermark_step;
 
-	if (brinsort_plan->watermark_step == 0)
+	if (brinsort_plan->watermark_step <= 0)
 	{
 		BrinMinmaxStats *amstats;
 
@@ -3398,12 +3398,9 @@ create_brinsort_plan(PlannerInfo *root,
 
 		if (amstats)
 		{
-			double	rows_per_step = rows / amstats->minval_ndistinct;
-			elog(WARNING, "rows_per_step = %f", rows_per_step);
+			double	rows_per_step = Max(1.0, (rows / amstats->minval_ndistinct));
 
 			brinsort_plan->watermark_step = (int) (maxrows / rows_per_step);
-
-			elog(WARNING, "calculated step = %d", brinsort_plan->watermark_step);
 		}
 
 		brinsort_plan->watermark_step = Max(brinsort_plan->watermark_step, 1);
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 9ab51a22db7..b6d4186241f 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -3432,7 +3432,7 @@ struct config_int ConfigureNamesInt[] =
 			NULL
 		},
 		&brinsort_watermark_step,
-		0, 0, INT_MAX,
+		0, -1, INT_MAX,
 		NULL, NULL, NULL
 	},
 
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 416f02ba3cb..c61f27b6fa2 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -2574,6 +2574,18 @@ tuplesort_get_stats(Tuplesortstate *state,
 	}
 }
 
+/*
+ * tuplesort_reset_stats - reset summary statistics
+ *
+ * This can be called before tuplesort_performsort() starts.
+ */
+void
+tuplesort_reset_stats(Tuplesortstate *state)
+{
+	state->isMaxSpaceDisk = false;
+	state->maxSpace = 0;
+}
+
 /*
  * Convert TuplesortMethod to a string.
  */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 86879bed2f4..485b7b2eeb3 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1682,7 +1682,7 @@ typedef struct BrinSortState
 	 * We need two tuplesort instances - one for current range, one for
 	 * spill-over tuples from the overlapping ranges
 	 */
-	void		   *bs_tuplesortstate;
+	Tuplesortstate  *bs_tuplesortstate;
 	Tuplestorestate *bs_tuplestore;
 } BrinSortState;
 
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index 44412749906..897dfeb274f 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -367,6 +367,7 @@ extern void tuplesort_reset(Tuplesortstate *state);
 
 extern void tuplesort_get_stats(Tuplesortstate *state,
 								TuplesortInstrumentation *stats);
+extern void tuplesort_reset_stats(Tuplesortstate *state);
 extern const char *tuplesort_method_name(TuplesortMethod m);
 extern const char *tuplesort_space_type_name(TuplesortSpaceType t);
 
-- 
2.37.3

