From ad9f73005b425ff15f2ff2687a23f2b024db56b2 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas.vondra@postgresql.org>
Date: Thu, 20 Oct 2022 13:03:00 +0200
Subject: [PATCH 4/6] wip: multiple watermark steps

Allow incrementing the minval watermark faster, by skipping some minval
values. This allows sorting more data at once (instead of many tiny
sorts, which is inefficient). This also reduces the number of rows we
need to spill (and possibly transfer multiple times).

To use a different watermark step, use a new GUC:

  SET brinsort_watermark_step = 16
---
 src/backend/executor/nodeBrinSort.c | 59 ++++++++++++++++++++++++++---
 src/backend/utils/misc/guc_tables.c | 11 ++++++
 2 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/src/backend/executor/nodeBrinSort.c b/src/backend/executor/nodeBrinSort.c
index c7d417d6e57..3563bf3c1ad 100644
--- a/src/backend/executor/nodeBrinSort.c
+++ b/src/backend/executor/nodeBrinSort.c
@@ -257,6 +257,14 @@ static void ExecInitBrinSortRanges(BrinSort *node, BrinSortState *planstate);
 
 #define BRINSORT_DEBUG
 
+/*
+ * How many distinct minval values to look forward for the next watermark?
+ *
+ * The smallest step we can do is 1, which means the immediately following
+ * (while distinct) minval.
+ */
+int brinsort_watermark_step = 1;
+
 /* do various consistency checks */
 static void
 AssertCheckRanges(BrinSortState *node)
@@ -357,11 +365,24 @@ brinsort_end_tidscan(BrinSortState *node)
  * heuristics (measure past sorts and extrapolate).
  */
 static void
-brinsort_update_watermark(BrinSortState *node, bool asc)
+brinsort_update_watermark(BrinSortState *node, bool first, bool asc, int steps)
 {
 	int		cmp;
+
+	/* assume we haven't found a watermark */
 	bool	found = false;
 
+	Assert(steps > 0);
+
+	/*
+	 * If the watermark is not set, either this is the first call (in
+	 * which case we just use the first (or rather second) value.
+	 * Otherwise it means we've reached the end, so no point in looking
+	 * for more watermarks.
+	 */
+	if (!node->bs_watermark_set && !first)
+		return;
+
 	tuplesort_markpos(node->bs_scan->ranges);
 
 	while (tuplesort_gettupleslot(node->bs_scan->ranges, true, false, node->bs_scan->slot, NULL))
@@ -387,22 +408,48 @@ brinsort_update_watermark(BrinSortState *node, bool asc)
 		else
 			value = slot_getattr(node->bs_scan->slot, 7, &isnull);
 
+		/*
+		 * Has to be the first call (otherwise we would not get here, because we
+		 * terminate after bs_watermark_set gets flipped back to false), so we
+		 * just set the value. But we don't count this as a step, because that
+		 * just picks the first minval value, as we certainly need to do at least
+		 * one more step.
+		 *
+		 * XXX Actually, do we need to make another step? Maybe there are enough
+		 * not-summarized ranges? Although, we don't know what values are in
+		 * those, ranges, and with increasing data we might easily end up just
+		 * writing all of it into the spill tuplestore. So making one more step
+		 * seems like a better idea - we'll at lest be able to produce something
+		 * which is good for LIMIT queries.
+		 */
 		if (!node->bs_watermark_set)
 		{
+			Assert(first);
 			node->bs_watermark_set = true;
 			node->bs_watermark = value;
+			found = true;
 			continue;
 		}
 
 		cmp = ApplySortComparator(node->bs_watermark, false, value, false,
 								  &node->bs_sortsupport);
 
-		if (cmp < 0)
+		/*
+		 * Values should not decrease (or whatever the operator says, might
+		 * be a DESC sort).
+		 */
+		Assert(cmp <= 0);
+
+		if (cmp < 0)	/* new watermark value */
 		{
 			node->bs_watermark_set = true;
 			node->bs_watermark = value;
 			found = true;
-			break;
+
+			steps--;
+
+			if (steps == 0)
+				break;
 		}
 	}
 
@@ -871,7 +918,7 @@ IndexNext(BrinSortState *node)
 					node->bs_phase = BRINSORT_LOAD_RANGE;
 
 					/* set the first watermark */
-					brinsort_update_watermark(node, asc);
+					brinsort_update_watermark(node, true, asc, brinsort_watermark_step);
 				}
 
 				break;
@@ -981,7 +1028,7 @@ IndexNext(BrinSortState *node)
 				{
 					/* updte the watermark and try reading more ranges */
 					node->bs_phase = BRINSORT_LOAD_RANGE;
-					brinsort_update_watermark(node, asc);
+					brinsort_update_watermark(node, false, asc, brinsort_watermark_step);
 				}
 
 				break;
@@ -1004,7 +1051,7 @@ IndexNext(BrinSortState *node)
 							{
 								brinsort_rescan(node);
 								node->bs_phase = BRINSORT_LOAD_RANGE;
-								brinsort_update_watermark(node, asc);
+								brinsort_update_watermark(node, true, asc, brinsort_watermark_step);
 							}
 							else
 								node->bs_phase = BRINSORT_FINISHED;
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index a5ca3bd0cc4..c7abdade496 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -95,6 +95,7 @@ extern char *temp_tablespaces;
 extern bool ignore_checksum_failure;
 extern bool ignore_invalid_pages;
 extern bool synchronize_seqscans;
+extern int	brinsort_watermark_step;
 
 #ifdef TRACE_SYNCSCAN
 extern bool trace_syncscan;
@@ -3425,6 +3426,16 @@ struct config_int ConfigureNamesInt[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"brinsort_watermark_step", PGC_USERSET, DEVELOPER_OPTIONS,
+			gettext_noop("sets the step for brinsort watermark increments"),
+			NULL
+		},
+		&brinsort_watermark_step,
+		1, 1, INT_MAX,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL
-- 
2.37.3

