From 44d407ff0e3485676d435ec5cd54898b44f439ff Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Mon, 25 Nov 2024 12:37:33 +0700
Subject: [PATCH v1] Branchless Lomuto partitioning

POC that only works on datatypes that use ssup_datum_signed_cmp.
Dev-only GUC needed because it can't yet handle NULLs or DESC.
---
 src/backend/utils/misc/guc_tables.c           |  12 ++
 .../utils/sort/part_right_branchless.h        |  21 +++
 src/backend/utils/sort/tuplesort.c            | 150 +++++++++++++++++-
 src/include/utils/guc.h                       |   1 +
 src/include/utils/tuplesort.h                 |   3 +
 5 files changed, 184 insertions(+), 3 deletions(-)
 create mode 100644 src/backend/utils/sort/part_right_branchless.h

diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 8a67f01200..9d69890d72 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -1709,6 +1709,18 @@ struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		/* XXX not for commit */
+		{"debug_branchless_sort", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("WIP testing of branchless sort techniques"),
+			NULL,
+			GUC_EXPLAIN
+		},
+		&debug_branchless_sort,
+		false,
+		NULL, NULL, NULL
+	},
+
 #ifdef TRACE_SYNCSCAN
 	/* this is undocumented because not exposed in a standard build */
 	{
diff --git a/src/backend/utils/sort/part_right_branchless.h b/src/backend/utils/sort/part_right_branchless.h
new file mode 100644
index 0000000000..37d93bf45f
--- /dev/null
+++ b/src/backend/utils/sort/part_right_branchless.h
@@ -0,0 +1,21 @@
+/*
+ * Based on psuedocode from https://orlp.net/blog/branchless-lomuto-partitioning/
+ *
+ * There is deliberately no include guard here.
+ */
+
+SortTuple pivot = v[0];
+size_t i = 0;
+size_t j = 0;
+
+while (j < n - 1)
+{
+	v[j] = v[i];
+	j += 1;
+	v[i] = v[j];
+	i += CMP_2WAY(v[i], pivot);
+}
+
+v[j] = v[i];
+v[i] = pivot;
+return &v[i];
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index c960cfa823..f71defe633 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -122,6 +122,7 @@
 
 /* GUC variables */
 bool		trace_sort = false;
+bool		debug_branchless_sort = false;	/* XXX not for commit */
 
 #ifdef DEBUG_BOUNDED_SORT
 bool		optimize_bounded_sort = true;
@@ -619,6 +620,134 @@ qsort_tuple_int32_compare(SortTuple *a, SortTuple *b, Tuplesortstate *state)
 #define ST_DEFINE
 #include "lib/sort_template.h"
 
+
+/* WIP: just a demo: sort tuples with nulls in the first key should be put in a separate array */
+#define SS_COMPARE(a, b) \
+	ApplySortComparator((a)->datum1, false, \
+						(b)->datum1, false, ssup)
+
+/* XXX: only works on first sort key */
+static pg_noinline SortTuple *
+med3(SortTuple *a,
+	 SortTuple *b,
+	 SortTuple *c, SortSupport ssup)
+{
+	return SS_COMPARE(a, b) < 0 ?
+		(SS_COMPARE(b, c) < 0 ? b : (SS_COMPARE(a, c) < 0 ? c : a))
+		: (SS_COMPARE(b, c) > 0 ? b : (SS_COMPARE(a, c) < 0 ? a : c));
+}
+
+static SortTuple *
+part_right_signed_asc(SortTuple *v, size_t n)
+{
+#define CMP_2WAY(v, pivot) DatumGetInt64((v).datum1) < DatumGetInt64((pivot).datum1)
+#include "part_right_branchless.h"
+#undef CMP_2WAY
+}
+
+static inline void
+swap(SortTuple *a, SortTuple *b)
+{
+	SortTuple	tmp = *a;
+
+	*a = *b;
+	*b = tmp;
+}
+
+/* WIP: proof of concept for one data type, with no tiebreaks or NULL handling */
+static void
+qsort_ssup_branchless(SortTuple *data, size_t n, Tuplesortstate *state)
+{
+	SortTuple  *a = data,
+			   *pl,
+			   *pm,
+			   *pn,
+			   *pivot_pos;
+	int			presorted;
+	SortSupportData *ssup = state->base.sortKeys;
+
+
+loop:
+	if (n < 2)
+		return;
+
+	CHECK_FOR_INTERRUPTS();
+
+	if (n < 7)
+	{
+		/* TODO: we should use moves rather than swaps */
+		/*
+		 * TODO: fastpath on front slice using ping-pong merge or sorting
+		 * network
+		 */
+		/* WIP: needs to be aware of tiebreak comparators */
+		for (pm = a + 1; pm < a + n;
+			 pm += 1)
+			for (pl = pm; pl > a && SS_COMPARE(pl - 1, pl) > 0;
+				 pl -= 1)
+				swap(pl, pl - 1);
+		return;
+	}
+
+	/*
+	 * TODO: Branchless Lomuto partitioning is said to not work well with
+	 * descending inputs, so add a similar check for descending input. We
+	 * could probably hoist these out of the loop and only check once at the
+	 * beginning.
+	 */
+	presorted = 1;
+	for (pm = a + 1; pm < a + n;
+		 pm += 1)
+	{
+		CHECK_FOR_INTERRUPTS();
+		if (SS_COMPARE(pm - 1, pm) > 0)
+		{
+			presorted = 0;
+			break;
+		}
+	}
+	if (presorted)
+		return;
+
+	pm = a + (n / 2);
+	if (n > 7)
+	{
+		pl = a;
+		pn = a + (n - 1);
+		if (n > 40)
+		{
+			size_t		d = (n / 8);
+
+			pl = med3(pl, pl + d, pl + 2 * d, ssup);
+			pm = med3(pm - d, pm, pm + d, ssup);
+			pn = med3(pn - 2 * d, pn - d, pn, ssup);
+		}
+		pm = med3(pl, pm, pn, ssup);
+	}
+
+	/* place pivot at the front */
+	swap(a, pm);
+
+	/*
+	 * WIP: check if pivot is same as previous pivot and do "partition left"
+	 * to handle duplicates. If the abbrev key is not authoritative, we need
+	 * to recurse with a sort using the tiebreak comparator
+	 */
+
+	pivot_pos = state->base.partition_right(a, n);
+
+	/* WIP: Keep recursion simple for now. */
+
+	/* Recurse on left partition... */
+	qsort_ssup_branchless(a, pivot_pos - a, state);
+
+	/* ..., then iterate on right partition to save stack space */
+	n -= pivot_pos + 1 - a;
+	a = pivot_pos + 1;
+	goto loop;
+}
+
+
 /*
  *		tuplesort_begin_xxx
  *
@@ -2695,9 +2824,24 @@ tuplesort_sort_memtuples(Tuplesortstate *state)
 #if SIZEOF_DATUM >= 8
 			else if (state->base.sortKeys[0].comparator == ssup_datum_signed_cmp)
 			{
-				qsort_tuple_signed(state->memtuples,
-								   state->memtupcount,
-								   state);
+				SortTuple  *pm;
+
+				if (debug_branchless_sort)
+				{
+					state->base.partition_right = part_right_signed_asc;
+					qsort_ssup_branchless(state->memtuples,
+										  state->memtupcount,
+										  state);
+				}
+				else
+					qsort_tuple_signed(state->memtuples,
+									   state->memtupcount,
+									   state);
+
+				for (pm = state->memtuples + 1;
+					 pm < state->memtuples + state->memtupcount;
+					 pm++)
+					Assert(COMPARETUP(state, pm - 1, pm) <= 0);
 				return;
 			}
 #endif
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index 840b0fe57f..33fe01c573 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -296,6 +296,7 @@ extern PGDLLIMPORT int tcp_user_timeout;
 extern PGDLLIMPORT char *role_string;
 extern PGDLLIMPORT bool in_hot_standby_guc;
 extern PGDLLIMPORT bool trace_sort;
+extern PGDLLIMPORT bool debug_branchless_sort; /* XXX not for commit */
 
 #ifdef DEBUG_BOUNDED_SORT
 extern PGDLLIMPORT bool optimize_bounded_sort;
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index cde83f6201..936f21b0db 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -180,6 +180,9 @@ typedef struct
 	 */
 	SortTupleComparator comparetup_tiebreak;
 
+	/* Optional partition functions for single-instruction comparators */
+	SortTuple*		(*partition_right) (SortTuple *begin, size_t n);
+
 	/*
 	 * Alter datum1 representation in the SortTuple's array back from the
 	 * abbreviated key to the first column value.
-- 
2.47.0

