From 976349a5abfe23d672650a611c3377638db06b46 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Thu, 20 Jun 2024 22:26:26 +0200
Subject: [PATCH v20240620 3/7] Remove the explicit pg_qsort in workers

We don't need to do the explicit sort before building the GIN tuple,
because the mergesort in GinBufferStoreTuple is already maintaining the
correct order (this was added in an earlier commit).

The comment also adds a field with the first TID, and modifies the
comparator to sort by it (for each key value). This helps workers to
build non-overlapping TID lists and simply append values instead of
having to do the actual mergesort to combine them. This is best-effort,
i.e. it's not guaranteed to eliminate the mergesort - in particular,
parallel scans are synchronized, and thus may start somewhere in the
middle of the table, and wrap around. In which case there may be very
wide list (with low/high TID values).

Note: There's an XXX comment with a couple ideas on how to improve this,
at the cost of more complexity.
---
 src/backend/access/gin/gininsert.c | 119 +++++++++++++++++++----------
 src/include/access/gin_tuple.h     |   8 ++
 2 files changed, 86 insertions(+), 41 deletions(-)

diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index 8d46b53c43b..77e5efc58dd 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -1114,12 +1114,6 @@ _gin_parallel_heapscan(GinBuildState *state)
 	return state->bs_reltuples;
 }
 
-static int
-tid_cmp(const void *a, const void *b)
-{
-	return ItemPointerCompare((ItemPointer) a, (ItemPointer) b);
-}
-
 /*
  * Buffer used to accumulate TIDs from multiple GinTuples for the same key.
  *
@@ -1152,17 +1146,21 @@ AssertCheckGinBuffer(GinBuffer *buffer)
 }
 
 static void
-AssertCheckItemPointers(ItemPointerData *items, int nitems, bool sorted)
+AssertCheckItemPointers(GinBuffer *buffer, bool sorted)
 {
 #ifdef USE_ASSERT_CHECKING
-	for (int i = 0; i < nitems; i++)
+	/* we should not have a buffer with no TIDs to sort */
+	Assert(buffer->items != NULL);
+	Assert(buffer->nitems > 0);
+
+	for (int i = 0; i < buffer->nitems; i++)
 	{
-		Assert(ItemPointerIsValid(&items[i]));
+		Assert(ItemPointerIsValid(&buffer->items[i]));
 
 		if ((i == 0) || !sorted)
 			continue;
 
-		Assert(ItemPointerCompare(&items[i - 1], &items[i]) < 0);
+		Assert(ItemPointerCompare(&buffer->items[i - 1], &buffer->items[i]) < 0);
 	}
 #endif
 }
@@ -1225,6 +1223,33 @@ GinBufferKeyEquals(GinBuffer *buffer, GinTuple *tup)
 	return (memcmp(tup->data, DatumGetPointer(buffer->key), buffer->keylen) == 0);
 }
 
+/*
+ * GinBufferStoreTuple
+ *		Add data from a GinTuple into the GinBuffer.
+ *
+ * If the buffer is empty, we simply initialize it with data from the tuple.
+ * Otherwise data from the tuple (the TID list) is added to the TID data in
+ * the buffer, either by simply appending the TIDs or doing merge sort.
+ *
+ * The data (for the same key) is expected to be processed sorted by first
+ * TID. But this does not guarantee the lists do not overlap, especially in
+ * the leader, because the workers process interleaving data. But even in
+ * a single worker, lists can overlap - parallel scans require sync-scans,
+ * and if the scan starts somewhere in the table and then wraps around, it
+ * may contain very wide lists (in terms of TID range).
+ *
+ * But the ginMergeItemPointers() is already smart about detecting cases
+ * where it can simply concatenate the lists, and when full mergesort is
+ * needed. And does the right thing.
+ *
+ * By keeping the first TID in the GinTuple and sorting by that, we make
+ * it more likely the lists won't overlap very often.
+ *
+ * XXX How frequent can the overlaps be? If the scan does not wrap around,
+ * there should be no overlapping lists, and thus no mergesort. After a
+ * wraparound, there probably can be many - the one list will be very wide,
+ * with a very low and high TID, and all other lists will overlap with it.
+ */
 static void
 GinBufferStoreTuple(GinBuffer *buffer, GinTuple *tup)
 {
@@ -1251,7 +1276,12 @@ GinBufferStoreTuple(GinBuffer *buffer, GinTuple *tup)
 			buffer->key = (Datum) 0;
 	}
 
-	/* copy the new TIDs into the buffer, combine using merge-sort */
+	/*
+	 * Copy the new TIDs into the buffer, combine with existing data (if any)
+	 * using merge-sort. The mergesort is already smart about cases where it
+	 * can simply concatenate the two lists, and when it actually needs to
+	 * merge the data in an expensive way.
+	 */
 	{
 		int			nnew;
 		ItemPointer new;
@@ -1266,21 +1296,9 @@ GinBufferStoreTuple(GinBuffer *buffer, GinTuple *tup)
 
 		buffer->items = new;
 		buffer->nitems = nnew;
-	}
-
-	AssertCheckItemPointers(buffer->items, buffer->nitems, false);
-}
 
-static void
-GinBufferSortItems(GinBuffer *buffer)
-{
-	/* we should not have a buffer with no TIDs to sort */
-	Assert(buffer->items != NULL);
-	Assert(buffer->nitems > 0);
-
-	pg_qsort(buffer->items, buffer->nitems, sizeof(ItemPointerData), tid_cmp);
-
-	AssertCheckItemPointers(buffer->items, buffer->nitems, true);
+		AssertCheckItemPointers(buffer, true);
+	}
 }
 
 /* XXX Might be better to have a separate memory context for the buffer. */
@@ -1307,13 +1325,11 @@ GinBufferReset(GinBuffer *buffer)
 	buffer->typlen = 0;
 	buffer->typbyval = 0;
 
-	/*
-	 * XXX Should we do something if the array of TIDs gets too large? It may
-	 * grow too much, and we'll not free it until the worker finishes
-	 * building. But it's better to not let the array grow arbitrarily large,
-	 * and enforce work_mem as memory limit by flushing the buffer into the
-	 * tuplestore.
-	 */
+	if (buffer->items)
+	{
+		pfree(buffer->items);
+		buffer->items = NULL;
+	}
 }
 
 /*
@@ -1409,7 +1425,7 @@ _gin_parallel_merge(GinBuildState *state)
 			 * the data into the insert, and start a new entry for current
 			 * GinTuple.
 			 */
-			AssertCheckItemPointers(buffer->items, buffer->nitems, true);
+			AssertCheckItemPointers(buffer, true);
 
 			ginEntryInsert(&state->ginstate,
 						   buffer->attnum, buffer->key, buffer->category,
@@ -1419,14 +1435,17 @@ _gin_parallel_merge(GinBuildState *state)
 			GinBufferReset(buffer);
 		}
 
-		/* now remember the new key */
+		/*
+		 * Remember data for the current tuple (either remember the new key,
+		 * or append if to the existing data).
+		 */
 		GinBufferStoreTuple(buffer, tup);
 	}
 
 	/* flush data remaining in the buffer (for the last key) */
 	if (!GinBufferIsEmpty(buffer))
 	{
-		AssertCheckItemPointers(buffer->items, buffer->nitems, true);
+		AssertCheckItemPointers(buffer, true);
 
 		ginEntryInsert(&state->ginstate,
 					   buffer->attnum, buffer->key, buffer->category,
@@ -1529,7 +1548,7 @@ _gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort)
 			 * the data into the insert, and start a new entry for current
 			 * GinTuple.
 			 */
-			GinBufferSortItems(buffer);
+			AssertCheckItemPointers(buffer, true);
 
 			ntup = _gin_build_tuple(buffer->attnum, buffer->category,
 									buffer->key, buffer->typlen, buffer->typbyval,
@@ -1543,7 +1562,10 @@ _gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort)
 			GinBufferReset(buffer);
 		}
 
-		/* now remember the new key */
+		/*
+		 * Remember data for the current tuple (either remember the new key,
+		 * or append if to the existing data).
+		 */
 		GinBufferStoreTuple(buffer, tup);
 	}
 
@@ -1553,7 +1575,7 @@ _gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort)
 		GinTuple   *ntup;
 		Size		ntuplen;
 
-		GinBufferSortItems(buffer);
+		AssertCheckItemPointers(buffer, true);
 
 		ntup = _gin_build_tuple(buffer->attnum, buffer->category,
 								buffer->key, buffer->typlen, buffer->typbyval,
@@ -1854,6 +1876,7 @@ _gin_build_tuple(OffsetNumber attrnum, unsigned char category,
 	tuple->category = category;
 	tuple->keylen = keylen;
 	tuple->nitems = nitems;
+	tuple->first = items[0];
 
 	/* key type info */
 	tuple->typlen = typlen;
@@ -1938,6 +1961,12 @@ _gin_parse_tuple(GinTuple *a, ItemPointerData **items)
  * assumption that if we get two keys that are two different representations
  * of a logically equal value, it'll get merged by the index build.
  *
+ * If the key value matches, we compare the first TID value in the TID list,
+ * which means the tuples are merged in an order in which they are most
+ * likely to be simply concatenated. (This "first" TID will also allow us
+ * to determine a point up to which the list is fully determined and can be
+ * written into the index to enforce a memory limit etc.)
+ *
  * FIXME Is the assumption we can just memcmp() actually valid? Won't this
  * trigger the "could not split GIN page; all old items didn't fit" error
  * when trying to update the TID list?
@@ -1972,19 +2001,27 @@ _gin_compare_tuples(GinTuple *a, GinTuple *b)
 		 */
 		if (a->typbyval)
 		{
-			return memcmp(&keya, &keyb, a->keylen);
+			int			r = memcmp(&keya, &keyb, a->keylen);
+
+			/* if the key is the same, consider the first TID in the array */
+			return (r != 0) ? r : ItemPointerCompare(&a->first, &b->first);
 		}
 		else
 		{
+			int			r;
+
 			if (a->keylen < b->keylen)
 				return -1;
 
 			if (a->keylen > b->keylen)
 				return 1;
 
-			return memcmp(DatumGetPointer(keya), DatumGetPointer(keyb), a->keylen);
+			r = memcmp(DatumGetPointer(keya), DatumGetPointer(keyb), a->keylen);
+
+			/* if the key is the same, consider the first TID in the array */
+			return (r != 0) ? r : ItemPointerCompare(&a->first, &b->first);
 		}
 	}
 
-	return 0;
+	return ItemPointerCompare(&a->first, &b->first);
 }
diff --git a/src/include/access/gin_tuple.h b/src/include/access/gin_tuple.h
index 2cd5e716a9a..73280066f2c 100644
--- a/src/include/access/gin_tuple.h
+++ b/src/include/access/gin_tuple.h
@@ -12,6 +12,13 @@
 
 #include "storage/itemptr.h"
 
+/*
+ * Each worker sees tuples in CTID order, so if we track the first TID and
+ * compare that when combining results in the worker, we would not need to
+ * do an expensive sort in workers (the mergesort is already smart about
+ * detecting this and just concatenating the lists). We'd still need the
+ * full mergesort in the leader, but that's much cheaper.
+ */
 typedef struct GinTuple
 {
 	Size		tuplen;			/* length of the whole tuple */
@@ -20,6 +27,7 @@ typedef struct GinTuple
 	bool		typbyval;		/* typbyval for key */
 	OffsetNumber attrnum;		/* attnum of index key */
 	signed char category;		/* category: normal or NULL? */
+	ItemPointerData first;		/* first TID in the array */
 	int			nitems;			/* number of TIDs in the data */
 	char		data[FLEXIBLE_ARRAY_MEMBER];
 } GinTuple;
-- 
2.45.2