From ec1798ee481d4a3ccc466eb5e5c14a5a80fd87fd Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Mon, 24 Jun 2024 01:14:52 +0200
Subject: [PATCH v20240624 3/7] Remove the explicit pg_qsort in workers

We don't need to do the explicit sort before building the GIN tuple,
because the mergesort in GinBufferStoreTuple is already maintaining the
correct order (this was added in an earlier commit).

The comment also adds a field with the first TID, and modifies the
comparator to sort by it (for each key value). This helps workers to
build non-overlapping TID lists and simply append values instead of
having to do the actual mergesort to combine them. This is best-effort,
i.e. it's not guaranteed to eliminate the mergesort - in particular,
parallel scans are synchronized, and thus may start somewhere in the
middle of the table, and wrap around. In which case there may be very
wide list (with low/high TID values).

Note: There's an XXX comment with a couple ideas on how to improve this,
at the cost of more complexity.
---
 src/backend/access/gin/gininsert.c | 107 +++++++++++++++++------------
 src/include/access/gin_tuple.h     |  11 ++-
 2 files changed, 74 insertions(+), 44 deletions(-)

diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index 1fa40e3ff72..df33e5947d8 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -1160,19 +1160,27 @@ typedef struct GinBuffer
 /*
  * Check that TID array contains valid values, and that it's sorted (if we
  * expect it to be).
+ *
+ * XXX At this point there are no places where "sorted=false" should be
+ * necessary, because we always use merge-sort to combine the old and new
+ * TID list. So maybe we should get rid of the argument entirely.
  */
 static void
-AssertCheckItemPointers(ItemPointerData *items, int nitems, bool sorted)
+AssertCheckItemPointers(GinBuffer *buffer, bool sorted)
 {
 #ifdef USE_ASSERT_CHECKING
-	for (int i = 0; i < nitems; i++)
+	/* we should not have a buffer with no TIDs to sort */
+	Assert(buffer->items != NULL);
+	Assert(buffer->nitems > 0);
+
+	for (int i = 0; i < buffer->nitems; i++)
 	{
-		Assert(ItemPointerIsValid(&items[i]));
+		Assert(ItemPointerIsValid(&buffer->items[i]));
 
 		if ((i == 0) || !sorted)
 			continue;
 
-		Assert(ItemPointerCompare(&items[i - 1], &items[i]) < 0);
+		Assert(ItemPointerCompare(&buffer->items[i - 1], &buffer->items[i]) < 0);
 	}
 #endif
 }
@@ -1189,8 +1197,10 @@ AssertCheckGinBuffer(GinBuffer *buffer)
 	 * we don't know if the TID array is expected to be sorted or not
 	 *
 	 * XXX maybe we can pass that to AssertCheckGinBuffer() call?
+	 * XXX actually with the mergesort in GinBufferStoreTuple, we
+	 * should not need 'false' here. See AssertCheckItemPointers.
 	 */
-	AssertCheckItemPointers(buffer->items, buffer->nitems, false);
+	AssertCheckItemPointers(buffer, false);
 #endif
 }
 
@@ -1295,8 +1305,26 @@ GinBufferKeyEquals(GinBuffer *buffer, GinTuple *tup)
  *		Add data (especially TID list) from a GIN tuple to the buffer.
  *
  * The buffer is expected to be empty (in which case it's initialized), or
- * having the same key. The TID values from the tuple are simply appended
- * to the array, without sorting.
+ * having the same key. The TID values from the tuple are combined with the
+ * stored values using a merge sort.
+ *
+ * The tuples (for the same key) is expected to be sorted by first TID. But
+ * this does not guarantee the lists do not overlap, especially in the leader,
+ * because the workers process interleaving data. But even in a single worker,
+ * lists can overlap - parallel scans require sync-scans, and if a scan wraps,
+ * obe of the lists may be very wide (in terms of TID range).
+ *
+ * But the ginMergeItemPointers() is already smart about detecting cases when
+ * it can simply concatenate the lists, and when full mergesort is needed. And
+ * does the right thing.
+ *
+ * By keeping the first TID in the GinTuple and sorting by that, we make it
+ * more likely the lists won't overlap very often.
+ *
+ * XXX How frequent can the overlaps be? If the scan does not wrap around,
+ * there should be no overlapping lists, and thus no mergesort. After a
+ * wraparound, there probably can be many - the one list will be very wide,
+ * with a very low and high TID, and all other lists will overlap with it.
  *
  * XXX We expect the tuples to contain sorted TID lists, so maybe we should
  * check that's true with an assert.
@@ -1342,33 +1370,9 @@ GinBufferStoreTuple(GinBuffer *buffer, GinTuple *tup)
 
 		buffer->items = new;
 		buffer->nitems = nnew;
-	}
-
-	/* we simply append the TID values, so don't check sorting */
-	AssertCheckItemPointers(buffer->items, buffer->nitems, false);
-}
-
-/* TID comparator for qsort */
-static int
-tid_cmp(const void *a, const void *b)
-{
-	return ItemPointerCompare((ItemPointer) a, (ItemPointer) b);
-}
-
-/*
- * GinBufferSortItems
- *		Sort the TID values stored in the TID buffer.
- */
-static void
-GinBufferSortItems(GinBuffer *buffer)
-{
-	/* we should not have a buffer with no TIDs to sort */
-	Assert(buffer->items != NULL);
-	Assert(buffer->nitems > 0);
-
-	pg_qsort(buffer->items, buffer->nitems, sizeof(ItemPointerData), tid_cmp);
 
-	AssertCheckItemPointers(buffer->items, buffer->nitems, true);
+		AssertCheckItemPointers(buffer, true);
+	}
 }
 
 /*
@@ -1505,7 +1509,7 @@ _gin_parallel_merge(GinBuildState *state)
 			 * the data into the insert, and start a new entry for current
 			 * GinTuple.
 			 */
-			AssertCheckItemPointers(buffer->items, buffer->nitems, true);
+			AssertCheckItemPointers(buffer, true);
 
 			ginEntryInsert(&state->ginstate,
 						   buffer->attnum, buffer->key, buffer->category,
@@ -1515,14 +1519,17 @@ _gin_parallel_merge(GinBuildState *state)
 			GinBufferReset(buffer);
 		}
 
-		/* now remember the new key */
+		/*
+		 * Remember data for the current tuple (either remember the new key,
+		 * or append if to the existing data).
+		 */
 		GinBufferStoreTuple(buffer, tup);
 	}
 
 	/* flush data remaining in the buffer (for the last key) */
 	if (!GinBufferIsEmpty(buffer))
 	{
-		AssertCheckItemPointers(buffer->items, buffer->nitems, true);
+		AssertCheckItemPointers(buffer, true);
 
 		ginEntryInsert(&state->ginstate,
 					   buffer->attnum, buffer->key, buffer->category,
@@ -1625,7 +1632,7 @@ _gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort)
 			 * the data into the insert, and start a new entry for current
 			 * GinTuple.
 			 */
-			GinBufferSortItems(buffer);
+			AssertCheckItemPointers(buffer, true);
 
 			ntup = _gin_build_tuple(buffer->attnum, buffer->category,
 									buffer->key, buffer->typlen, buffer->typbyval,
@@ -1639,7 +1646,10 @@ _gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort)
 			GinBufferReset(buffer);
 		}
 
-		/* now remember the new key */
+		/*
+		 * Remember data for the current tuple (either remember the new key,
+		 * or append if to the existing data).
+		 */
 		GinBufferStoreTuple(buffer, tup);
 	}
 
@@ -1649,7 +1659,7 @@ _gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort)
 		GinTuple   *ntup;
 		Size		ntuplen;
 
-		GinBufferSortItems(buffer);
+		AssertCheckItemPointers(buffer, true);
 
 		ntup = _gin_build_tuple(buffer->attnum, buffer->category,
 								buffer->key, buffer->typlen, buffer->typbyval,
@@ -1954,6 +1964,7 @@ _gin_build_tuple(OffsetNumber attrnum, unsigned char category,
 	tuple->category = category;
 	tuple->keylen = keylen;
 	tuple->nitems = nitems;
+	tuple->first = items[0];
 
 	/* key type info */
 	tuple->typlen = typlen;
@@ -2037,6 +2048,12 @@ _gin_parse_tuple(GinTuple *a, ItemPointerData **items)
  * compared last. The comparisons are done using type-specific sort support
  * functions.
  *
+ * If the key value matches, we compare the first TID value in the TID list,
+ * which means the tuples are merged in an order in which they are most
+ * likely to be simply concatenated. (This "first" TID will also allow us
+ * to determine a point up to which the list is fully determined and can be
+ * written into the index to enforce a memory limit etc.)
+ *
  * XXX We might try using memcmp(), based on the assumption that if we get
  * two keys that are two different representations of a logically equal
  * value, it'll get merged by the index build. But it's not clear that's
@@ -2049,6 +2066,7 @@ _gin_parse_tuple(GinTuple *a, ItemPointerData **items)
 int
 _gin_compare_tuples(GinTuple *a, GinTuple *b, SortSupport ssup)
 {
+	int			r;
 	Datum		keya,
 				keyb;
 
@@ -2070,10 +2088,13 @@ _gin_compare_tuples(GinTuple *a, GinTuple *b, SortSupport ssup)
 		keya = _gin_parse_tuple(a, NULL);
 		keyb = _gin_parse_tuple(b, NULL);
 
-		return ApplySortComparator(keya, false,
-								   keyb, false,
-								   &ssup[a->attrnum - 1]);
+		r = ApplySortComparator(keya, false,
+								keyb, false,
+								&ssup[a->attrnum - 1]);
+
+		/* if the key is the same, consider the first TID in the array */
+		return (r != 0) ? r : ItemPointerCompare(&a->first, &b->first);
 	}
 
-	return 0;
+	return ItemPointerCompare(&a->first, &b->first);
 }
diff --git a/src/include/access/gin_tuple.h b/src/include/access/gin_tuple.h
index 6f529a5aaf0..55dd8544b21 100644
--- a/src/include/access/gin_tuple.h
+++ b/src/include/access/gin_tuple.h
@@ -13,7 +13,15 @@
 #include "storage/itemptr.h"
 #include "utils/sortsupport.h"
 
-/* XXX do we still need all the fields now that we use SortSupport? */
+/*
+ * Each worker sees tuples in CTID order, so if we track the first TID and
+ * compare that when combining results in the worker, we would not need to
+ * do an expensive sort in workers (the mergesort is already smart about
+ * detecting this and just concatenating the lists). We'd still need the
+ * full mergesort in the leader, but that's much cheaper.
+ *
+ * XXX do we still need all the fields now that we use SortSupport?
+ */
 typedef struct GinTuple
 {
 	Size		tuplen;			/* length of the whole tuple */
@@ -22,6 +30,7 @@ typedef struct GinTuple
 	bool		typbyval;		/* typbyval for key */
 	OffsetNumber attrnum;		/* attnum of index key */
 	signed char category;		/* category: normal or NULL? */
+	ItemPointerData first;		/* first TID in the array */
 	int			nitems;			/* number of TIDs in the data */
 	char		data[FLEXIBLE_ARRAY_MEMBER];
 } GinTuple;
-- 
2.45.2

