From c1dafb0209ce225e9e825c07ddcb1416ce2435fe Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas.vondra@postgresql.org>
Date: Thu, 2 May 2024 15:21:36 +0200
Subject: [PATCH v20240502 3/8] Remove the explicit pg_qsort in workers

We don't need to do the explicit sort before building the GIN tuple,
because the mergesort in GinBufferStoreTuple is already maintaining the
correct order (this was added in an earlier commit).

The comment also adds a field with the first TID, and modifies the
comparator to sort by it (for each key value). This helps workers to
build non-overlapping TID lists and simply append values instead of
having to do the actual mergesort to combine them. This is best-effort,
i.e. it's not guaranteed to eliminate the mergesort - in particular,
parallel scans are synchronized, and thus may start somewhere in the
middle of the table, and wrap around. In which case there may be very
wide list (with low/high TID values).

Note: There's an XXX comment with a couple ideas on how to improve this,
at the cost of more complexity.
---
 src/backend/access/gin/gininsert.c | 130 +++++++++++++++++++++--------
 src/include/access/gin_tuple.h     |   9 +-
 2 files changed, 104 insertions(+), 35 deletions(-)

diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index 8011e0b5ad5..5762c9520d8 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -1110,12 +1110,6 @@ _gin_parallel_heapscan(GinBuildState *state)
 	return state->bs_reltuples;
 }
 
-static int
-tid_cmp(const void *a, const void *b)
-{
-	return ItemPointerCompare((ItemPointer) a, (ItemPointer) b);
-}
-
 /*
  * State used to combine accumulate TIDs from multiple GinTuples for the same
  * key value.
@@ -1147,17 +1141,21 @@ AssertCheckGinBuffer(GinBuffer *buffer)
 }
 
 static void
-AssertCheckItemPointers(ItemPointerData *items, int nitems, bool sorted)
+AssertCheckItemPointers(GinBuffer *buffer, bool sorted)
 {
 #ifdef USE_ASSERT_CHECKING
-	for (int i = 0; i < nitems; i++)
+	/* we should not have a buffer with no TIDs to sort */
+	Assert(buffer->items != NULL);
+	Assert(buffer->nitems > 0);
+
+	for (int i = 0; i < buffer->nitems; i++)
 	{
-		Assert(ItemPointerIsValid(&items[i]));
+		Assert(ItemPointerIsValid(&buffer->items[i]));
 
 		if ((i == 0) || !sorted)
 			continue;
 
-		Assert(ItemPointerCompare(&items[i - 1], &items[i]) < 0);
+		Assert(ItemPointerCompare(&buffer->items[i - 1], &buffer->items[i]) < 0);
 	}
 #endif
 }
@@ -1220,6 +1218,45 @@ GinBufferKeyEquals(GinBuffer *buffer, GinTuple *tup)
 	return (memcmp(tup->data, DatumGetPointer(buffer->key), buffer->keylen) == 0);
 }
 
+/*
+ * GinBufferStoreTuple
+ *		Add data from a GinTuple into the GinBuffer.
+ *
+ * If the buffer is empty, we simply initialize it with data from the tuple.
+ * Otherwise data from the tuple (the TID list) is added to the TID data in
+ * the buffer, either by simply appending the TIDs or doing merge sort.
+ *
+ * The data (for the same key) is expected to be processed sorted by first
+ * TID. But this does not guarantee the lists do not overlap, especially in
+ * the leader, because the workers process interleaving data. But even in
+ * a single worker, lists can overlap - parallel scans require sync-scans,
+ * and if the scan starts somewhere in the table and then wraps around, it
+ * may contain very wide lists (in terms of TID range).
+ *
+ * But the ginMergeItemPointers() is already smart about detecting cases
+ * where it can simply concatenate the lists, and when full mergesort is
+ * needed. And does the right thing.
+ *
+ * By keeping the first TID in the GinTuple and sorting by that, we make
+ * it more likely the lists won't overlap very often.
+ *
+ * XXX How frequent can the overlaps be? If the scan does not wrap around,
+ * there should be no overlapping lists, and thus no mergesort. After ab
+ * overlap, there probably can be many - the one list will be very wide,
+ * with a very low and high TID, and all other lists will overlap with it.
+ * I'm not sure how much we can do to prevent that, short of disabling sync
+ * scans (which for parallel scans is currently not possible). One option
+ * would be to keep two lists of TIDs, and see if the new list can be
+ * concatenated with one of them. The idea is that there's only one wide
+ * list (because the wraparound happens only once), and then do the
+ * mergesort only once at the very end.
+ *
+ * XXX Alternatively, we could simply detect the case when the lists can't
+ * be appended, and flush the current list out. The wrap around happens only
+ * once, so there's going to be only such wide list, and it'll be sorted
+ * first (because it has the lowest TID for the key). So we'd do this at
+ * most once per key.
+ */
 static void
 GinBufferStoreTuple(GinBuffer *buffer, GinTuple *tup)
 {
@@ -1246,7 +1283,12 @@ GinBufferStoreTuple(GinBuffer *buffer, GinTuple *tup)
 			buffer->key = (Datum) 0;
 	}
 
-	/* copy the new TIDs into the buffer, combine using merge-sort */
+	/*
+	 * Copy the new TIDs into the buffer, combine with existing data (if any)
+	 * using merge-sort. The mergesort is already smart about cases where it
+	 * can simply concatenate the two lists, and when it actually needs to
+	 * merge the data in an expensive way.
+	 */
 	{
 		int			nnew;
 		ItemPointer new;
@@ -1261,21 +1303,9 @@ GinBufferStoreTuple(GinBuffer *buffer, GinTuple *tup)
 
 		buffer->items = new;
 		buffer->nitems = nnew;
-	}
-
-	AssertCheckItemPointers(buffer->items, buffer->nitems, false);
-}
-
-static void
-GinBufferSortItems(GinBuffer *buffer)
-{
-	/* we should not have a buffer with no TIDs to sort */
-	Assert(buffer->items != NULL);
-	Assert(buffer->nitems > 0);
-
-	pg_qsort(buffer->items, buffer->nitems, sizeof(ItemPointerData), tid_cmp);
 
-	AssertCheckItemPointers(buffer->items, buffer->nitems, true);
+		AssertCheckItemPointers(buffer, true);
+	}
 }
 
 /* XXX probably would be better to have a memory context for the buffer */
@@ -1299,6 +1329,11 @@ GinBufferReset(GinBuffer *buffer)
 	buffer->typlen = 0;
 	buffer->typbyval = 0;
 
+	if (buffer->items)
+	{
+		pfree(buffer->items);
+		buffer->items = NULL;
+	}
 	/* XXX should do something with extremely large array of items? */
 }
 
@@ -1390,7 +1425,7 @@ _gin_parallel_merge(GinBuildState *state)
 			 * the data into the insert, and start a new entry for current
 			 * GinTuple.
 			 */
-			AssertCheckItemPointers(buffer->items, buffer->nitems, true);
+			AssertCheckItemPointers(buffer, true);
 
 			ginEntryInsert(&state->ginstate,
 						   buffer->attnum, buffer->key, buffer->category,
@@ -1400,14 +1435,17 @@ _gin_parallel_merge(GinBuildState *state)
 			GinBufferReset(buffer);
 		}
 
-		/* now remember the new key */
+		/*
+		 * Remember data for the current tuple (either remember the new key,
+		 * or append if to the existing data).
+		 */
 		GinBufferStoreTuple(buffer, tup);
 	}
 
 	/* flush data remaining in the buffer (for the last key) */
 	if (!GinBufferIsEmpty(buffer))
 	{
-		AssertCheckItemPointers(buffer->items, buffer->nitems, true);
+		AssertCheckItemPointers(buffer, true);
 
 		ginEntryInsert(&state->ginstate,
 					   buffer->attnum, buffer->key, buffer->category,
@@ -1510,7 +1548,7 @@ _gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort)
 			 * the data into the insert, and start a new entry for current
 			 * GinTuple.
 			 */
-			GinBufferSortItems(buffer);
+			AssertCheckItemPointers(buffer, true);
 
 			ntup = _gin_build_tuple(buffer->attnum, buffer->category,
 									buffer->key, buffer->typlen, buffer->typbyval,
@@ -1524,7 +1562,10 @@ _gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort)
 			GinBufferReset(buffer);
 		}
 
-		/* now remember the new key */
+		/*
+		 * Remember data for the current tuple (either remember the new key,
+		 * or append if to the existing data).
+		 */
 		GinBufferStoreTuple(buffer, tup);
 	}
 
@@ -1534,7 +1575,7 @@ _gin_process_worker_data(GinBuildState *state, Tuplesortstate *worker_sort)
 		GinTuple   *ntup;
 		Size		ntuplen;
 
-		GinBufferSortItems(buffer);
+		AssertCheckItemPointers(buffer, true);
 
 		ntup = _gin_build_tuple(buffer->attnum, buffer->category,
 								buffer->key, buffer->typlen, buffer->typbyval,
@@ -1835,6 +1876,7 @@ _gin_build_tuple(OffsetNumber attrnum, unsigned char category,
 	tuple->category = category;
 	tuple->keylen = keylen;
 	tuple->nitems = nitems;
+	tuple->first = items[0];
 
 	/* key type info */
 	tuple->typlen = typlen;
@@ -1919,6 +1961,12 @@ _gin_parse_tuple(GinTuple *a, ItemPointerData **items)
  * assumption that if we get two keys that are two different representations
  * of a logically equal value, it'll get merged by the index build.
  *
+ * If the key value matches, we compare the first TID value in the TID list,
+ * which means the tuples are merged in an order in which they are most
+ * likely to be simply concatenated. (This "first" TID will also allow us
+ * to determine a point up to which the list is fully determined and can be
+ * written into the index to enforce a memory limit etc.)
+ *
  * FIXME Is the assumption we can just memcmp() actually valid? Won't this
  * trigger the "could not split GIN page; all old items didn't fit" error
  * when trying to update the TID list?
@@ -1947,20 +1995,34 @@ _gin_compare_tuples(GinTuple *a, GinTuple *b)
 		keya = _gin_parse_tuple(a, NULL);
 		keyb = _gin_parse_tuple(b, NULL);
 
+		/*
+		 * works for both byval and byref types with fixed lenght, because for
+		 * byval we set keylen to sizeof(Datum)
+		 */
 		if (a->typlen > 0)
-			return memcmp(&keya, &keyb, a->keylen);
+		{
+			int			r = memcmp(&keya, &keyb, a->keylen);
+
+			/* if the key is the same, consider the first TID in the array */
+			return (r != 0) ? r : ItemPointerCompare(&a->first, &b->first);
+		}
 
 		if (a->typlen < 0)
 		{
+			int			r;
+
 			if (a->keylen < b->keylen)
 				return -1;
 
 			if (a->keylen > b->keylen)
 				return 1;
 
-			return memcmp(DatumGetPointer(keya), DatumGetPointer(keyb), a->keylen);
+			r = memcmp(DatumGetPointer(keya), DatumGetPointer(keyb), a->keylen);
+
+			/* if the key is the same, consider the first TID in the array */
+			return (r != 0) ? r : ItemPointerCompare(&a->first, &b->first);
 		}
 	}
 
-	return 0;
+	return ItemPointerCompare(&a->first, &b->first);
 }
diff --git a/src/include/access/gin_tuple.h b/src/include/access/gin_tuple.h
index b5304b73ff1..c3641edd5fc 100644
--- a/src/include/access/gin_tuple.h
+++ b/src/include/access/gin_tuple.h
@@ -10,7 +10,13 @@
 #ifndef GIN_TUPLE_
 #define GIN_TUPLE_
 
-
+/*
+ * Each worker sees tuples in CTID order, so if we track the first TID and
+ * compare that when combining results in the worker, we would not need to
+ * do an expensive sort in workers (the mergesort is already smart about
+ * detecting this and just concatenating the lists). We'd still need the
+ * full mergesort in the leader, but that's much cheaper.
+ */
 typedef struct GinTuple
 {
 	Size		tuplen;			/* length of the whole tuple */
@@ -19,6 +25,7 @@ typedef struct GinTuple
 	bool		typbyval;		/* typbyval for key */
 	OffsetNumber attrnum;
 	signed char category;		/* category: normal or NULL? */
+	ItemPointerData first;		/* first TID in the array */
 	int			nitems;			/* number of TIDs in the data */
 	char		data[FLEXIBLE_ARRAY_MEMBER];
 } GinTuple;
-- 
2.44.0