From d3635b15c1acadeec0d2d74ee18fc295b6c1a912 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas.vondra@postgresql.org>
Date: Thu, 20 Jun 2024 20:50:51 +0200
Subject: [PATCH v20240620 7/7] Detect wrap around in parallel callback

When sync scan during index build wraps around, that may result in some
keys having very long TID lists, requiring "full" merge sort runs when
combining data in workers. It also causes problems with enforcing the memory
limit, because we can't just dump the data - the index build requires
append-only posting lists, and violating that may result in errors like

  ERROR: could not split GIN page; all old items didn't fit

because after the scan wraps around, some of the TIDs may belong to the
beginning of the list, affecting the compression.

But we can deal with this in the callback - if we see the TID jump
back, that must mean a wraparound happened. In that case we simply dump
all the data accumulated in memory, and start from scratch.

This means there won't be any tuples with very wide TID ranges, instead
there'll be one tuple with a range at the end of the table, and another
tuple at the beginning. And all the lists in the worker will be
non-overlapping, and sort nicely based on first TID.

For the leader, we still need to do the full merge - the lists may
overlap and interleave in various ways. But there should be only very
few of those lists, about one per worker, making it not an issue.
---
 src/backend/access/gin/gininsert.c | 96 ++++++++++++++++++------------
 1 file changed, 57 insertions(+), 39 deletions(-)

diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index b6a40dd7ddc..cc0e63ff7f8 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -142,6 +142,7 @@ typedef struct
 	MemoryContext tmpCtx;
 	MemoryContext funcCtx;
 	BuildAccumulator accum;
+	ItemPointerData tid;
 
 	/* FIXME likely duplicate with indtuples */
 	double		bs_numtuples;
@@ -497,6 +498,49 @@ ginBuildCallback(Relation index, ItemPointer tid, Datum *values,
  * distinct hash values).
  */
 static void
+ginFlushBuildState(GinBuildState *buildstate, Relation index)
+{
+	ItemPointerData *list;
+	Datum		key;
+	GinNullCategory category;
+	uint32		nlist;
+	OffsetNumber attnum;
+	TupleDesc	tdesc = RelationGetDescr(index);
+
+	ginBeginBAScan(&buildstate->accum);
+	while ((list = ginGetBAEntry(&buildstate->accum,
+								 &attnum, &key, &category, &nlist)) != NULL)
+	{
+		/* information about the key */
+		Form_pg_attribute attr = TupleDescAttr(tdesc, (attnum - 1));
+
+		/* GIN tuple and tuple length */
+		GinTuple   *tup;
+		Size		tuplen;
+
+		/* there could be many entries, so be willing to abort here */
+		CHECK_FOR_INTERRUPTS();
+
+		tup = _gin_build_tuple(buildstate, attnum, category,
+							   key, attr->attlen, attr->attbyval,
+							   list, nlist, &tuplen);
+
+		tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen);
+
+		pfree(tup);
+	}
+
+	MemoryContextReset(buildstate->tmpCtx);
+	ginInitBA(&buildstate->accum);
+}
+
+/*
+ * To detect a wraparound (which can happen with sync scans), we remember the
+ * last TID seen by each worker - if the next TID seen by the worker is lower,
+ * the scan must have wrapped around. We handle that by flushing the current
+ * buildstate to the tuplesort, so that we don't end up with wide TID lists.
+ */
+static void
 ginBuildCallbackParallel(Relation index, ItemPointer tid, Datum *values,
 						 bool *isnull, bool tupleIsAlive, void *state)
 {
@@ -506,6 +550,16 @@ ginBuildCallbackParallel(Relation index, ItemPointer tid, Datum *values,
 
 	oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
 
+	/* scan wrapped around - flush accumulated entries */
+	if (ItemPointerCompare(tid, &buildstate->tid) < 0)
+	{
+		elog(LOG, "calling ginFlushBuildState");
+		ginFlushBuildState(buildstate, index);
+	}
+
+	/* remember the TID we're about to process */
+	memcpy(&buildstate->tid, tid, sizeof(ItemPointerData));
+
 	for (i = 0; i < buildstate->ginstate.origTupdesc->natts; i++)
 		ginHeapTupleBulkInsert(buildstate, (OffsetNumber) (i + 1),
 							   values[i], isnull[i], tid);
@@ -520,40 +574,7 @@ ginBuildCallbackParallel(Relation index, ItemPointer tid, Datum *values,
 	 * keep using work_mem here.
 	 */
 	if (buildstate->accum.allocatedMemory >= (Size) work_mem * 1024L)
-	{
-		ItemPointerData *list;
-		Datum		key;
-		GinNullCategory category;
-		uint32		nlist;
-		OffsetNumber attnum;
-		TupleDesc	tdesc = RelationGetDescr(index);
-
-		ginBeginBAScan(&buildstate->accum);
-		while ((list = ginGetBAEntry(&buildstate->accum,
-									 &attnum, &key, &category, &nlist)) != NULL)
-		{
-			/* information about the key */
-			Form_pg_attribute attr = TupleDescAttr(tdesc, (attnum - 1));
-
-			/* GIN tuple and tuple length */
-			GinTuple   *tup;
-			Size		tuplen;
-
-			/* there could be many entries, so be willing to abort here */
-			CHECK_FOR_INTERRUPTS();
-
-			tup = _gin_build_tuple(buildstate, attnum, category,
-								   key, attr->attlen, attr->attbyval,
-								   list, nlist, &tuplen);
-
-			tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen);
-
-			pfree(tup);
-		}
-
-		MemoryContextReset(buildstate->tmpCtx);
-		ginInitBA(&buildstate->accum);
-	}
+		ginFlushBuildState(buildstate, index);
 
 	MemoryContextSwitchTo(oldCtx);
 }
@@ -590,6 +611,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 	buildstate.bs_numtuples = 0;
 	buildstate.bs_reltuples = 0;
 	buildstate.bs_leader = NULL;
+	memset(&buildstate.tid, 0, sizeof(ItemPointerData));
 
 	/* initialize the meta page */
 	MetaBuffer = GinNewBuffer(index);
@@ -1314,11 +1336,6 @@ GinBufferShouldTrim(GinBuffer *buffer, GinTuple *tup)
  * By keeping the first TID in the GinTuple and sorting by that, we make
  * it more likely the lists won't overlap very often.
  *
- * XXX How frequent can the overlaps be? If the scan does not wrap around,
- * there should be no overlapping lists, and thus no mergesort. After a
- * wraparound, there probably can be many - the one list will be very wide,
- * with a very low and high TID, and all other lists will overlap with it.
- *
  * XXX Maybe we could/should allocate the buffer once and then keep it
  * without palloc/pfree. That won't help when just calling the mergesort,
  * as that does palloc internally, but if we detected the append case,
@@ -2005,6 +2022,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
 	buildstate.indtuples = 0;
 	/* XXX Shouldn't this initialize the other fields too, like ginbuild()? */
 	memset(&buildstate.buildStats, 0, sizeof(GinStatsData));
+	memset(&buildstate.tid, 0, sizeof(ItemPointerData));
 
 	/*
 	 * create a temporary memory context that is used to hold data not yet
-- 
2.45.2

