From 123ef5a67b3548b7226c0c6bcc8d3f758d96ee82 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas.vondra@postgresql.org>
Date: Thu, 2 May 2024 15:21:55 +0200
Subject: [PATCH v20240619 11/11] Detect wrap around in parallel callback

When sync scan during index build wraps around, that may result in some
keys having very long TID lists, requiring "full" merge sort runs when
combining data in workers. It also causes problems with enforcing memory
limit, because we can't just dump the data - the index build requires
append-only posting lists, and violating this may result in errors like

  ERROR: could not split GIN page; all old items didn't fit

because after the scan wraps around, some of the TIDs may belong to the
beginning of the list, affecting the compression.

But we can deal with this in the callback - if we see the TID jump
backwards, that must mean a wraparound happened. In that case we simply dump
all the data accumulated in memory, and start from scratch.

This means there won't be any tuples with very wide TID ranges, instead
there'll be one tuple with a range at the end of the table, and another
tuple at the beginning. And all the lists in the worker will be
non-overlapping, and sort nicely based on first TID.

For the leader, we still need to do the full merge - the lists may
overlap and interleave in various ways. But there should be only very
few of those lists, about one per worker, making it not an issue.
---
 src/backend/access/gin/gininsert.c | 89 ++++++++++++++++++------------
 1 file changed, 55 insertions(+), 34 deletions(-)

diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index 8d16e484093..10d65abbf47 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -142,6 +142,7 @@ typedef struct
 	MemoryContext tmpCtx;
 	MemoryContext funcCtx;
 	BuildAccumulator accum;
+	ItemPointerData tid;
 
 	/* FIXME likely duplicate with indtuples */
 	double		bs_numtuples;
@@ -473,6 +474,47 @@ ginBuildCallback(Relation index, ItemPointer tid, Datum *values,
 	MemoryContextSwitchTo(oldCtx);
 }
 
+static void
+ginFlushBuildState(GinBuildState *buildstate, Relation index)
+{
+	ItemPointerData *list;
+	Datum		key;
+	GinNullCategory category;
+	uint32		nlist;
+	OffsetNumber attnum;
+	TupleDesc	tdesc = RelationGetDescr(index);
+
+	ginBeginBAScan(&buildstate->accum);
+	while ((list = ginGetBAEntry(&buildstate->accum,
+								 &attnum, &key, &category, &nlist)) != NULL)
+	{
+		/* information about the key */
+		Form_pg_attribute attr = TupleDescAttr(tdesc, (attnum - 1));
+
+		/* GIN tuple and tuple length */
+		GinTuple   *tup;
+		Size		tuplen;
+
+		/* there could be many entries, so be willing to abort here */
+		CHECK_FOR_INTERRUPTS();
+
+		tup = _gin_build_tuple(buildstate, attnum, category,
+							   key, attr->attlen, attr->attbyval,
+							   list, nlist, &tuplen);
+
+		tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen);
+
+		pfree(tup);
+	}
+
+	MemoryContextReset(buildstate->tmpCtx);
+	ginInitBA(&buildstate->accum);
+}
+
+/*
+ * Callback for parallel GIN builds. Detects sync-scan wraparound (the TID
+ * moving backwards) and flushes the accumulated in-memory state first, so
+ * that no key ends up with a TID list spanning the wraparound point.
+ */
 static void
 ginBuildCallbackParallel(Relation index, ItemPointer tid, Datum *values,
 						 bool *isnull, bool tupleIsAlive, void *state)
@@ -483,6 +525,16 @@ ginBuildCallbackParallel(Relation index, ItemPointer tid, Datum *values,
 
 	oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);
 
+	/* flush contents before wrapping around */
+	if (ItemPointerCompare(tid, &buildstate->tid) < 0)
+	{
+		elog(LOG, "calling ginFlushBuildState");
+		ginFlushBuildState(buildstate, index);
+	}
+
+	/* remember the TID we're about to process */
+	memcpy(&buildstate->tid, tid, sizeof(ItemPointerData));
+
 	for (i = 0; i < buildstate->ginstate.origTupdesc->natts; i++)
 		ginHeapTupleBulkInsert(buildstate, (OffsetNumber) (i + 1),
 							   values[i], isnull[i], tid);
@@ -517,40 +569,7 @@ ginBuildCallbackParallel(Relation index, ItemPointer tid, Datum *values,
 	 * XXX probably should use 32MB, not work_mem, as used during planning?
 	 */
 	if (buildstate->accum.allocatedMemory >= (Size) work_mem * 1024L)
-	{
-		ItemPointerData *list;
-		Datum		key;
-		GinNullCategory category;
-		uint32		nlist;
-		OffsetNumber attnum;
-		TupleDesc	tdesc = RelationGetDescr(index);
-
-		ginBeginBAScan(&buildstate->accum);
-		while ((list = ginGetBAEntry(&buildstate->accum,
-									 &attnum, &key, &category, &nlist)) != NULL)
-		{
-			/* information about the key */
-			Form_pg_attribute attr = TupleDescAttr(tdesc, (attnum - 1));
-
-			/* GIN tuple and tuple length */
-			GinTuple   *tup;
-			Size		tuplen;
-
-			/* there could be many entries, so be willing to abort here */
-			CHECK_FOR_INTERRUPTS();
-
-			tup = _gin_build_tuple(buildstate, attnum, category,
-								   key, attr->attlen, attr->attbyval,
-								   list, nlist, &tuplen);
-
-			tuplesort_putgintuple(buildstate->bs_worker_sort, tup, tuplen);
-
-			pfree(tup);
-		}
-
-		MemoryContextReset(buildstate->tmpCtx);
-		ginInitBA(&buildstate->accum);
-	}
+		ginFlushBuildState(buildstate, index);
 
 	MemoryContextSwitchTo(oldCtx);
 }
@@ -586,6 +605,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 	buildstate.bs_numtuples = 0;
 	buildstate.bs_reltuples = 0;
 	buildstate.bs_leader = NULL;
+	memset(&buildstate.tid, 0, sizeof(ItemPointerData));
 
 	/* initialize the meta page */
 	MetaBuffer = GinNewBuffer(index);
@@ -2007,6 +2027,7 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
 	buildstate.indtuples = 0;
 	/* XXX shouldn't this initialize the other fiedls, like ginbuild()? */
 	memset(&buildstate.buildStats, 0, sizeof(GinStatsData));
+	memset(&buildstate.tid, 0, sizeof(ItemPointerData));
 
 	/*
 	 * create a temporary memory context that is used to hold data not yet
-- 
2.45.2

