From dcb0058540f46fa5f9cbb21e1cff640874418401 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas.vondra@postgresql.org>
Date: Tue, 7 Nov 2023 18:15:20 +0100
Subject: [PATCH v3 2/4] fix review comments

---
 src/backend/access/brin/brin.c             | 44 +++++++++-------------
 src/backend/utils/sort/tuplesort.c         |  3 --
 src/backend/utils/sort/tuplesortvariants.c | 11 +++---
 3 files changed, 23 insertions(+), 35 deletions(-)

diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index b7cd29c5968..d54d39a535e 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -162,7 +162,7 @@ typedef struct BrinBuildState
 {
 	Relation	bs_irel;
 	int			bs_numtuples;
-	int			bs_reltuples;
+	double		bs_reltuples;
 	Buffer		bs_currentInsertBuf;
 	BlockNumber bs_pagesPerRange;
 	BlockNumber bs_currRangeStart;
@@ -172,13 +172,12 @@ typedef struct BrinBuildState
 
 	/*
 	 * bs_leader is only present when a parallel index build is performed, and
-	 * only in the leader process. (Actually, only the leader has a
+	 * only in the leader process. (Actually, only the leader process has a
 	 * BrinBuildState.)
 	 */
 	BrinLeader *bs_leader;
 	int			bs_worker_id;
 	BrinSpool  *bs_spool;
-	BrinSpool  *bs_spool_out;
 } BrinBuildState;
 
 /*
@@ -961,7 +960,7 @@ brinbuildCallback(Relation index,
 /*
  * A version of the callback, used by parallel index builds. The main difference
  * is that instead of writing the BRIN tuples into the index, we write them into
- * a shared tuplestore file, and leave the insertion up to the leader (which may
+ * a shared tuplestore, and leave the insertion up to the leader (which may
  * reorder them a bit etc.). The callback also does not generate empty ranges,
  * those may be added by the leader when merging results from workers.
  */
@@ -981,10 +980,10 @@ brinbuildCallbackParallel(Relation index,
 	/*
 	 * If we're in a block that belongs to a future range, summarize what
 	 * we've got and start afresh.  Note the scan might have skipped many
-	 * pages, if they were devoid of live tuples; make sure to insert index
-	 * tuples for those too.
+	 * pages, if they were devoid of live tuples; we do not create empty
+	 * BRIN ranges here - the leader is responsible for filling them in.
 	 */
-	while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1)
+	if (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1)
 	{
 
 		BRIN_elog((DEBUG2,
@@ -996,19 +995,15 @@ brinbuildCallbackParallel(Relation index,
 		form_and_spill_tuple(state);
 
 		/*
-		 * set state to correspond to the next range
+		 * Set state to correspond to the next range (for this block).
 		 *
-		 * XXX This has the issue that it skips ranges summarized by other
-		 * workers, but it also skips empty ranges that should have been
-		 * summarized. We'd need to either make the workers aware which
-		 * chunk they are actually processing (which is currently known
-		 * only in the ParallelBlockTableScan bit). Or we could ignore it
-		 * here, and then decide it while "merging" results from workers
-		 * (if there's no entry for the range, it had to be empty so we
-		 * just add an empty one).
+		 * This skips ranges that are either empty (and so we don't get any
+		 * tuples to summarize), or processed by other workers. We can't
+		 * differentiate those cases here easily, so we leave it up to the
+		 * leader to fill empty ranges where needed.
 		 */
-		while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1)
-			state->bs_currRangeStart += state->bs_pagesPerRange;
+		state->bs_currRangeStart
+			= state->bs_pagesPerRange * (thisblock / state->bs_pagesPerRange);
 
 		/* re-initialize state for it */
 		brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);
@@ -1137,11 +1132,7 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 		 * on the amount of memory used by a CREATE INDEX operation, regardless of
 		 * the use of parallelism or any other factor.
 		 */
-		state->bs_spool_out = (BrinSpool *) palloc0(sizeof(BrinSpool));
-		state->bs_spool_out->heap = heap;
-		state->bs_spool_out->index = index;
-
-		state->bs_spool_out->sortstate =
+		state->bs_spool->sortstate =
 			tuplesort_begin_index_brin(heap, index,
 									   maintenance_work_mem, coordinate,
 									   TUPLESORT_NONE);
@@ -2427,6 +2418,7 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)
 	BlockNumber	prevblkno = InvalidBlockNumber;
 	BrinTuple  *emptyTuple = NULL;
 	Size		emptySize;
+	BrinSpool  *spool = state->bs_spool;
 
 	/* Shutdown worker processes */
 	WaitForParallelWorkersToFinish(brinleader->pcxt);
@@ -2438,7 +2430,7 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)
 	state->bs_reltuples = brinshared->reltuples;
 	state->bs_numtuples = brinshared->indtuples;
 
-	tuplesort_performsort(state->bs_spool_out->sortstate);
+	tuplesort_performsort(spool->sortstate);
 
 	/*
 	 * Read the BRIN tuples from the shared tuplesort, sorted by block number.
@@ -2452,7 +2444,7 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)
 	 * XXX We can't quite free the BrinTuple, though, because that's a field
 	 * in BrinSortTuple.
 	 */
-	while ((btup = tuplesort_getbrintuple(state->bs_spool_out->sortstate, &tuplen, true)) != NULL)
+	while ((btup = tuplesort_getbrintuple(spool->sortstate, &tuplen, true)) != NULL)
 	{
 		/*
 		 * We should not get two summaries for the same range. The workers
@@ -2494,7 +2486,7 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)
 		prevblkno = btup->bt_blkno;
 	}
 
-	tuplesort_end(state->bs_spool_out->sortstate);
+	tuplesort_end(spool->sortstate);
 
 	/*
 	 * Next, accumulate WAL usage.  (This must wait for the workers to finish,
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 4c6b396a8a8..ab6353bdcd1 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -1328,7 +1328,6 @@ tuplesort_puttuple_common(Tuplesortstate *state, SortTuple *tuple, bool useAbbre
 			break;
 
 		default:
-			Assert(false);
 			elog(ERROR, "invalid tuplesort state");
 			break;
 	}
@@ -1463,7 +1462,6 @@ tuplesort_performsort(Tuplesortstate *state)
 			break;
 
 		default:
-			Assert(false);
 			elog(ERROR, "invalid tuplesort state");
 			break;
 	}
@@ -1720,7 +1718,6 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward,
 			return false;
 
 		default:
-			Assert(false);
 			elog(ERROR, "invalid tuplesort state");
 			return false;		/* keep compiler quiet */
 	}
diff --git a/src/backend/utils/sort/tuplesortvariants.c b/src/backend/utils/sort/tuplesortvariants.c
index 343ed4bbc54..525cc01b474 100644
--- a/src/backend/utils/sort/tuplesortvariants.c
+++ b/src/backend/utils/sort/tuplesortvariants.c
@@ -161,9 +161,8 @@ typedef struct
 
 /*
  * Computing BrinTuple size with only the tuple is difficult, so we want to track
- * the length for r referenced by SortTuple. That's what BrinSortTuple is meant
- * to do - it's essentially a BrinTuple prefixed by length. We only write the
- * BrinTuple to the logtapes, though.
+ * the length referenced by the SortTuple. That's what BrinSortTuple is meant
+ * to do - it's essentially a BrinTuple prefixed by its length.
  */
 typedef struct BrinSortTuple
 {
@@ -796,11 +795,11 @@ tuplesort_putbrintuple(Tuplesortstate *state, BrinTuple *tuple, Size size)
 
 	/* allocate space for the whole BRIN sort tuple */
 	bstup = palloc(BRINSORTTUPLE_SIZE(size));
-	stup.tuple = bstup;
 
 	bstup->tuplen = size;
 	memcpy(&bstup->tuple, tuple, size);
 
+	stup.tuple = bstup;
 	stup.datum1 = tuple->bt_blkno;
 	stup.isnull1 = false;
 
@@ -1723,8 +1722,8 @@ comparetup_index_brin(const SortTuple *a, const SortTuple *b,
 	BrinTuple  *tuple1;
 	BrinTuple  *tuple2;
 
-	tuple1 = &((BrinSortTuple *) a)->tuple;
-	tuple2 = &((BrinSortTuple *) b)->tuple;
+	tuple1 = &((BrinSortTuple *) (a->tuple))->tuple;
+	tuple2 = &((BrinSortTuple *) (b->tuple))->tuple;
 
 	if (tuple1->bt_blkno > tuple2->bt_blkno)
 		return 1;
-- 
2.41.0

