From 555743e4bc885609d20768f7f2990c6ba69b13a9 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 13 Feb 2024 10:57:07 -0500
Subject: [PATCH v1 09/11] Make table_scan_bitmap_next_block() async friendly

table_scan_bitmap_next_block() previously returned false if we did not
wish to call table_scan_bitmap_next_tuple() on the tuples on the page.
This could happen when there were no visible tuples on the page or, due
to concurrent activity on the table, the block returned by the iterator
is past the known end of the table.

This forced the caller to be responsible for determining if additional
blocks should be fetched and then for invoking
table_scan_bitmap_next_block() for these blocks.

It makes more sense for table_scan_bitmap_next_block() to be responsible
for finding a block that is not past the end of the table and for
table_scan_bitmap_next_tuple() to return false if there are no visible
tuples on the page.

This also allows us to move responsibility for the iterator to table AM
specific code. This means handling invalid blocks is entirely up to
the table AM.

These changes will enable bitmapheapscan to use the future streaming
read API. The table AMs will implement a streaming read API callback
that returns the next block that needs to be fetched. In heap AM's case,
the callback will use the iterator to find the next block to be fetched.
Since choosing the next block will no longer the responsibility of
BitmapHeapNext(), the streaming read control flow requires these changes
to table_scan_bitmap_next_block().
---
 src/backend/access/heap/heapam.c          |  22 ++++
 src/backend/access/heap/heapam_handler.c  |  56 ++++++---
 src/backend/executor/nodeBitmapHeapscan.c | 132 ++++++++--------------
 src/include/access/relscan.h              |   3 +
 src/include/access/tableam.h              |  14 +--
 src/include/nodes/execnodes.h             |  10 +-
 6 files changed, 121 insertions(+), 116 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 88b4aad5820..d8569373987 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -959,6 +959,8 @@ heap_beginscan(Relation relation, Snapshot snapshot,
 	scan->empty_tuples = 0;
 	scan->rs_base.lossy_pages = 0;
 	scan->rs_base.exact_pages = 0;
+	scan->rs_base.shared_tbmiterator = NULL;
+	scan->rs_base.tbmiterator = NULL;
 
 	/*
 	 * Disable page-at-a-time mode if it's not a MVCC-safe snapshot.
@@ -1051,6 +1053,18 @@ heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params,
 		ReleaseBuffer(scan->vmbuffer);
 	scan->vmbuffer = InvalidBuffer;
 
+	if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN)
+	{
+		if (scan->rs_base.shared_tbmiterator)
+			tbm_end_shared_iterate(scan->rs_base.shared_tbmiterator);
+
+		if (scan->rs_base.tbmiterator)
+			tbm_end_iterate(scan->rs_base.tbmiterator);
+	}
+
+	scan->rs_base.shared_tbmiterator = NULL;
+	scan->rs_base.tbmiterator = NULL;
+
 	/*
 	 * reinitialize scan descriptor
 	 */
@@ -1074,6 +1088,14 @@ heap_endscan(TableScanDesc sscan)
 		ReleaseBuffer(scan->vmbuffer);
 	scan->vmbuffer = InvalidBuffer;
 
+	if (sscan->shared_tbmiterator)
+		tbm_end_shared_iterate(sscan->shared_tbmiterator);
+	sscan->shared_tbmiterator = NULL;
+
+	if (sscan->tbmiterator)
+		tbm_end_iterate(sscan->tbmiterator);
+	sscan->tbmiterator = NULL;
+
 	/*
 	 * decrement relation reference count and free scan descriptor storage
 	 */
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 6e85ef7a946..d55ece23a35 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -2114,17 +2114,49 @@ heapam_estimate_rel_size(Relation rel, int32 *attr_widths,
 
 static bool
 heapam_scan_bitmap_next_block(TableScanDesc scan,
-							  TBMIterateResult *tbmres)
+							  bool *recheck, BlockNumber *blockno)
 {
 	HeapScanDesc hscan = (HeapScanDesc) scan;
-	BlockNumber block = tbmres->blockno;
+	BlockNumber block;
 	Buffer		buffer;
 	Snapshot	snapshot;
 	int			ntup;
+	TBMIterateResult *tbmres;
 
 	hscan->rs_cindex = 0;
 	hscan->rs_ntuples = 0;
 
+	*blockno = InvalidBlockNumber;
+	*recheck = true;
+
+	do
+	{
+		if (scan->shared_tbmiterator)
+			tbmres = tbm_shared_iterate(scan->shared_tbmiterator);
+		else
+			tbmres = tbm_iterate(scan->tbmiterator);
+
+		if (tbmres == NULL)
+		{
+			/* no more entries in the bitmap */
+			Assert(hscan->empty_tuples == 0);
+			return false;
+		}
+
+		/*
+		 * Ignore any claimed entries past what we think is the end of the
+		 * relation. It may have been extended after the start of our scan (we
+		 * only hold an AccessShareLock, and it could be inserts from this
+		 * backend).  We don't take this optimization in SERIALIZABLE
+		 * isolation though, as we need to examine all invisible tuples
+		 * reachable by the index.
+		 */
+	} while (!IsolationIsSerializable() && tbmres->blockno >= hscan->rs_nblocks);
+
+	/* Got a valid block */
+	*blockno = tbmres->blockno;
+	*recheck = tbmres->recheck;
+
 	/*
 	 * We can skip fetching the heap page if we don't need any fields from the
 	 * heap, and the bitmap entries don't need rechecking, and all tuples on
@@ -2143,16 +2175,7 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
 		return true;
 	}
 
-	/*
-	 * Ignore any claimed entries past what we think is the end of the
-	 * relation. It may have been extended after the start of our scan (we
-	 * only hold an AccessShareLock, and it could be inserts from this
-	 * backend).  We don't take this optimization in SERIALIZABLE isolation
-	 * though, as we need to examine all invisible tuples reachable by the
-	 * index.
-	 */
-	if (!IsolationIsSerializable() && block >= hscan->rs_nblocks)
-		return false;
+	block = tbmres->blockno;
 
 	/*
 	 * Acquire pin on the target heap page, trading in any pin we held before.
@@ -2251,7 +2274,14 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
 			scan->lossy_pages++;
 	}
 
-	return ntup > 0;
+	/*
+	 * Return true to indicate that a valid block was found and the bitmap is
+	 * not exhausted. If there are no visible tuples on this page,
+	 * hscan->rs_ntuples will be 0 and heapam_scan_bitmap_next_tuple() will
+	 * return false returning control to this function to advance to the next
+	 * block in the bitmap.
+	 */
+	return true;
 }
 
 static bool
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c
index 4d55390715c..efc6952e353 100644
--- a/src/backend/executor/nodeBitmapHeapscan.c
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -76,7 +76,6 @@ BitmapHeapNext(BitmapHeapScanState *node)
 	ExprContext *econtext;
 	TableScanDesc scan;
 	TIDBitmap  *tbm;
-	TBMIterateResult *tbmres;
 	TupleTableSlot *slot;
 	ParallelBitmapHeapState *pstate = node->pstate;
 	dsa_area   *dsa = node->ss.ps.state->es_query_dsa;
@@ -88,7 +87,6 @@ BitmapHeapNext(BitmapHeapScanState *node)
 	slot = node->ss.ss_ScanTupleSlot;
 	scan = node->ss.ss_currentScanDesc;
 	tbm = node->tbm;
-	tbmres = node->tbmres;
 
 	/*
 	 * If we haven't yet performed the underlying index scan, do it, and begin
@@ -126,7 +124,6 @@ BitmapHeapNext(BitmapHeapScanState *node)
 
 			node->tbm = tbm;
 			tbmiterator = tbm_begin_iterate(tbm);
-			node->tbmres = tbmres = NULL;
 
 #ifdef USE_PREFETCH
 			if (node->prefetch_maximum > 0)
@@ -179,7 +176,6 @@ BitmapHeapNext(BitmapHeapScanState *node)
 
 			/* Allocate a private iterator and attach the shared state to it */
 			shared_tbmiterator = tbm_attach_shared_iterate(dsa, pstate->tbmiterator);
-			node->tbmres = tbmres = NULL;
 
 #ifdef USE_PREFETCH
 			if (node->prefetch_maximum > 0)
@@ -201,46 +197,23 @@ BitmapHeapNext(BitmapHeapScanState *node)
 																	can_skip_fetch);
 		}
 
-		node->tbmiterator = tbmiterator;
-		node->shared_tbmiterator = shared_tbmiterator;
+		scan->tbmiterator = tbmiterator;
+		scan->shared_tbmiterator = shared_tbmiterator;
+
 		node->initialized = true;
+
+		/* Get the first block. if none, end of scan */
+		if (!table_scan_bitmap_next_block(scan, &node->recheck, &node->blockno))
+			goto exit;
+		BitmapAdjustPrefetchIterator(node, node->blockno);
+		BitmapAdjustPrefetchTarget(node);
 	}
 
 	for (;;)
 	{
-		CHECK_FOR_INTERRUPTS();
-
-		/*
-		 * Get next page of results if needed
-		 */
-		if (tbmres == NULL)
+		while (table_scan_bitmap_next_tuple(scan, slot))
 		{
-			if (!pstate)
-				node->tbmres = tbmres = tbm_iterate(node->tbmiterator);
-			else
-				node->tbmres = tbmres = tbm_shared_iterate(node->shared_tbmiterator);
-			if (tbmres == NULL)
-			{
-				/* no more entries in the bitmap */
-				break;
-			}
-
-			BitmapAdjustPrefetchIterator(node, tbmres->blockno);
-
-			if (!table_scan_bitmap_next_block(scan, tbmres))
-			{
-				/* AM doesn't think this block is valid, skip */
-				continue;
-			}
-
-			/* Adjust the prefetch target */
-			BitmapAdjustPrefetchTarget(node);
-		}
-		else
-		{
-			/*
-			 * Continuing in previously obtained page.
-			 */
+			CHECK_FOR_INTERRUPTS();
 
 #ifdef USE_PREFETCH
 
@@ -262,53 +235,46 @@ BitmapHeapNext(BitmapHeapScanState *node)
 				SpinLockRelease(&pstate->mutex);
 			}
 #endif							/* USE_PREFETCH */
-		}
 
-		/*
-		 * We issue prefetch requests *after* fetching the current page to try
-		 * to avoid having prefetching interfere with the main I/O. Also, this
-		 * should happen only when we have determined there is still something
-		 * to do on the current page, else we may uselessly prefetch the same
-		 * page we are just about to request for real.
-		 *
-		 * XXX: It's a layering violation that we do these checks above
-		 * tableam, they should probably moved below it at some point.
-		 */
-		BitmapPrefetch(node, scan);
-
-		/*
-		 * Attempt to fetch tuple from AM.
-		 */
-		if (!table_scan_bitmap_next_tuple(scan, slot))
-		{
-			/* nothing more to look at on this page */
-			node->tbmres = tbmres = NULL;
-			continue;
-		}
+			/*
+			 * We prefetch before fetching the current pages. We expect that a
+			 * future streaming read API will do this, so do it now for
+			 * consistency.
+			 */
+			BitmapPrefetch(node, scan);
 
-		/*
-		 * If we are using lossy info, we have to recheck the qual conditions
-		 * at every tuple.
-		 */
-		if (tbmres->recheck)
-		{
-			econtext->ecxt_scantuple = slot;
-			if (!ExecQualAndReset(node->bitmapqualorig, econtext))
+			/*
+			 * If we are using lossy info, we have to recheck the qual
+			 * conditions at every tuple.
+			 */
+			if (node->recheck)
 			{
-				/* Fails recheck, so drop it and loop back for another */
-				InstrCountFiltered2(node, 1);
-				ExecClearTuple(slot);
-				continue;
+				econtext->ecxt_scantuple = slot;
+				if (!ExecQualAndReset(node->bitmapqualorig, econtext))
+				{
+					/* Fails recheck, so drop it and loop back for another */
+					InstrCountFiltered2(node, 1);
+					ExecClearTuple(slot);
+					continue;
+				}
 			}
+
+			/* OK to return this tuple */
+			return slot;
 		}
 
-		/* OK to return this tuple */
-		return slot;
+		if (!table_scan_bitmap_next_block(scan, &node->recheck, &node->blockno))
+			break;
+
+		BitmapAdjustPrefetchIterator(node, node->blockno);
+		/* Adjust the prefetch target */
+		BitmapAdjustPrefetchTarget(node);
 	}
 
 	/*
 	 * if we get here it means we are at the end of the scan..
 	 */
+exit:
 	BitmapAccumCounters(node, scan);
 	return ExecClearTuple(slot);
 }
@@ -594,12 +560,8 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
 		table_rescan(node->ss.ss_currentScanDesc, NULL);
 
 	/* release bitmaps and buffers if any */
-	if (node->tbmiterator)
-		tbm_end_iterate(node->tbmiterator);
 	if (node->prefetch_iterator)
 		tbm_end_iterate(node->prefetch_iterator);
-	if (node->shared_tbmiterator)
-		tbm_end_shared_iterate(node->shared_tbmiterator);
 	if (node->shared_prefetch_iterator)
 		tbm_end_shared_iterate(node->shared_prefetch_iterator);
 	if (node->tbm)
@@ -607,13 +569,12 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
 	if (node->pvmbuffer != InvalidBuffer)
 		ReleaseBuffer(node->pvmbuffer);
 	node->tbm = NULL;
-	node->tbmiterator = NULL;
-	node->tbmres = NULL;
 	node->prefetch_iterator = NULL;
 	node->initialized = false;
-	node->shared_tbmiterator = NULL;
 	node->shared_prefetch_iterator = NULL;
 	node->pvmbuffer = InvalidBuffer;
+	node->recheck = true;
+	node->blockno = InvalidBlockNumber;
 
 	ExecScanReScan(&node->ss);
 
@@ -647,14 +608,10 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node)
 	/*
 	 * release bitmaps and buffers if any
 	 */
-	if (node->tbmiterator)
-		tbm_end_iterate(node->tbmiterator);
 	if (node->prefetch_iterator)
 		tbm_end_iterate(node->prefetch_iterator);
 	if (node->tbm)
 		tbm_free(node->tbm);
-	if (node->shared_tbmiterator)
-		tbm_end_shared_iterate(node->shared_tbmiterator);
 	if (node->shared_prefetch_iterator)
 		tbm_end_shared_iterate(node->shared_prefetch_iterator);
 	if (node->pvmbuffer != InvalidBuffer)
@@ -697,8 +654,6 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
 	scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan;
 
 	scanstate->tbm = NULL;
-	scanstate->tbmiterator = NULL;
-	scanstate->tbmres = NULL;
 	scanstate->pvmbuffer = InvalidBuffer;
 	scanstate->exact_pages = 0;
 	scanstate->lossy_pages = 0;
@@ -707,10 +662,11 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
 	scanstate->prefetch_target = 0;
 	scanstate->pscan_len = 0;
 	scanstate->initialized = false;
-	scanstate->shared_tbmiterator = NULL;
 	scanstate->shared_prefetch_iterator = NULL;
 	scanstate->pstate = NULL;
 	scanstate->worker_snapshot = NULL;
+	scanstate->recheck = true;
+	scanstate->blockno = InvalidBlockNumber;
 
 	/*
 	 * Miscellaneous initialization
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index b74e08dd745..bf7ee044268 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -16,6 +16,7 @@
 
 #include "access/htup_details.h"
 #include "access/itup.h"
+#include "nodes/tidbitmap.h"
 #include "port/atomics.h"
 #include "storage/buf.h"
 #include "storage/spin.h"
@@ -41,6 +42,8 @@ typedef struct TableScanDescData
 	ItemPointerData rs_maxtid;
 
 	/* Only used for Bitmap table scans */
+	TBMIterator *tbmiterator;
+	TBMSharedIterator *shared_tbmiterator;
 	long		exact_pages;
 	long		lossy_pages;
 
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 05e700c5055..b90d9b7f3fa 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -798,7 +798,7 @@ typedef struct TableAmRoutine
 	 * scan_bitmap_next_tuple need to exist, or neither.
 	 */
 	bool		(*scan_bitmap_next_block) (TableScanDesc scan,
-										   struct TBMIterateResult *tbmres);
+										   bool *recheck, BlockNumber *blockno);
 
 	/*
 	 * Fetch the next tuple of a bitmap table scan into `slot` and return true
@@ -1942,17 +1942,16 @@ table_relation_estimate_size(Relation rel, int32 *attr_widths,
  */
 
 /*
- * Prepare to fetch / check / return tuples from `tbmres->blockno` as part of
- * a bitmap table scan. `scan` needs to have been started via
- * table_beginscan_bm(). Returns false if there are no tuples to be found on
- * the page, true otherwise.
+ * Prepare to fetch / check / return tuples as part of a bitmap table scan.
+ * `scan` needs to have been started via table_beginscan_bm(). Returns false if
+ * there are no more blocks in the bitmap, true otherwise.
  *
  * Note, this is an optionally implemented function, therefore should only be
  * used after verifying the presence (at plan time or such).
  */
 static inline bool
 table_scan_bitmap_next_block(TableScanDesc scan,
-							 struct TBMIterateResult *tbmres)
+							 bool *recheck, BlockNumber *blockno)
 {
 	/*
 	 * We don't expect direct calls to table_scan_bitmap_next_block with valid
@@ -1962,8 +1961,7 @@ table_scan_bitmap_next_block(TableScanDesc scan,
 	if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
 		elog(ERROR, "unexpected table_scan_bitmap_next_block call during logical decoding");
 
-	return scan->rs_rd->rd_tableam->scan_bitmap_next_block(scan,
-														   tbmres);
+	return scan->rs_rd->rd_tableam->scan_bitmap_next_block(scan, recheck, blockno);
 }
 
 /*
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 9392923eb32..03973a3f262 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1709,9 +1709,7 @@ typedef struct ParallelBitmapHeapState
  *
  *		bitmapqualorig	   execution state for bitmapqualorig expressions
  *		tbm				   bitmap obtained from child index scan(s)
- *		tbmiterator		   iterator for scanning current pages
- *		tbmres			   current-page data
- *		pvmbuffer		   ditto, for prefetched pages
+ *		pvmbuffer		   buffer for visibility-map lookups of prefetched pages
  *		exact_pages		   total number of exact pages retrieved
  *		lossy_pages		   total number of lossy pages retrieved
  *		prefetch_iterator  iterator for prefetching ahead of current page
@@ -1720,7 +1718,6 @@ typedef struct ParallelBitmapHeapState
  *		prefetch_maximum   maximum value for prefetch_target
  *		pscan_len		   size of the shared memory for parallel bitmap
  *		initialized		   is node is ready to iterate
- *		shared_tbmiterator	   shared iterator
  *		shared_prefetch_iterator shared iterator for prefetching
  *		pstate			   shared state for parallel bitmap scan
  *		worker_snapshot	   snapshot for parallel worker
@@ -1731,8 +1728,6 @@ typedef struct BitmapHeapScanState
 	ScanState	ss;				/* its first field is NodeTag */
 	ExprState  *bitmapqualorig;
 	TIDBitmap  *tbm;
-	TBMIterator *tbmiterator;
-	TBMIterateResult *tbmres;
 	Buffer		pvmbuffer;
 	long		exact_pages;
 	long		lossy_pages;
@@ -1742,10 +1737,11 @@ typedef struct BitmapHeapScanState
 	int			prefetch_maximum;
 	Size		pscan_len;
 	bool		initialized;
-	TBMSharedIterator *shared_tbmiterator;
 	TBMSharedIterator *shared_prefetch_iterator;
 	ParallelBitmapHeapState *pstate;
 	Snapshot	worker_snapshot;
+	bool		recheck;
+	BlockNumber blockno;
 } BitmapHeapScanState;
 
 /* ----------------
-- 
2.37.2

