From aac60985d6bc70bfedf77a77ee3c512da87bfcb1 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 13 Feb 2024 14:27:57 -0500
Subject: [PATCH v1 11/11] BitmapHeapScan uses streaming read API

Remove all of the prefetching code from BitmapHeapScan and rely on the
prefetching done by the streaming read API. The heap table AM
implements a streaming read callback which uses the bitmap iterator to
get the next valid block that the streaming read API needs to fetch.
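
The tbm_iterate() and tbm_shared_iterate() interfaces now fill in a
caller-provided TBMIterateResult instead of returning a pointer into
storage owned by the iterator, so that the streaming read API's
per-buffer data can serve as the result storage. A minimal sketch of
the new calling convention, using only names introduced by this patch
(error handling omitted):

    TBMIterateResult *tbmres = palloc0(TBM_ITERATE_RESULT_SIZE);

    for (;;)
    {
        tbm_iterate(iterator, tbmres);

        /* blockno is InvalidBlockNumber once the bitmap is exhausted */
        if (!BlockNumberIsValid(tbmres->blockno))
            break;

        /* consume tbmres->blockno, tbmres->ntuples, tbmres->offsets */
    }

    tbm_end_iterate(iterator);
    pfree(tbmres);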
---
 src/backend/access/gin/ginget.c           |  15 +-
 src/backend/access/gin/ginscan.c          |   7 +
 src/backend/access/heap/heapam.c          |  71 +++++
 src/backend/access/heap/heapam_handler.c  |  78 +++--
 src/backend/executor/nodeBitmapHeapscan.c | 328 +---------------------
 src/backend/nodes/tidbitmap.c             |  80 +++---
 src/include/access/heapam.h               |   2 +
 src/include/access/tableam.h              |  14 +-
 src/include/nodes/execnodes.h             |  19 --
 src/include/nodes/tidbitmap.h             |   8 +-
 10 files changed, 178 insertions(+), 444 deletions(-)
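
Notes for reviewers (ignored by git-am, as they follow the diffstat):

The heap AM's streaming read callback follows the contract sketched
below: return the next block number to read, or InvalidBlockNumber once
the bitmap is exhausted, and stash the TBMIterateResult in the
per-buffer data so heapam_scan_bitmap_next_block() gets it back along
with the buffer. This outline is illustrative only -- the name
example_pgsr_next is hypothetical; the real callback is
bitmapheap_pgsr_next_single() in heapam.c below.

    static BlockNumber
    example_pgsr_next(PgStreamingRead *pgsr, void *pgsr_private,
                      void *per_buffer_data)
    {
        TBMIterateResult *tbmres = per_buffer_data;
        HeapScanDesc hdesc = (HeapScanDesc) pgsr_private;

        tbm_iterate(hdesc->rs_base.tbmiterator, tbmres);

        /* returning InvalidBlockNumber ends the streaming read */
        if (!BlockNumberIsValid(tbmres->blockno))
            return InvalidBlockNumber;

        return tbmres->blockno;
    }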

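The callback is also where skip-fetch accounting happens. This excerpt
from bitmapheap_pgsr_next_single() shows the mechanism: an all-visible
page whose bitmap entries need no recheck is never read, and its tuple
count is accumulated so that the scan can emit that many all-NULL
tuples once the bitmap is exhausted.

    if (hdesc->rs_base.rs_flags & SO_CAN_SKIP_FETCH &&
        !tbmres->recheck &&
        VM_ALL_VISIBLE(hdesc->rs_base.rs_rd, tbmres->blockno,
                       &hdesc->vmbuffer))
    {
        /* remember how many all-NULL tuples to emit at end of scan */
        hdesc->empty_tuples += tbmres->ntuples;
        continue;           /* advance to the next bitmap entry */
    }
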
diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c
index 0b4f2ebadb6..3ce28078a6f 100644
--- a/src/backend/access/gin/ginget.c
+++ b/src/backend/access/gin/ginget.c
@@ -373,7 +373,10 @@ restartScanEntry:
 			if (entry->matchBitmap)
 			{
 				if (entry->matchIterator)
+				{
 					tbm_end_iterate(entry->matchIterator);
+					pfree(entry->matchResult);
+				}
 				entry->matchIterator = NULL;
 				tbm_free(entry->matchBitmap);
 				entry->matchBitmap = NULL;
@@ -386,6 +389,7 @@ restartScanEntry:
 		if (entry->matchBitmap && !tbm_is_empty(entry->matchBitmap))
 		{
 			entry->matchIterator = tbm_begin_iterate(entry->matchBitmap);
+			entry->matchResult = palloc0(TBM_ITERATE_RESULT_SIZE);
 			entry->isFinished = false;
 		}
 	}
@@ -823,21 +827,24 @@ entryGetItem(GinState *ginstate, GinScanEntry entry,
 		{
 			/*
 			 * If we've exhausted all items on this block, move to next block
-			 * in the bitmap.
+			 * in the bitmap. tbm_iterate() sets matchResult->blockno to
+			 * InvalidBlockNumber when the bitmap is exhausted.
 			 */
-			while (entry->matchResult == NULL ||
+			while ((!BlockNumberIsValid(entry->matchResult->blockno)) ||
 				   (entry->matchResult->ntuples >= 0 &&
 					entry->offset >= entry->matchResult->ntuples) ||
 				   entry->matchResult->blockno < advancePastBlk ||
 				   (ItemPointerIsLossyPage(&advancePast) &&
 					entry->matchResult->blockno == advancePastBlk))
 			{
-				entry->matchResult = tbm_iterate(entry->matchIterator);
 
-				if (entry->matchResult == NULL)
+				tbm_iterate(entry->matchIterator, entry->matchResult);
+				if (!BlockNumberIsValid(entry->matchResult->blockno))
 				{
 					ItemPointerSetInvalid(&entry->curItem);
 					tbm_end_iterate(entry->matchIterator);
+					pfree(entry->matchResult);
+					entry->matchResult = NULL;
 					entry->matchIterator = NULL;
 					entry->isFinished = true;
 					break;
diff --git a/src/backend/access/gin/ginscan.c b/src/backend/access/gin/ginscan.c
index af24d38544e..be27f9fe07e 100644
--- a/src/backend/access/gin/ginscan.c
+++ b/src/backend/access/gin/ginscan.c
@@ -246,7 +246,14 @@ ginFreeScanKeys(GinScanOpaque so)
 		if (entry->list)
 			pfree(entry->list);
 		if (entry->matchIterator)
+		{
 			tbm_end_iterate(entry->matchIterator);
+			if (entry->matchResult)
+			{
+				pfree(entry->matchResult);
+				entry->matchResult = NULL;
+			}
+		}
 		if (entry->matchBitmap)
 			tbm_free(entry->matchBitmap);
 	}
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index d8569373987..86484c6c72a 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -115,6 +115,8 @@ static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
 static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required,
 										bool *copy);
 
+static BlockNumber bitmapheap_pgsr_next_single(PgStreamingRead *pgsr, void *pgsr_private,
+							void *per_buffer_data);
 
 /*
  * Each tuple lock mode has a corresponding heavyweight lock, and one or two
@@ -335,6 +337,22 @@ initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
 	if (key != NULL && scan->rs_base.rs_nkeys > 0)
 		memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
 
+	if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN)
+	{
+		if (scan->pgsr)
+			pg_streaming_read_free(scan->pgsr);
+
+		scan->pgsr = pg_streaming_read_buffer_alloc(PGSR_FLAG_DEFAULT,
+													scan,
+													TBM_ITERATE_RESULT_SIZE,
+													scan->rs_strategy,
+													BMR_REL(scan->rs_base.rs_rd),
+													MAIN_FORKNUM,
+													bitmapheap_pgsr_next_single);
+
+
+	}
+
 	/*
 	 * Currently, we only have a stats counter for sequential heap scans (but
 	 * e.g for bitmap scans the underlying bitmap index scans will be counted,
@@ -955,6 +973,7 @@ heap_beginscan(Relation relation, Snapshot snapshot,
 	scan->rs_base.rs_flags = flags;
 	scan->rs_base.rs_parallel = parallel_scan;
 	scan->rs_strategy = NULL;	/* set in initscan */
+	scan->pgsr = NULL;
 	scan->vmbuffer = InvalidBuffer;
 	scan->empty_tuples = 0;
 	scan->rs_base.lossy_pages = 0;
@@ -1113,6 +1132,13 @@ heap_endscan(TableScanDesc sscan)
 	if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
 		UnregisterSnapshot(scan->rs_base.rs_snapshot);
 
+	if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN)
+	{
+		if (scan->pgsr)
+			pg_streaming_read_free(scan->pgsr);
+		scan->pgsr = NULL;
+	}
+
 	pfree(scan);
 }
 
@@ -10270,3 +10296,48 @@ HeapCheckForSerializableConflictOut(bool visible, Relation relation,
 
 	CheckForSerializableConflictOut(relation, xid, snapshot);
 }
+
+static BlockNumber
+bitmapheap_pgsr_next_single(PgStreamingRead *pgsr, void *pgsr_private,
+							void *per_buffer_data)
+{
+	TBMIterateResult *tbmres = per_buffer_data;
+	HeapScanDesc hdesc = (HeapScanDesc) pgsr_private;
+
+	for (;;)
+	{
+		if (hdesc->rs_base.shared_tbmiterator)
+			tbm_shared_iterate(hdesc->rs_base.shared_tbmiterator, tbmres);
+		else
+			tbm_iterate(hdesc->rs_base.tbmiterator, tbmres);
+
+		/* no more entries in the bitmap */
+		if (!BlockNumberIsValid(tbmres->blockno))
+			return InvalidBlockNumber;
+
+		/*
+		 * Ignore any claimed entries past what we think is the end of the
+		 * relation. It may have been extended after the start of our scan (we
+		 * only hold an AccessShareLock, and it could be inserts from this
+		 * backend).  We don't take this optimization in SERIALIZABLE
+		 * isolation though, as we need to examine all invisible tuples
+		 * reachable by the index.
+		 */
+		if (!IsolationIsSerializable() && tbmres->blockno >= hdesc->rs_nblocks)
+			continue;
+
+
+		if (hdesc->rs_base.rs_flags & SO_CAN_SKIP_FETCH &&
+			!tbmres->recheck &&
+			VM_ALL_VISIBLE(hdesc->rs_base.rs_rd, tbmres->blockno, &hdesc->vmbuffer))
+		{
+			hdesc->empty_tuples += tbmres->ntuples;
+			continue;
+		}
+
+		return tbmres->blockno;
+	}
+
+	/* not reachable */
+	Assert(false);
+}
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index d55ece23a35..0cd586cd4b8 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -2113,77 +2113,65 @@ heapam_estimate_rel_size(Relation rel, int32 *attr_widths,
  */
 
 static bool
-heapam_scan_bitmap_next_block(TableScanDesc scan,
-							  bool *recheck, BlockNumber *blockno)
+heapam_scan_bitmap_next_block(TableScanDesc scan, bool *recheck)
 {
 	HeapScanDesc hscan = (HeapScanDesc) scan;
+	void	   *io_private;
 	BlockNumber block;
 	Buffer		buffer;
 	Snapshot	snapshot;
 	int			ntup;
 	TBMIterateResult *tbmres;
 
+	Assert(hscan->pgsr);
+
 	hscan->rs_cindex = 0;
 	hscan->rs_ntuples = 0;
 
-	*blockno = InvalidBlockNumber;
 	*recheck = true;
 
-	do
+	/* Release buffer containing previous block. */
+	if (BufferIsValid(hscan->rs_cbuf))
 	{
-		if (scan->shared_tbmiterator)
-			tbmres = tbm_shared_iterate(scan->shared_tbmiterator);
-		else
-			tbmres = tbm_iterate(scan->tbmiterator);
+		ReleaseBuffer(hscan->rs_cbuf);
+		hscan->rs_cbuf = InvalidBuffer;
+	}
+
+	hscan->rs_cbuf = pg_streaming_read_buffer_get_next(hscan->pgsr, &io_private);
 
-		if (tbmres == NULL)
+	if (BufferIsInvalid(hscan->rs_cbuf))
+	{
+		if (BufferIsValid(hscan->vmbuffer))
 		{
-			/* no more entries in the bitmap */
-			Assert(hscan->empty_tuples == 0);
-			return false;
+			ReleaseBuffer(hscan->vmbuffer);
+			hscan->vmbuffer = InvalidBuffer;
 		}
 
 		/*
-		 * Ignore any claimed entries past what we think is the end of the
-		 * relation. It may have been extended after the start of our scan (we
-		 * only hold an AccessShareLock, and it could be inserts from this
-		 * backend).  We don't take this optimization in SERIALIZABLE
-		 * isolation though, as we need to examine all invisible tuples
-		 * reachable by the index.
+		 * The bitmap is exhausted, so emit any accumulated empty tuples
+		 * now. We emit them all at the end of the scan, rather than per
+		 * skipped block, because the streaming read API only returns
+		 * TBMIterateResults for blocks that were actually fetched. For
+		 * each block we skipped, we counted how many all-NULL tuples to
+		 * emit once the bitmap is exhausted; these emitted tuples are
+		 * never rechecked.
 		 */
-	} while (!IsolationIsSerializable() && tbmres->blockno >= hscan->rs_nblocks);
+		*recheck = false;
+		return hscan->empty_tuples > 0;
+	}
 
-	/* Got a valid block */
-	*blockno = tbmres->blockno;
-	*recheck = tbmres->recheck;
+	Assert(io_private);
 
-	/*
-	 * We can skip fetching the heap page if we don't need any fields from the
-	 * heap, and the bitmap entries don't need rechecking, and all tuples on
-	 * the page are visible to our transaction.
-	 */
-	if (scan->rs_flags & SO_CAN_SKIP_FETCH &&
-		!tbmres->recheck &&
-		VM_ALL_VISIBLE(scan->rs_rd, tbmres->blockno, &hscan->vmbuffer))
-	{
-		/* can't be lossy in the skip_fetch case */
-		Assert(tbmres->ntuples >= 0);
-		Assert(hscan->empty_tuples >= 0);
+	tbmres = (TBMIterateResult *) io_private;
 
-		hscan->empty_tuples += tbmres->ntuples;
+	Assert(BufferGetBlockNumber(hscan->rs_cbuf) == tbmres->blockno);
 
-		return true;
-	}
+	*recheck = tbmres->recheck;
 
-	block = tbmres->blockno;
+	hscan->rs_cblock = tbmres->blockno;
+	hscan->rs_ntuples = tbmres->ntuples;
 
-	/*
-	 * Acquire pin on the target heap page, trading in any pin we held before.
-	 */
-	hscan->rs_cbuf = ReleaseAndReadBuffer(hscan->rs_cbuf,
-										  scan->rs_rd,
-										  block);
-	hscan->rs_cblock = block;
+	block = tbmres->blockno;
 	buffer = hscan->rs_cbuf;
 	snapshot = scan->rs_snapshot;
 
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c
index efc6952e353..8b7f87a4779 100644
--- a/src/backend/executor/nodeBitmapHeapscan.c
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -56,11 +56,6 @@ static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
 static inline void BitmapAccumCounters(BitmapHeapScanState *node,
 									   TableScanDesc scan);
 static inline void BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate);
-static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
-												BlockNumber blockno);
-static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node);
-static inline void BitmapPrefetch(BitmapHeapScanState *node,
-								  TableScanDesc scan);
 static bool BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate);
 
 
@@ -124,15 +119,6 @@ BitmapHeapNext(BitmapHeapScanState *node)
 
 			node->tbm = tbm;
 			tbmiterator = tbm_begin_iterate(tbm);
-
-#ifdef USE_PREFETCH
-			if (node->prefetch_maximum > 0)
-			{
-				node->prefetch_iterator = tbm_begin_iterate(tbm);
-				node->prefetch_pages = 0;
-				node->prefetch_target = -1;
-			}
-#endif							/* USE_PREFETCH */
 		}
 		else
 		{
@@ -155,20 +141,6 @@ BitmapHeapNext(BitmapHeapScanState *node)
 				 * multiple processes to iterate jointly.
 				 */
 				pstate->tbmiterator = tbm_prepare_shared_iterate(tbm);
-#ifdef USE_PREFETCH
-				if (node->prefetch_maximum > 0)
-				{
-					pstate->prefetch_iterator =
-						tbm_prepare_shared_iterate(tbm);
-
-					/*
-					 * We don't need the mutex here as we haven't yet woke up
-					 * others.
-					 */
-					pstate->prefetch_pages = 0;
-					pstate->prefetch_target = -1;
-				}
-#endif
 
 				/* We have initialized the shared state so wake up others. */
 				BitmapDoneInitializingSharedState(pstate);
@@ -176,14 +148,6 @@ BitmapHeapNext(BitmapHeapScanState *node)
 
 			/* Allocate a private iterator and attach the shared state to it */
 			shared_tbmiterator = tbm_attach_shared_iterate(dsa, pstate->tbmiterator);
-
-#ifdef USE_PREFETCH
-			if (node->prefetch_maximum > 0)
-			{
-				node->shared_prefetch_iterator =
-					tbm_attach_shared_iterate(dsa, pstate->prefetch_iterator);
-			}
-#endif							/* USE_PREFETCH */
 		}
 
 		if (!scan)
@@ -203,46 +167,16 @@ BitmapHeapNext(BitmapHeapScanState *node)
 		node->initialized = true;
 
 		/* Get the first block. if none, end of scan */
-		if (!table_scan_bitmap_next_block(scan, &node->recheck, &node->blockno))
+		if (!table_scan_bitmap_next_block(scan, &node->recheck))
 			goto exit;
-		BitmapAdjustPrefetchIterator(node, node->blockno);
-		BitmapAdjustPrefetchTarget(node);
 	}
 
-	for (;;)
+	do
 	{
 		while (table_scan_bitmap_next_tuple(scan, slot))
 		{
 			CHECK_FOR_INTERRUPTS();
 
-#ifdef USE_PREFETCH
-
-			/*
-			 * Try to prefetch at least a few pages even before we get to the
-			 * second page if we don't stop reading after the first tuple.
-			 */
-			if (!pstate)
-			{
-				if (node->prefetch_target < node->prefetch_maximum)
-					node->prefetch_target++;
-			}
-			else if (pstate->prefetch_target < node->prefetch_maximum)
-			{
-				/* take spinlock while updating shared state */
-				SpinLockAcquire(&pstate->mutex);
-				if (pstate->prefetch_target < node->prefetch_maximum)
-					pstate->prefetch_target++;
-				SpinLockRelease(&pstate->mutex);
-			}
-#endif							/* USE_PREFETCH */
-
-			/*
-			 * We prefetch before fetching the current pages. We expect that a
-			 * future streaming read API will do this, so do it now for
-			 * consistency.
-			 */
-			BitmapPrefetch(node, scan);
-
 			/*
 			 * If we are using lossy info, we have to recheck the qual
 			 * conditions at every tuple.
@@ -263,13 +197,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
 			return slot;
 		}
 
-		if (!table_scan_bitmap_next_block(scan, &node->recheck, &node->blockno))
-			break;
-
-		BitmapAdjustPrefetchIterator(node, node->blockno);
-		/* Adjust the prefetch target */
-		BitmapAdjustPrefetchTarget(node);
-	}
+	} while (table_scan_bitmap_next_block(scan, &node->recheck));
 
 	/*
 	 * if we get here it means we are at the end of the scan..
@@ -304,215 +232,6 @@ BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate)
 	ConditionVariableBroadcast(&pstate->cv);
 }
 
-/*
- *	BitmapAdjustPrefetchIterator - Adjust the prefetch iterator
- */
-static inline void
-BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
-							 BlockNumber blockno)
-{
-#ifdef USE_PREFETCH
-	ParallelBitmapHeapState *pstate = node->pstate;
-
-	if (pstate == NULL)
-	{
-		TBMIterator *prefetch_iterator = node->prefetch_iterator;
-
-		if (node->prefetch_pages > 0)
-		{
-			/* The main iterator has closed the distance by one page */
-			node->prefetch_pages--;
-		}
-		else if (prefetch_iterator)
-		{
-			/* Do not let the prefetch iterator get behind the main one */
-			TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
-
-			if (tbmpre == NULL || tbmpre->blockno != blockno)
-				elog(ERROR, "prefetch and main iterators are out of sync");
-		}
-		return;
-	}
-
-	if (node->prefetch_maximum > 0)
-	{
-		TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;
-
-		SpinLockAcquire(&pstate->mutex);
-		if (pstate->prefetch_pages > 0)
-		{
-			pstate->prefetch_pages--;
-			SpinLockRelease(&pstate->mutex);
-		}
-		else
-		{
-			/* Release the mutex before iterating */
-			SpinLockRelease(&pstate->mutex);
-
-			/*
-			 * In case of shared mode, we can not ensure that the current
-			 * blockno of the main iterator and that of the prefetch iterator
-			 * are same.  It's possible that whatever blockno we are
-			 * prefetching will be processed by another process.  Therefore,
-			 * we don't validate the blockno here as we do in non-parallel
-			 * case.
-			 */
-			if (prefetch_iterator)
-				tbm_shared_iterate(prefetch_iterator);
-		}
-	}
-#endif							/* USE_PREFETCH */
-}
-
-/*
- * BitmapAdjustPrefetchTarget - Adjust the prefetch target
- *
- * Increase prefetch target if it's not yet at the max.  Note that
- * we will increase it to zero after fetching the very first
- * page/tuple, then to one after the second tuple is fetched, then
- * it doubles as later pages are fetched.
- */
-static inline void
-BitmapAdjustPrefetchTarget(BitmapHeapScanState *node)
-{
-#ifdef USE_PREFETCH
-	ParallelBitmapHeapState *pstate = node->pstate;
-
-	if (pstate == NULL)
-	{
-		if (node->prefetch_target >= node->prefetch_maximum)
-			 /* don't increase any further */ ;
-		else if (node->prefetch_target >= node->prefetch_maximum / 2)
-			node->prefetch_target = node->prefetch_maximum;
-		else if (node->prefetch_target > 0)
-			node->prefetch_target *= 2;
-		else
-			node->prefetch_target++;
-		return;
-	}
-
-	/* Do an unlocked check first to save spinlock acquisitions. */
-	if (pstate->prefetch_target < node->prefetch_maximum)
-	{
-		SpinLockAcquire(&pstate->mutex);
-		if (pstate->prefetch_target >= node->prefetch_maximum)
-			 /* don't increase any further */ ;
-		else if (pstate->prefetch_target >= node->prefetch_maximum / 2)
-			pstate->prefetch_target = node->prefetch_maximum;
-		else if (pstate->prefetch_target > 0)
-			pstate->prefetch_target *= 2;
-		else
-			pstate->prefetch_target++;
-		SpinLockRelease(&pstate->mutex);
-	}
-#endif							/* USE_PREFETCH */
-}
-
-/*
- * BitmapPrefetch - Prefetch, if prefetch_pages are behind prefetch_target
- */
-static inline void
-BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan)
-{
-#ifdef USE_PREFETCH
-	ParallelBitmapHeapState *pstate = node->pstate;
-
-	if (pstate == NULL)
-	{
-		TBMIterator *prefetch_iterator = node->prefetch_iterator;
-
-		if (prefetch_iterator)
-		{
-			while (node->prefetch_pages < node->prefetch_target)
-			{
-				TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
-				bool		skip_fetch;
-
-				if (tbmpre == NULL)
-				{
-					/* No more pages to prefetch */
-					tbm_end_iterate(prefetch_iterator);
-					node->prefetch_iterator = NULL;
-					break;
-				}
-				node->prefetch_pages++;
-
-				/*
-				 * If we expect not to have to actually read this heap page,
-				 * skip this prefetch call, but continue to run the prefetch
-				 * logic normally.  (Would it be better not to increment
-				 * prefetch_pages?)
-				 *
-				 * This depends on the assumption that the index AM will
-				 * report the same recheck flag for this future heap page as
-				 * it did for the current heap page; which is not a certainty
-				 * but is true in many cases.
-				 */
-
-				skip_fetch = (scan->rs_flags & SO_CAN_SKIP_FETCH &&
-							  !tbmpre->recheck &&
-							  VM_ALL_VISIBLE(node->ss.ss_currentRelation,
-											 tbmpre->blockno,
-											 &node->pvmbuffer));
-
-				if (!skip_fetch)
-					PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
-			}
-		}
-
-		return;
-	}
-
-	if (pstate->prefetch_pages < pstate->prefetch_target)
-	{
-		TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;
-
-		if (prefetch_iterator)
-		{
-			while (1)
-			{
-				TBMIterateResult *tbmpre;
-				bool		do_prefetch = false;
-				bool		skip_fetch;
-
-				/*
-				 * Recheck under the mutex. If some other process has already
-				 * done enough prefetching then we need not to do anything.
-				 */
-				SpinLockAcquire(&pstate->mutex);
-				if (pstate->prefetch_pages < pstate->prefetch_target)
-				{
-					pstate->prefetch_pages++;
-					do_prefetch = true;
-				}
-				SpinLockRelease(&pstate->mutex);
-
-				if (!do_prefetch)
-					return;
-
-				tbmpre = tbm_shared_iterate(prefetch_iterator);
-				if (tbmpre == NULL)
-				{
-					/* No more pages to prefetch */
-					tbm_end_shared_iterate(prefetch_iterator);
-					node->shared_prefetch_iterator = NULL;
-					break;
-				}
-
-				/* As above, skip prefetch if we expect not to need page */
-				skip_fetch = (scan->rs_flags & SO_CAN_SKIP_FETCH &&
-							  !tbmpre->recheck &&
-							  VM_ALL_VISIBLE(node->ss.ss_currentRelation,
-											 tbmpre->blockno,
-											 &node->pvmbuffer));
-
-				if (!skip_fetch)
-					PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
-			}
-		}
-	}
-#endif							/* USE_PREFETCH */
-}
 
 /*
  * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
@@ -559,22 +278,12 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
 	if (node->ss.ss_currentScanDesc)
 		table_rescan(node->ss.ss_currentScanDesc, NULL);
 
-	/* release bitmaps and buffers if any */
-	if (node->prefetch_iterator)
-		tbm_end_iterate(node->prefetch_iterator);
-	if (node->shared_prefetch_iterator)
-		tbm_end_shared_iterate(node->shared_prefetch_iterator);
+	/* release bitmaps if any */
 	if (node->tbm)
 		tbm_free(node->tbm);
-	if (node->pvmbuffer != InvalidBuffer)
-		ReleaseBuffer(node->pvmbuffer);
 	node->tbm = NULL;
-	node->prefetch_iterator = NULL;
 	node->initialized = false;
-	node->shared_prefetch_iterator = NULL;
-	node->pvmbuffer = InvalidBuffer;
 	node->recheck = true;
-	node->blockno = InvalidBlockNumber;
 
 	ExecScanReScan(&node->ss);
 
@@ -606,16 +315,10 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node)
 	ExecEndNode(outerPlanState(node));
 
 	/*
-	 * release bitmaps and buffers if any
+	 * release bitmaps if any
 	 */
-	if (node->prefetch_iterator)
-		tbm_end_iterate(node->prefetch_iterator);
 	if (node->tbm)
 		tbm_free(node->tbm);
-	if (node->shared_prefetch_iterator)
-		tbm_end_shared_iterate(node->shared_prefetch_iterator);
-	if (node->pvmbuffer != InvalidBuffer)
-		ReleaseBuffer(node->pvmbuffer);
 
 	/*
 	 * close heap scan
@@ -654,19 +357,13 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
 	scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan;
 
 	scanstate->tbm = NULL;
-	scanstate->pvmbuffer = InvalidBuffer;
 	scanstate->exact_pages = 0;
 	scanstate->lossy_pages = 0;
-	scanstate->prefetch_iterator = NULL;
-	scanstate->prefetch_pages = 0;
-	scanstate->prefetch_target = 0;
 	scanstate->pscan_len = 0;
 	scanstate->initialized = false;
-	scanstate->shared_prefetch_iterator = NULL;
 	scanstate->pstate = NULL;
 	scanstate->worker_snapshot = NULL;
 	scanstate->recheck = true;
-	scanstate->blockno = InvalidBlockNumber;
 
 	/*
 	 * Miscellaneous initialization
@@ -706,13 +403,6 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
 	scanstate->bitmapqualorig =
 		ExecInitQual(node->bitmapqualorig, (PlanState *) scanstate);
 
-	/*
-	 * Maximum number of prefetches for the tablespace if configured,
-	 * otherwise the current value of the effective_io_concurrency GUC.
-	 */
-	scanstate->prefetch_maximum =
-		get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace);
-
 	scanstate->ss.ss_currentRelation = currentRelation;
 
 	/*
@@ -796,14 +486,10 @@ ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node,
 		return;
 
 	pstate = shm_toc_allocate(pcxt->toc, node->pscan_len);
-
 	pstate->tbmiterator = 0;
-	pstate->prefetch_iterator = 0;
 
 	/* Initialize the mutex */
 	SpinLockInit(&pstate->mutex);
-	pstate->prefetch_pages = 0;
-	pstate->prefetch_target = 0;
 	pstate->state = BM_INITIAL;
 
 	ConditionVariableInit(&pstate->cv);
@@ -835,11 +521,7 @@ ExecBitmapHeapReInitializeDSM(BitmapHeapScanState *node,
 	if (DsaPointerIsValid(pstate->tbmiterator))
 		tbm_free_shared_area(dsa, pstate->tbmiterator);
 
-	if (DsaPointerIsValid(pstate->prefetch_iterator))
-		tbm_free_shared_area(dsa, pstate->prefetch_iterator);
-
 	pstate->tbmiterator = InvalidDsaPointer;
-	pstate->prefetch_iterator = InvalidDsaPointer;
 }
 
 /* ----------------------------------------------------------------
diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c
index 0f4850065fb..ccb511fb608 100644
--- a/src/backend/nodes/tidbitmap.c
+++ b/src/backend/nodes/tidbitmap.c
@@ -180,7 +180,6 @@ struct TBMIterator
 	int			spageptr;		/* next spages index */
 	int			schunkptr;		/* next schunks index */
 	int			schunkbit;		/* next bit to check in current schunk */
-	TBMIterateResult output;	/* MUST BE LAST (because variable-size) */
 };
 
 /*
@@ -221,7 +220,6 @@ struct TBMSharedIterator
 	PTEntryArray *ptbase;		/* pagetable element array */
 	PTIterationArray *ptpages;	/* sorted exact page index list */
 	PTIterationArray *ptchunks; /* sorted lossy page index list */
-	TBMIterateResult output;	/* MUST BE LAST (because variable-size) */
 };
 
 /* Local function prototypes */
@@ -695,8 +693,7 @@ tbm_begin_iterate(TIDBitmap *tbm)
 	 * Create the TBMIterator struct, with enough trailing space to serve the
 	 * needs of the TBMIterateResult sub-struct.
 	 */
-	iterator = (TBMIterator *) palloc(sizeof(TBMIterator) +
-									  MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
+	iterator = (TBMIterator *) palloc(sizeof(TBMIterator));
 	iterator->tbm = tbm;
 
 	/*
@@ -957,20 +954,21 @@ tbm_advance_schunkbit(PagetableEntry *chunk, int *schunkbitp)
 /*
  * tbm_iterate - scan through next page of a TIDBitmap
  *
- * Returns a TBMIterateResult representing one page, or NULL if there are
- * no more pages to scan.  Pages are guaranteed to be delivered in numerical
- * order.  If result->ntuples < 0, then the bitmap is "lossy" and failed to
- * remember the exact tuples to look at on this page --- the caller must
- * examine all tuples on the page and check if they meet the intended
- * condition.  If result->recheck is true, only the indicated tuples need
- * be examined, but the condition must be rechecked anyway.  (For ease of
- * testing, recheck is always set true when ntuples < 0.)
+ * Caller must pass in a TBMIterateResult to be filled.
+ *
+ * Pages are guaranteed to be delivered in numerical order.  tbmres->blockno is
+ * set to InvalidBlockNumber when there are no more pages to scan. If
+ * tbmres->ntuples < 0, then the bitmap is "lossy" and failed to remember the
+ * exact tuples to look at on this page --- the caller must examine all tuples
+ * on the page and check if they meet the intended condition.  If
+ * tbmres->recheck is true, only the indicated tuples need be examined, but the
+ * condition must be rechecked anyway.  (For ease of testing, recheck is always
+ * set true when ntuples < 0.)
  */
-TBMIterateResult *
-tbm_iterate(TBMIterator *iterator)
+void
+tbm_iterate(TBMIterator *iterator, TBMIterateResult *tbmres)
 {
 	TIDBitmap  *tbm = iterator->tbm;
-	TBMIterateResult *output = &(iterator->output);
 
 	Assert(tbm->iterating == TBM_ITERATING_PRIVATE);
 
@@ -998,6 +996,7 @@ tbm_iterate(TBMIterator *iterator)
 	 * If both chunk and per-page data remain, must output the numerically
 	 * earlier page.
 	 */
+	Assert(tbmres);
 	if (iterator->schunkptr < tbm->nchunks)
 	{
 		PagetableEntry *chunk = tbm->schunks[iterator->schunkptr];
@@ -1008,11 +1007,11 @@ tbm_iterate(TBMIterator *iterator)
 			chunk_blockno < tbm->spages[iterator->spageptr]->blockno)
 		{
 			/* Return a lossy page indicator from the chunk */
-			output->blockno = chunk_blockno;
-			output->ntuples = -1;
-			output->recheck = true;
+			tbmres->blockno = chunk_blockno;
+			tbmres->ntuples = -1;
+			tbmres->recheck = true;
 			iterator->schunkbit++;
-			return output;
+			return;
 		}
 	}
 
@@ -1028,18 +1027,20 @@ tbm_iterate(TBMIterator *iterator)
 			page = tbm->spages[iterator->spageptr];
 
 		/* scan bitmap to extract individual offset numbers */
-		ntuples = tbm_extract_page_tuple(page, output);
-		output->blockno = page->blockno;
-		output->ntuples = ntuples;
-		output->recheck = page->recheck;
+		ntuples = tbm_extract_page_tuple(page, tbmres);
+		tbmres->blockno = page->blockno;
+		tbmres->ntuples = ntuples;
+		tbmres->recheck = page->recheck;
 		iterator->spageptr++;
-		return output;
+		return;
 	}
 
 	/* Nothing more in the bitmap */
-	return NULL;
+	tbmres->blockno = InvalidBlockNumber;
+	return;
 }
 
+
 /*
  *	tbm_shared_iterate - scan through next page of a TIDBitmap
  *
@@ -1047,10 +1048,9 @@ tbm_iterate(TBMIterator *iterator)
  *	across multiple processes.  We need to acquire the iterator LWLock,
  *	before accessing the shared members.
  */
-TBMIterateResult *
-tbm_shared_iterate(TBMSharedIterator *iterator)
+void
+tbm_shared_iterate(TBMSharedIterator *iterator, TBMIterateResult *tbmres)
 {
-	TBMIterateResult *output = &iterator->output;
 	TBMSharedIteratorState *istate = iterator->state;
 	PagetableEntry *ptbase = NULL;
 	int		   *idxpages = NULL;
@@ -1101,13 +1101,13 @@ tbm_shared_iterate(TBMSharedIterator *iterator)
 			chunk_blockno < ptbase[idxpages[istate->spageptr]].blockno)
 		{
 			/* Return a lossy page indicator from the chunk */
-			output->blockno = chunk_blockno;
-			output->ntuples = -1;
-			output->recheck = true;
+			tbmres->blockno = chunk_blockno;
+			tbmres->ntuples = -1;
+			tbmres->recheck = true;
 			istate->schunkbit++;
 
 			LWLockRelease(&istate->lock);
-			return output;
+			return;
 		}
 	}
 
@@ -1117,21 +1117,22 @@ tbm_shared_iterate(TBMSharedIterator *iterator)
 		int			ntuples;
 
 		/* scan bitmap to extract individual offset numbers */
-		ntuples = tbm_extract_page_tuple(page, output);
-		output->blockno = page->blockno;
-		output->ntuples = ntuples;
-		output->recheck = page->recheck;
+		ntuples = tbm_extract_page_tuple(page, tbmres);
+		tbmres->blockno = page->blockno;
+		tbmres->ntuples = ntuples;
+		tbmres->recheck = page->recheck;
 		istate->spageptr++;
 
 		LWLockRelease(&istate->lock);
 
-		return output;
+		return;
 	}
 
 	LWLockRelease(&istate->lock);
 
 	/* Nothing more in the bitmap */
-	return NULL;
+	tbmres->blockno = InvalidBlockNumber;
+	return;
 }
 
 /*
@@ -1470,8 +1471,7 @@ tbm_attach_shared_iterate(dsa_area *dsa, dsa_pointer dp)
 	 * Create the TBMSharedIterator struct, with enough trailing space to
 	 * serve the needs of the TBMIterateResult sub-struct.
 	 */
-	iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator) +
-											 MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
+	iterator = (TBMSharedIterator *) palloc0(sizeof(TBMSharedIterator));
 
 	istate = (TBMSharedIteratorState *) dsa_get_address(dsa, dp);
 
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 2fc369a18ff..33e8a7e0bba 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -26,6 +26,7 @@
 #include "storage/dsm.h"
 #include "storage/lockdefs.h"
 #include "storage/shm_toc.h"
+#include "storage/streaming_read.h"
 #include "utils/relcache.h"
 #include "utils/snapshot.h"
 
@@ -73,6 +74,7 @@ typedef struct HeapScanDescData
 	ParallelBlockTableScanWorkerData *rs_parallelworkerdata;
 
 	/* these fields only used in page-at-a-time mode and for bitmap scans */
+	PgStreamingRead *pgsr;
 	Buffer		vmbuffer;		/* for checking if can skip fetch */
 	int			empty_tuples;	/* count of all NULL tuples to be returned */
 	int			rs_cindex;		/* current tuple's index in vistuples */
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index b90d9b7f3fa..adde320d1eb 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -788,17 +788,10 @@ typedef struct TableAmRoutine
 	 * on the page have to be returned, otherwise the tuples at offsets in
 	 * `tbmres->offsets` need to be returned.
 	 *
-	 * XXX: Currently this may only be implemented if the AM uses md.c as its
-	 * storage manager, and uses ItemPointer->ip_blkid in a manner that maps
-	 * blockids directly to the underlying storage. nodeBitmapHeapscan.c
-	 * performs prefetching directly using that interface.  This probably
-	 * needs to be rectified at a later point.
-	 *
 	 * Optional callback, but either both scan_bitmap_next_block and
 	 * scan_bitmap_next_tuple need to exist, or neither.
 	 */
-	bool		(*scan_bitmap_next_block) (TableScanDesc scan,
-										   bool *recheck, BlockNumber *blockno);
+	bool		(*scan_bitmap_next_block) (TableScanDesc scan, bool *recheck);
 
 	/*
 	 * Fetch the next tuple of a bitmap table scan into `slot` and return true
@@ -1950,8 +1943,7 @@ table_relation_estimate_size(Relation rel, int32 *attr_widths,
  * used after verifying the presence (at plan time or such).
  */
 static inline bool
-table_scan_bitmap_next_block(TableScanDesc scan,
-							 bool *recheck, BlockNumber *blockno)
+table_scan_bitmap_next_block(TableScanDesc scan, bool *recheck)
 {
 	/*
 	 * We don't expect direct calls to table_scan_bitmap_next_block with valid
@@ -1961,7 +1953,7 @@ table_scan_bitmap_next_block(TableScanDesc scan,
 	if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
 		elog(ERROR, "unexpected table_scan_bitmap_next_block call during logical decoding");
 
-	return scan->rs_rd->rd_tableam->scan_bitmap_next_block(scan, recheck, blockno);
+	return scan->rs_rd->rd_tableam->scan_bitmap_next_block(scan, recheck);
 }
 
 /*
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 03973a3f262..96afabc67e6 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1682,11 +1682,8 @@ typedef enum
 /* ----------------
  *	 ParallelBitmapHeapState information
  *		tbmiterator				iterator for scanning current pages
- *		prefetch_iterator		iterator for prefetching ahead of current page
  *		mutex					mutual exclusion for the prefetching variable
  *								and state
- *		prefetch_pages			# pages prefetch iterator is ahead of current
- *		prefetch_target			current target prefetch distance
  *		state					current state of the TIDBitmap
  *		cv						conditional wait variable
  *		phs_snapshot_data		snapshot data shared to workers
@@ -1695,10 +1692,7 @@ typedef enum
 typedef struct ParallelBitmapHeapState
 {
 	dsa_pointer tbmiterator;
-	dsa_pointer prefetch_iterator;
 	slock_t		mutex;
-	int			prefetch_pages;
-	int			prefetch_target;
 	SharedBitmapState state;
 	ConditionVariable cv;
 	char		phs_snapshot_data[FLEXIBLE_ARRAY_MEMBER];
@@ -1709,16 +1703,10 @@ typedef struct ParallelBitmapHeapState
  *
  *		bitmapqualorig	   execution state for bitmapqualorig expressions
  *		tbm				   bitmap obtained from child index scan(s)
- *		pvmbuffer		   buffer for visibility-map lookups of prefetched pages
  *		exact_pages		   total number of exact pages retrieved
  *		lossy_pages		   total number of lossy pages retrieved
- *		prefetch_iterator  iterator for prefetching ahead of current page
- *		prefetch_pages	   # pages prefetch iterator is ahead of current
- *		prefetch_target    current target prefetch distance
- *		prefetch_maximum   maximum value for prefetch_target
  *		pscan_len		   size of the shared memory for parallel bitmap
  *		initialized		   is node is ready to iterate
- *		shared_prefetch_iterator shared iterator for prefetching
  *		pstate			   shared state for parallel bitmap scan
  *		worker_snapshot	   snapshot for parallel worker
  * ----------------
@@ -1728,20 +1716,13 @@ typedef struct BitmapHeapScanState
 	ScanState	ss;				/* its first field is NodeTag */
 	ExprState  *bitmapqualorig;
 	TIDBitmap  *tbm;
-	Buffer		pvmbuffer;
 	long		exact_pages;
 	long		lossy_pages;
-	TBMIterator *prefetch_iterator;
-	int			prefetch_pages;
-	int			prefetch_target;
-	int			prefetch_maximum;
 	Size		pscan_len;
 	bool		initialized;
-	TBMSharedIterator *shared_prefetch_iterator;
 	ParallelBitmapHeapState *pstate;
 	Snapshot	worker_snapshot;
 	bool		recheck;
-	BlockNumber blockno;
 } BitmapHeapScanState;
 
 /* ----------------
diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h
index 1945f0639bf..672608200ba 100644
--- a/src/include/nodes/tidbitmap.h
+++ b/src/include/nodes/tidbitmap.h
@@ -64,12 +64,16 @@ extern bool tbm_is_empty(const TIDBitmap *tbm);
 
 extern TBMIterator *tbm_begin_iterate(TIDBitmap *tbm);
 extern dsa_pointer tbm_prepare_shared_iterate(TIDBitmap *tbm);
-extern TBMIterateResult *tbm_iterate(TBMIterator *iterator);
-extern TBMIterateResult *tbm_shared_iterate(TBMSharedIterator *iterator);
+extern void tbm_iterate(TBMIterator *iterator, TBMIterateResult *tbmres);
+extern void tbm_shared_iterate(TBMSharedIterator *iterator, TBMIterateResult *tbmres);
 extern void tbm_end_iterate(TBMIterator *iterator);
 extern void tbm_end_shared_iterate(TBMSharedIterator *iterator);
 extern TBMSharedIterator *tbm_attach_shared_iterate(dsa_area *dsa,
 													dsa_pointer dp);
 extern long tbm_calculate_entries(double maxbytes);
 
+#define TBM_ITERATE_RESULT_SIZE \
+	(offsetof(TBMIterateResult, offsets) + \
+	 MaxHeapTuplesPerPage * sizeof(OffsetNumber))
+
 #endif							/* TIDBITMAP_H */
-- 
2.37.2

