From 5152ee50a21ea8e39a0f33efc0c83519f418ed8d Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Fri, 22 Mar 2024 09:42:23 -0400
Subject: [PATCH v9 13/17] Push BitmapHeapScan prefetch code into heapam.c

In preparation for transitioning prefetching to the streaming read API
[1], move all of the BitmapHeapScanState members related to prefetching,
along with the functions that access them, into HeapScanDescData and
TableScanDescData. Members that still need to be accessed in
BitmapHeapNext() could not be moved into heap AM-specific code; in
particular, parallel iterator setup requires several components that
would be odd to pass through the table AM API, so those members live in
the generic TableScanDescData instead.

[1] https://www.postgresql.org/message-id/flat/CA%2BhUKGJkOiOCa%2Bmag4BF%2BzHo7qo%3Do9CFheB8%3Dg6uT5TUm2gkvA%40mail.gmail.com
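
For reference, the serial-case growth policy that the moved
BitmapAdjustPrefetchTarget() implements can be sketched as a standalone
helper (a hypothetical function, not part of the patch): starting from
-1, the target climbs to 0, then to 1, then doubles until it is capped
at prefetch_maximum.

    /* Hypothetical sketch of the serial ramp-up; target starts at -1. */
    static inline int
    advance_prefetch_target(int target, int prefetch_maximum)
    {
        if (target >= prefetch_maximum)
            return target;              /* already at the cap */
        if (target >= prefetch_maximum / 2)
            return prefetch_maximum;    /* jump the rest of the way */
        if (target > 0)
            return target * 2;          /* double as pages are fetched */
        return target + 1;              /* -1 -> 0, then 0 -> 1 */
    }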
---
 src/backend/access/heap/heapam.c          |  26 ++
 src/backend/access/heap/heapam_handler.c  | 268 +++++++++++++++
 src/backend/executor/nodeBitmapHeapscan.c | 397 +++-------------------
 src/include/access/heapam.h               |  12 +
 src/include/access/relscan.h              |  11 +
 src/include/access/tableam.h              |  38 ++-
 src/include/nodes/execnodes.h             |  16 -
 7 files changed, 388 insertions(+), 380 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index e7bed84f75..c12563a188 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -951,8 +951,16 @@ heap_beginscan(Relation relation, Snapshot snapshot,
 	scan->rs_base.rs_flags = flags;
 	scan->rs_base.rs_parallel = parallel_scan;
 	scan->rs_strategy = NULL;	/* set in initscan */
+
+	scan->rs_base.blockno = InvalidBlockNumber;
+
 	scan->rs_vmbuffer = InvalidBuffer;
 	scan->rs_empty_tuples_pending = 0;
+	scan->pvmbuffer = InvalidBuffer;
+
+	scan->pfblockno = InvalidBlockNumber;
+	scan->prefetch_target = -1;
+	scan->prefetch_pages = 0;
 
 	/*
 	 * Disable page-at-a-time mode if it's not a MVCC-safe snapshot.
@@ -1035,6 +1043,12 @@ heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params,
 			scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
 	}
 
+	scan->rs_base.blockno = InvalidBlockNumber;
+
+	scan->pfblockno = InvalidBlockNumber;
+	scan->prefetch_target = -1;
+	scan->prefetch_pages = 0;
+
 	/*
 	 * unpin scan buffers
 	 */
@@ -1047,6 +1061,12 @@ heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params,
 		scan->rs_vmbuffer = InvalidBuffer;
 	}
 
+	if (BufferIsValid(scan->pvmbuffer))
+	{
+		ReleaseBuffer(scan->pvmbuffer);
+		scan->pvmbuffer = InvalidBuffer;
+	}
+
 	/*
 	 * reinitialize scan descriptor
 	 */
@@ -1072,6 +1092,12 @@ heap_endscan(TableScanDesc sscan)
 		scan->rs_vmbuffer = InvalidBuffer;
 	}
 
+	if (BufferIsValid(scan->pvmbuffer))
+	{
+		ReleaseBuffer(scan->pvmbuffer);
+		scan->pvmbuffer = InvalidBuffer;
+	}
+
 	/*
 	 * decrement relation reference count and free scan descriptor storage
 	 */
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index adfc77684a..efd2784e03 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -55,6 +55,9 @@ static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
 								   OffsetNumber tupoffset);
 
 static BlockNumber heapam_scan_get_blocks_done(HeapScanDesc hscan);
+static inline void BitmapAdjustPrefetchIterator(HeapScanDesc scan);
+static inline void BitmapAdjustPrefetchTarget(HeapScanDesc scan);
+static inline void BitmapPrefetch(HeapScanDesc scan);
 
 static const TableAmRoutine heapam_methods;
 
@@ -2112,6 +2115,76 @@ heapam_estimate_rel_size(Relation rel, int32 *attr_widths,
  * ------------------------------------------------------------------------
  */
 
+/*
+ *	BitmapAdjustPrefetchIterator - Adjust the prefetch iterator
+ *
+ *	We keep track of how far the prefetch iterator is ahead of the main
+ *	iterator in prefetch_pages. For each block the main iterator returns, we
+ *	decrement prefetch_pages.
+ */
+static inline void
+BitmapAdjustPrefetchIterator(HeapScanDesc scan)
+{
+#ifdef USE_PREFETCH
+	ParallelBitmapHeapState *pstate = scan->rs_base.bm_parallel;
+	TBMIterateResult tbmpre;
+
+	if (pstate == NULL)
+	{
+		TBMIterator *prefetch_iterator = scan->rs_base.pf_tbmiterator;
+
+		if (scan->prefetch_pages > 0)
+		{
+			/* The main iterator has closed the distance by one page */
+			scan->prefetch_pages--;
+		}
+		else if (prefetch_iterator)
+		{
+			/* Do not let the prefetch iterator get behind the main one */
+			tbm_iterate(prefetch_iterator, &tbmpre);
+			scan->pfblockno = tbmpre.blockno;
+		}
+		return;
+	}
+
+	/*
+	 * Adjusting the prefetch iterator before invoking
+	 * table_scan_bitmap_next_block() keeps prefetch distance higher across
+	 * the parallel workers.
+	 */
+	if (scan->rs_base.prefetch_maximum > 0)
+	{
+		TBMSharedIterator *prefetch_iterator = scan->rs_base.pf_shared_tbmiterator;
+
+		SpinLockAcquire(&pstate->mutex);
+		if (pstate->prefetch_pages > 0)
+		{
+			pstate->prefetch_pages--;
+			SpinLockRelease(&pstate->mutex);
+		}
+		else
+		{
+			/* Release the mutex before iterating */
+			SpinLockRelease(&pstate->mutex);
+
+			/*
+			 * In shared mode, we cannot ensure that the current blockno
+			 * of the main iterator and that of the prefetch iterator are
+			 * the same.  It's possible that whatever blockno we are
+			 * prefetching will be processed by another process.  Therefore,
+			 * we don't validate the blockno here as we do in the
+			 * non-parallel case.
+			 */
+			if (prefetch_iterator)
+			{
+				tbm_shared_iterate(prefetch_iterator, &tbmpre);
+				scan->pfblockno = tbmpre.blockno;
+			}
+		}
+	}
+#endif							/* USE_PREFETCH */
+}
+
 static bool
 heapam_scan_bitmap_next_block(TableScanDesc scan,
 							  bool *recheck, BlockNumber *blockno,
@@ -2130,6 +2203,8 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
 	*blockno = InvalidBlockNumber;
 	*recheck = true;
 
+	BitmapAdjustPrefetchIterator(hscan);
+
 	do
 	{
 		CHECK_FOR_INTERRUPTS();
@@ -2273,6 +2348,18 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
 	else
 		(*exact_pages)++;
 
+	/*
+	 * If serial, we can error out if the prefetch block doesn't stay
+	 * ahead of the current block.
+	 */
+	if (scan->bm_parallel == NULL &&
+		scan->pf_tbmiterator &&
+		hscan->pfblockno < hscan->rs_base.blockno)
+		elog(ERROR, "prefetch and main iterators are out of sync");
+
+	/* Adjust the prefetch target */
+	BitmapAdjustPrefetchTarget(hscan);
+
 	/*
 	 * Return true to indicate that a valid block was found and the bitmap is
 	 * not exhausted. If there are no visible tuples on this page,
@@ -2283,6 +2370,157 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
 	return true;
 }
 
+/*
+ * BitmapAdjustPrefetchTarget - Adjust the prefetch target
+ *
+ * Increase prefetch target if it's not yet at the max.  Note that
+ * we will increase it to zero after fetching the very first
+ * page/tuple, then to one after the second tuple is fetched, then
+ * it doubles as later pages are fetched.
+ */
+static inline void
+BitmapAdjustPrefetchTarget(HeapScanDesc scan)
+{
+#ifdef USE_PREFETCH
+	ParallelBitmapHeapState *pstate = scan->rs_base.bm_parallel;
+	int			prefetch_maximum = scan->rs_base.prefetch_maximum;
+
+	if (pstate == NULL)
+	{
+		if (scan->prefetch_target >= prefetch_maximum)
+			 /* don't increase any further */ ;
+		else if (scan->prefetch_target >= prefetch_maximum / 2)
+			scan->prefetch_target = prefetch_maximum;
+		else if (scan->prefetch_target > 0)
+			scan->prefetch_target *= 2;
+		else
+			scan->prefetch_target++;
+		return;
+	}
+
+	/* Do an unlocked check first to save spinlock acquisitions. */
+	if (pstate->prefetch_target < prefetch_maximum)
+	{
+		SpinLockAcquire(&pstate->mutex);
+		if (pstate->prefetch_target >= prefetch_maximum)
+			 /* don't increase any further */ ;
+		else if (pstate->prefetch_target >= prefetch_maximum / 2)
+			pstate->prefetch_target = prefetch_maximum;
+		else if (pstate->prefetch_target > 0)
+			pstate->prefetch_target *= 2;
+		else
+			pstate->prefetch_target++;
+		SpinLockRelease(&pstate->mutex);
+	}
+#endif							/* USE_PREFETCH */
+}
+
+
+/*
+ * BitmapPrefetch - Prefetch, if prefetch_pages are behind prefetch_target
+ */
+static inline void
+BitmapPrefetch(HeapScanDesc scan)
+{
+#ifdef USE_PREFETCH
+	ParallelBitmapHeapState *pstate = scan->rs_base.bm_parallel;
+
+	if (pstate == NULL)
+	{
+		TBMIterator *prefetch_iterator = scan->rs_base.pf_tbmiterator;
+
+		if (prefetch_iterator)
+		{
+			while (scan->prefetch_pages < scan->prefetch_target)
+			{
+				TBMIterateResult tbmpre;
+				bool		skip_fetch;
+
+				tbm_iterate(prefetch_iterator, &tbmpre);
+
+				if (!BlockNumberIsValid(tbmpre.blockno))
+				{
+					/* No more pages to prefetch */
+					tbm_end_iterate(prefetch_iterator);
+					scan->rs_base.pf_tbmiterator = NULL;
+					break;
+				}
+				scan->prefetch_pages++;
+				scan->pfblockno = tbmpre.blockno;
+
+				/*
+				 * If we expect not to have to actually read this heap page,
+				 * skip this prefetch call, but continue to run the prefetch
+				 * logic normally.  (Would it be better not to increment
+				 * prefetch_pages?)
+				 */
+				skip_fetch = (!(scan->rs_base.rs_flags & SO_NEED_TUPLE) &&
+							  !tbmpre.recheck &&
+							  VM_ALL_VISIBLE(scan->rs_base.rs_rd,
+											 tbmpre.blockno,
+											 &scan->pvmbuffer));
+
+				if (!skip_fetch)
+					PrefetchBuffer(scan->rs_base.rs_rd, MAIN_FORKNUM, tbmpre.blockno);
+			}
+		}
+
+		return;
+	}
+
+	if (pstate->prefetch_pages < pstate->prefetch_target)
+	{
+		TBMSharedIterator *prefetch_iterator = scan->rs_base.pf_shared_tbmiterator;
+
+		if (prefetch_iterator)
+		{
+			while (1)
+			{
+				TBMIterateResult tbmpre;
+				bool		do_prefetch = false;
+				bool		skip_fetch;
+
+				/*
+				 * Recheck under the mutex. If some other process has already
+				 * done enough prefetching then we need not do anything.
+				 */
+				SpinLockAcquire(&pstate->mutex);
+				if (pstate->prefetch_pages < pstate->prefetch_target)
+				{
+					pstate->prefetch_pages++;
+					do_prefetch = true;
+				}
+				SpinLockRelease(&pstate->mutex);
+
+				if (!do_prefetch)
+					return;
+
+				tbm_shared_iterate(prefetch_iterator, &tbmpre);
+				if (!BlockNumberIsValid(tbmpre.blockno))
+				{
+					/* No more pages to prefetch */
+					tbm_end_shared_iterate(prefetch_iterator);
+					scan->rs_base.pf_shared_tbmiterator = NULL;
+					break;
+				}
+
+				scan->pfblockno = tbmpre.blockno;
+
+				/* As above, skip prefetch if we expect not to need page */
+				skip_fetch = (!(scan->rs_base.rs_flags & SO_NEED_TUPLE) &&
+							  !tbmpre.recheck &&
+							  VM_ALL_VISIBLE(scan->rs_base.rs_rd,
+											 tbmpre.blockno,
+											 &scan->pvmbuffer));
+
+				if (!skip_fetch)
+					PrefetchBuffer(scan->rs_base.rs_rd, MAIN_FORKNUM, tbmpre.blockno);
+			}
+		}
+	}
+#endif							/* USE_PREFETCH */
+}
+
 static bool
 heapam_scan_bitmap_next_tuple(TableScanDesc scan,
 							  TupleTableSlot *slot)
@@ -2308,6 +2546,36 @@ heapam_scan_bitmap_next_tuple(TableScanDesc scan,
 	if (hscan->rs_cindex < 0 || hscan->rs_cindex >= hscan->rs_ntuples)
 		return false;
 
+#ifdef USE_PREFETCH
+
+	/*
+	 * Try to prefetch at least a few pages even before we get to the second
+	 * page if we don't stop reading after the first tuple.
+	 */
+	if (!scan->bm_parallel)
+	{
+		if (hscan->prefetch_target < scan->prefetch_maximum)
+			hscan->prefetch_target++;
+	}
+	else if (scan->bm_parallel->prefetch_target < scan->prefetch_maximum)
+	{
+		/* take spinlock while updating shared state */
+		SpinLockAcquire(&scan->bm_parallel->mutex);
+		if (scan->bm_parallel->prefetch_target < scan->prefetch_maximum)
+			scan->bm_parallel->prefetch_target++;
+		SpinLockRelease(&scan->bm_parallel->mutex);
+	}
+
+	/*
+	 * We issue prefetch requests *after* fetching the current page to try to
+	 * avoid having prefetching interfere with the main I/O. Also, this should
+	 * happen only when we have determined there is still something to do on
+	 * the current page, else we may uselessly prefetch the same page we are
+	 * just about to request for real.
+	 */
+	BitmapPrefetch(hscan);
+#endif							/* USE_PREFETCH */
+
 	targoffset = hscan->rs_vistuples[hscan->rs_cindex];
 	page = BufferGetPage(hscan->rs_cbuf);
 	lp = PageGetItemId(page, targoffset);
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c
index 51c4360205..f241f4cb2c 100644
--- a/src/backend/executor/nodeBitmapHeapscan.c
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -51,10 +51,6 @@
 
 static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
 static inline void BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate);
-static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node);
-static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node);
-static inline void BitmapPrefetch(BitmapHeapScanState *node,
-								  TableScanDesc scan);
 static bool BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate);
 
 
@@ -71,7 +67,6 @@ BitmapHeapNext(BitmapHeapScanState *node)
 	TableScanDesc scan;
 	TIDBitmap  *tbm;
 	TupleTableSlot *slot;
-	ParallelBitmapHeapState *pstate = node->pstate;
 	dsa_area   *dsa = node->ss.ps.state->es_query_dsa;
 
 	/*
@@ -91,83 +86,53 @@ BitmapHeapNext(BitmapHeapScanState *node)
 	 * prefetching.  node->prefetch_pages tracks exactly how many pages ahead
 	 * the prefetch iterator is.  Also, node->prefetch_target tracks the
 	 * desired prefetch distance, which starts small and increases up to the
-	 * node->prefetch_maximum.  This is to avoid doing a lot of prefetching in
+	 * scan->prefetch_maximum.  This is to avoid doing a lot of prefetching in
 	 * a scan that stops after a few tuples because of a LIMIT.
 	 */
 	if (!node->initialized)
 	{
-		TBMIterator *tbmiterator = NULL;
-		TBMSharedIterator *shared_tbmiterator = NULL;
+		/*
+		 * The leader will immediately come out of the function, but others
+		 * will be blocked until the leader populates the TBM and wakes them up.
+		 */
+		bool		init_shared_state = node->pstate ?
+			BitmapShouldInitializeSharedState(node->pstate) : false;
+
+		/*
+		 * Maximum number of prefetches for the tablespace if configured,
+		 * otherwise the current value of the effective_io_concurrency GUC.
+		 */
+		int			pf_maximum = 0;
+#ifdef USE_PREFETCH
+		pf_maximum = get_tablespace_io_concurrency(node->ss.ss_currentRelation->rd_rel->reltablespace);
+#endif
 
-		if (!pstate)
+		if (!node->pstate || init_shared_state)
 		{
 			tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
 
 			if (!tbm || !IsA(tbm, TIDBitmap))
 				elog(ERROR, "unrecognized result from subplan");
-
 			node->tbm = tbm;
-			tbmiterator = tbm_begin_iterate(tbm);
 
-#ifdef USE_PREFETCH
-			if (node->prefetch_maximum > 0)
+			if (init_shared_state)
 			{
-				node->prefetch_iterator = tbm_begin_iterate(tbm);
-				node->prefetch_pages = 0;
-				node->prefetch_target = -1;
-			}
-#endif							/* USE_PREFETCH */
-		}
-		else
-		{
-			/*
-			 * The leader will immediately come out of the function, but
-			 * others will be blocked until leader populates the TBM and wakes
-			 * them up.
-			 */
-			if (BitmapShouldInitializeSharedState(pstate))
-			{
-				tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
-				if (!tbm || !IsA(tbm, TIDBitmap))
-					elog(ERROR, "unrecognized result from subplan");
-
-				node->tbm = tbm;
-
 				/*
 				 * Prepare to iterate over the TBM. This will return the
 				 * dsa_pointer of the iterator state which will be used by
 				 * multiple processes to iterate jointly.
 				 */
-				pstate->tbmiterator = tbm_prepare_shared_iterate(tbm);
+				node->pstate->tbmiterator = tbm_prepare_shared_iterate(tbm);
 #ifdef USE_PREFETCH
-				if (node->prefetch_maximum > 0)
+				if (pf_maximum > 0)
 				{
-					pstate->prefetch_iterator =
+					node->pstate->prefetch_iterator =
 						tbm_prepare_shared_iterate(tbm);
-
-					/*
-					 * We don't need the mutex here as we haven't yet woke up
-					 * others.
-					 */
-					pstate->prefetch_pages = 0;
-					pstate->prefetch_target = -1;
 				}
 #endif
-
 				/* We have initialized the shared state so wake up others. */
-				BitmapDoneInitializingSharedState(pstate);
-			}
-
-			/* Allocate a private iterator and attach the shared state to it */
-			shared_tbmiterator = tbm_attach_shared_iterate(dsa, pstate->tbmiterator);
-
-#ifdef USE_PREFETCH
-			if (node->prefetch_maximum > 0)
-			{
-				node->shared_prefetch_iterator =
-					tbm_attach_shared_iterate(dsa, pstate->prefetch_iterator);
+				BitmapDoneInitializingSharedState(node->pstate);
 			}
-#endif							/* USE_PREFETCH */
 		}
 
 		/*
@@ -197,8 +162,26 @@ BitmapHeapNext(BitmapHeapScanState *node)
 																	extra_flags);
 		}
 
-		scan->tbmiterator = tbmiterator;
-		scan->shared_tbmiterator = shared_tbmiterator;
+		scan->prefetch_maximum = pf_maximum;
+		scan->bm_parallel = node->pstate;
+
+		if (!scan->bm_parallel)
+			scan->tbmiterator = tbm_begin_iterate(tbm);
+		else
+			/* Allocate a private iterator and attach the shared state to it */
+			scan->shared_tbmiterator = tbm_attach_shared_iterate(dsa, scan->bm_parallel->tbmiterator);
+
+#ifdef USE_PREFETCH
+		if (scan->prefetch_maximum > 0)
+		{
+			if (!scan->bm_parallel)
+				scan->pf_tbmiterator = tbm_begin_iterate(tbm);
+			else
+				scan->pf_shared_tbmiterator =
+					tbm_attach_shared_iterate(dsa, scan->bm_parallel->prefetch_iterator);
+		}
+#endif							/* USE_PREFETCH */
+
 
 		node->initialized = true;
 
@@ -211,36 +194,6 @@ BitmapHeapNext(BitmapHeapScanState *node)
 		{
 			CHECK_FOR_INTERRUPTS();
 
-#ifdef USE_PREFETCH
-
-			/*
-			 * Try to prefetch at least a few pages even before we get to the
-			 * second page if we don't stop reading after the first tuple.
-			 */
-			if (!pstate)
-			{
-				if (node->prefetch_target < node->prefetch_maximum)
-					node->prefetch_target++;
-			}
-			else if (pstate->prefetch_target < node->prefetch_maximum)
-			{
-				/* take spinlock while updating shared state */
-				SpinLockAcquire(&pstate->mutex);
-				if (pstate->prefetch_target < node->prefetch_maximum)
-					pstate->prefetch_target++;
-				SpinLockRelease(&pstate->mutex);
-			}
-#endif							/* USE_PREFETCH */
-
-			/*
-			 * We issue prefetch requests *after* fetching the current page to
-			 * try to avoid having prefetching interfere with the main I/O.
-			 * Also, this should happen only when we have determined there is
-			 * still something to do on the current page, else we may
-			 * uselessly prefetch the same page we are just about to request
-			 * for real.
-			 */
-			BitmapPrefetch(node, scan);
 
 			/*
 			 * If we are using lossy info, we have to recheck the qual
@@ -264,23 +217,9 @@ BitmapHeapNext(BitmapHeapScanState *node)
 
 new_page:
 
-		BitmapAdjustPrefetchIterator(node);
-
-		if (!table_scan_bitmap_next_block(scan, &node->recheck, &node->blockno,
+		if (!table_scan_bitmap_next_block(scan, &node->recheck, &scan->blockno,
 										  &node->lossy_pages, &node->exact_pages))
 			break;
-
-		/*
-		 * If serial, we can error out if the prefetch block doesn't stay
-		 * ahead of the current block.
-		 */
-		if (node->pstate == NULL &&
-			node->prefetch_iterator &&
-			node->pfblockno < node->blockno)
-			elog(ERROR, "prefetch and main iterators are out of sync");
-
-		/* Adjust the prefetch target */
-		BitmapAdjustPrefetchTarget(node);
 	}
 
 	/*
@@ -304,224 +243,7 @@ BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate)
 	ConditionVariableBroadcast(&pstate->cv);
 }
 
-/*
- *	BitmapAdjustPrefetchIterator - Adjust the prefetch iterator
- *
- *	We keep track of how far the prefetch iterator is ahead of the main
- *	iterator in prefetch_pages. For each block the main iterator returns, we
- *	decrement prefetch_pages.
- */
-static inline void
-BitmapAdjustPrefetchIterator(BitmapHeapScanState *node)
-{
-#ifdef USE_PREFETCH
-	ParallelBitmapHeapState *pstate = node->pstate;
-	TBMIterateResult tbmpre;
-
-	if (pstate == NULL)
-	{
-		TBMIterator *prefetch_iterator = node->prefetch_iterator;
-
-		if (node->prefetch_pages > 0)
-		{
-			/* The main iterator has closed the distance by one page */
-			node->prefetch_pages--;
-		}
-		else if (prefetch_iterator)
-		{
-			/* Do not let the prefetch iterator get behind the main one */
-			tbm_iterate(prefetch_iterator, &tbmpre);
-			node->pfblockno = tbmpre.blockno;
-		}
-		return;
-	}
-
-	/*
-	 * Adjusting the prefetch iterator before invoking
-	 * table_scan_bitmap_next_block() keeps prefetch distance higher across
-	 * the parallel workers.
-	 */
-	if (node->prefetch_maximum > 0)
-	{
-		TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;
 
-		SpinLockAcquire(&pstate->mutex);
-		if (pstate->prefetch_pages > 0)
-		{
-			pstate->prefetch_pages--;
-			SpinLockRelease(&pstate->mutex);
-		}
-		else
-		{
-			/* Release the mutex before iterating */
-			SpinLockRelease(&pstate->mutex);
-
-			/*
-			 * In case of shared mode, we can not ensure that the current
-			 * blockno of the main iterator and that of the prefetch iterator
-			 * are same.  It's possible that whatever blockno we are
-			 * prefetching will be processed by another process.  Therefore,
-			 * we don't validate the blockno here as we do in non-parallel
-			 * case.
-			 */
-			if (prefetch_iterator)
-			{
-				tbm_shared_iterate(prefetch_iterator, &tbmpre);
-				node->pfblockno = tbmpre.blockno;
-			}
-		}
-	}
-#endif							/* USE_PREFETCH */
-}
-
-/*
- * BitmapAdjustPrefetchTarget - Adjust the prefetch target
- *
- * Increase prefetch target if it's not yet at the max.  Note that
- * we will increase it to zero after fetching the very first
- * page/tuple, then to one after the second tuple is fetched, then
- * it doubles as later pages are fetched.
- */
-static inline void
-BitmapAdjustPrefetchTarget(BitmapHeapScanState *node)
-{
-#ifdef USE_PREFETCH
-	ParallelBitmapHeapState *pstate = node->pstate;
-
-	if (pstate == NULL)
-	{
-		if (node->prefetch_target >= node->prefetch_maximum)
-			 /* don't increase any further */ ;
-		else if (node->prefetch_target >= node->prefetch_maximum / 2)
-			node->prefetch_target = node->prefetch_maximum;
-		else if (node->prefetch_target > 0)
-			node->prefetch_target *= 2;
-		else
-			node->prefetch_target++;
-		return;
-	}
-
-	/* Do an unlocked check first to save spinlock acquisitions. */
-	if (pstate->prefetch_target < node->prefetch_maximum)
-	{
-		SpinLockAcquire(&pstate->mutex);
-		if (pstate->prefetch_target >= node->prefetch_maximum)
-			 /* don't increase any further */ ;
-		else if (pstate->prefetch_target >= node->prefetch_maximum / 2)
-			pstate->prefetch_target = node->prefetch_maximum;
-		else if (pstate->prefetch_target > 0)
-			pstate->prefetch_target *= 2;
-		else
-			pstate->prefetch_target++;
-		SpinLockRelease(&pstate->mutex);
-	}
-#endif							/* USE_PREFETCH */
-}
-
-/*
- * BitmapPrefetch - Prefetch, if prefetch_pages are behind prefetch_target
- */
-static inline void
-BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan)
-{
-#ifdef USE_PREFETCH
-	ParallelBitmapHeapState *pstate = node->pstate;
-
-	if (pstate == NULL)
-	{
-		TBMIterator *prefetch_iterator = node->prefetch_iterator;
-
-		if (prefetch_iterator)
-		{
-			while (node->prefetch_pages < node->prefetch_target)
-			{
-				TBMIterateResult tbmpre;
-				bool		skip_fetch;
-
-				tbm_iterate(prefetch_iterator, &tbmpre);
-
-				if (!BlockNumberIsValid(tbmpre.blockno))
-				{
-					/* No more pages to prefetch */
-					tbm_end_iterate(prefetch_iterator);
-					node->prefetch_iterator = NULL;
-					break;
-				}
-				node->prefetch_pages++;
-				node->pfblockno = tbmpre.blockno;
-
-				/*
-				 * If we expect not to have to actually read this heap page,
-				 * skip this prefetch call, but continue to run the prefetch
-				 * logic normally.  (Would it be better not to increment
-				 * prefetch_pages?)
-				 */
-				skip_fetch = (!(scan->rs_flags & SO_NEED_TUPLE) &&
-							  !tbmpre.recheck &&
-							  VM_ALL_VISIBLE(node->ss.ss_currentRelation,
-											 tbmpre.blockno,
-											 &node->pvmbuffer));
-
-				if (!skip_fetch)
-					PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre.blockno);
-			}
-		}
-
-		return;
-	}
-
-	if (pstate->prefetch_pages < pstate->prefetch_target)
-	{
-		TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;
-
-		if (prefetch_iterator)
-		{
-			while (1)
-			{
-				TBMIterateResult tbmpre;
-				bool		do_prefetch = false;
-				bool		skip_fetch;
-
-				/*
-				 * Recheck under the mutex. If some other process has already
-				 * done enough prefetching then we need not to do anything.
-				 */
-				SpinLockAcquire(&pstate->mutex);
-				if (pstate->prefetch_pages < pstate->prefetch_target)
-				{
-					pstate->prefetch_pages++;
-					do_prefetch = true;
-				}
-				SpinLockRelease(&pstate->mutex);
-
-				if (!do_prefetch)
-					return;
-
-				tbm_shared_iterate(prefetch_iterator, &tbmpre);
-				if (!BlockNumberIsValid(tbmpre.blockno))
-				{
-					/* No more pages to prefetch */
-					tbm_end_shared_iterate(prefetch_iterator);
-					node->shared_prefetch_iterator = NULL;
-					break;
-				}
-
-				node->pfblockno = tbmpre.blockno;
-
-				/* As above, skip prefetch if we expect not to need page */
-				skip_fetch = (!(scan->rs_flags & SO_NEED_TUPLE) &&
-							  !tbmpre.recheck &&
-							  VM_ALL_VISIBLE(node->ss.ss_currentRelation,
-											 tbmpre.blockno,
-											 &node->pvmbuffer));
-
-				if (!skip_fetch)
-					PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre.blockno);
-			}
-		}
-	}
-#endif							/* USE_PREFETCH */
-}
 
 /*
  * BitmapHeapRecheck -- access method routine to recheck a tuple in EvalPlanQual
@@ -569,22 +291,11 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
 		table_rescan(node->ss.ss_currentScanDesc, NULL);
 
 	/* release bitmaps and buffers if any */
-	if (node->prefetch_iterator)
-		tbm_end_iterate(node->prefetch_iterator);
-	if (node->shared_prefetch_iterator)
-		tbm_end_shared_iterate(node->shared_prefetch_iterator);
 	if (node->tbm)
 		tbm_free(node->tbm);
-	if (node->pvmbuffer != InvalidBuffer)
-		ReleaseBuffer(node->pvmbuffer);
 	node->tbm = NULL;
-	node->prefetch_iterator = NULL;
 	node->initialized = false;
-	node->shared_prefetch_iterator = NULL;
-	node->pvmbuffer = InvalidBuffer;
 	node->recheck = true;
-	node->blockno = InvalidBlockNumber;
-	node->pfblockno = InvalidBlockNumber;
 
 	ExecScanReScan(&node->ss);
 
@@ -625,14 +336,8 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node)
 	/*
 	 * release bitmaps and buffers if any
 	 */
-	if (node->prefetch_iterator)
-		tbm_end_iterate(node->prefetch_iterator);
 	if (node->tbm)
 		tbm_free(node->tbm);
-	if (node->shared_prefetch_iterator)
-		tbm_end_shared_iterate(node->shared_prefetch_iterator);
-	if (node->pvmbuffer != InvalidBuffer)
-		ReleaseBuffer(node->pvmbuffer);
 }
 
 /* ----------------------------------------------------------------
@@ -665,18 +370,11 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
 	scanstate->ss.ps.ExecProcNode = ExecBitmapHeapScan;
 
 	scanstate->tbm = NULL;
-	scanstate->pvmbuffer = InvalidBuffer;
 	scanstate->exact_pages = 0;
 	scanstate->lossy_pages = 0;
-	scanstate->prefetch_iterator = NULL;
-	scanstate->prefetch_pages = 0;
-	scanstate->prefetch_target = 0;
 	scanstate->initialized = false;
-	scanstate->shared_prefetch_iterator = NULL;
 	scanstate->pstate = NULL;
 	scanstate->recheck = true;
-	scanstate->blockno = InvalidBlockNumber;
-	scanstate->pfblockno = InvalidBlockNumber;
 
 	/*
 	 * Miscellaneous initialization
@@ -716,13 +414,6 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
 	scanstate->bitmapqualorig =
 		ExecInitQual(node->bitmapqualorig, (PlanState *) scanstate);
 
-	/*
-	 * Maximum number of prefetches for the tablespace if configured,
-	 * otherwise the current value of the effective_io_concurrency GUC.
-	 */
-	scanstate->prefetch_maximum =
-		get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace);
-
 	scanstate->ss.ss_currentRelation = currentRelation;
 
 	/*
@@ -806,7 +497,7 @@ ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node,
 	/* Initialize the mutex */
 	SpinLockInit(&pstate->mutex);
 	pstate->prefetch_pages = 0;
-	pstate->prefetch_target = 0;
+	pstate->prefetch_target = -1;
 	pstate->state = BM_INITIAL;
 
 	ConditionVariableInit(&pstate->cv);
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 3dfb19ec7d..22bdccc2a9 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -81,6 +81,18 @@ typedef struct HeapScanDescData
 	 */
 	Buffer		rs_vmbuffer;
 	int			rs_empty_tuples_pending;
+	/* buffer for visibility-map lookups of prefetched pages */
+	Buffer		pvmbuffer;
+
+	/*
+	 * These fields only used for prefetching in bitmap table scans
+	 */
+	/* Current target for prefetch distance */
+	int			prefetch_target;
+	/* # pages prefetch iterator is ahead of current */
+	int			prefetch_pages;
+	/* used to validate prefetch block stays ahead of current block */
+	BlockNumber pfblockno;
 
 	/* these fields only used in page-at-a-time mode and for bitmap scans */
 	int			rs_cindex;		/* current tuple's index in vistuples */
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 92b829cebc..93168bd350 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -26,6 +26,7 @@ struct ParallelTableScanDescData;
 
 struct TBMIterator;
 struct TBMSharedIterator;
+struct ParallelBitmapHeapState;
 
 /*
  * Generic descriptor for table scans. This is the base-class for table scans,
@@ -46,6 +47,16 @@ typedef struct TableScanDescData
 	/* Only used for Bitmap table scans */
 	struct TBMIterator *tbmiterator;
 	struct TBMSharedIterator *shared_tbmiterator;
+	/* Prefetch iterators */
+	struct TBMIterator *pf_tbmiterator;
+	struct TBMSharedIterator *pf_shared_tbmiterator;
+
+	/* maximum value for prefetch_target */
+	int			prefetch_maximum;
+	struct ParallelBitmapHeapState *bm_parallel;
+
+	/* used to validate prefetch and current block stay in sync */
+	BlockNumber blockno;
 
 	/*
 	 * Information about type and behaviour of the scan, a bitmask of members
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 1d4b79a73f..9cab4462d6 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -800,17 +800,6 @@ typedef struct TableAmRoutine
 	 * lossy_pages is incremented if the block's representation in the bitmap
 	 * is lossy, otherwise, exact_pages is incremented.
 	 *
-	 * XXX: Currently this may only be implemented if the AM uses md.c as its
-	 * storage manager, and uses ItemPointer->ip_blkid in a manner that maps
-	 * blockids directly to the underlying storage. nodeBitmapHeapscan.c
-	 * performs prefetching directly using that interface.  This probably
-	 * needs to be rectified at a later point.
-	 *
-	 * XXX: Currently this may only be implemented if the AM uses the
-	 * visibilitymap, as nodeBitmapHeapscan.c unconditionally accesses it to
-	 * perform prefetching.  This probably needs to be rectified at a later
-	 * point.
-	 *
 	 * Optional callback, but either both scan_bitmap_next_block and
 	 * scan_bitmap_next_tuple need to exist, or neither.
 	 */
@@ -961,6 +950,9 @@ table_beginscan_bm(Relation rel, Snapshot snapshot,
 	result = rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags);
 	result->shared_tbmiterator = NULL;
 	result->tbmiterator = NULL;
+	result->pf_shared_tbmiterator = NULL;
+	result->pf_tbmiterator = NULL;
+	result->bm_parallel = NULL;
 	return result;
 }
 
@@ -1029,11 +1021,23 @@ table_endscan(TableScanDesc scan)
 			scan->shared_tbmiterator = NULL;
 		}
 
+		if (scan->pf_shared_tbmiterator)
+		{
+			tbm_end_shared_iterate(scan->pf_shared_tbmiterator);
+			scan->pf_shared_tbmiterator = NULL;
+		}
+
 		if (scan->tbmiterator)
 		{
 			tbm_end_iterate(scan->tbmiterator);
 			scan->tbmiterator = NULL;
 		}
+
+		if (scan->pf_tbmiterator)
+		{
+			tbm_end_iterate(scan->pf_tbmiterator);
+			scan->pf_tbmiterator = NULL;
+		}
 	}
 
 	scan->rs_rd->rd_tableam->scan_end(scan);
@@ -1054,11 +1058,23 @@ table_rescan(TableScanDesc scan,
 			scan->shared_tbmiterator = NULL;
 		}
 
+		if (scan->pf_shared_tbmiterator)
+		{
+			tbm_end_shared_iterate(scan->pf_shared_tbmiterator);
+			scan->pf_shared_tbmiterator = NULL;
+		}
+
 		if (scan->tbmiterator)
 		{
 			tbm_end_iterate(scan->tbmiterator);
 			scan->tbmiterator = NULL;
 		}
+
+		if (scan->pf_tbmiterator)
+		{
+			tbm_end_iterate(scan->pf_tbmiterator);
+			scan->pf_tbmiterator = NULL;
+		}
 	}
 
 	scan->rs_rd->rd_tableam->scan_rescan(scan, key, false, false, false, false);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 8688bc5ab0..7a3fdf9cd4 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1783,19 +1783,11 @@ typedef struct ParallelBitmapHeapState
  *
  *		bitmapqualorig	   execution state for bitmapqualorig expressions
  *		tbm				   bitmap obtained from child index scan(s)
- *		pvmbuffer		   buffer for visibility-map lookups of prefetched pages
  *		exact_pages		   total number of exact pages retrieved
  *		lossy_pages		   total number of lossy pages retrieved
- *		prefetch_iterator  iterator for prefetching ahead of current page
- *		prefetch_pages	   # pages prefetch iterator is ahead of current
- *		prefetch_target    current target prefetch distance
- *		prefetch_maximum   maximum value for prefetch_target
  *		initialized		   is node is ready to iterate
- *		shared_prefetch_iterator shared iterator for prefetching
  *		pstate			   shared state for parallel bitmap scan
  *		recheck			   do current page's tuples need recheck
- *		blockno			   used to validate pf and current block in sync
- *		pfblockno		   used to validate pf stays ahead of current block
  * ----------------
  */
 typedef struct BitmapHeapScanState
@@ -1803,19 +1795,11 @@ typedef struct BitmapHeapScanState
 	ScanState	ss;				/* its first field is NodeTag */
 	ExprState  *bitmapqualorig;
 	TIDBitmap  *tbm;
-	Buffer		pvmbuffer;
 	long		exact_pages;
 	long		lossy_pages;
-	TBMIterator *prefetch_iterator;
-	int			prefetch_pages;
-	int			prefetch_target;
-	int			prefetch_maximum;
 	bool		initialized;
-	TBMSharedIterator *shared_prefetch_iterator;
 	ParallelBitmapHeapState *pstate;
 	bool		recheck;
-	BlockNumber blockno;
-	BlockNumber pfblockno;
 } BitmapHeapScanState;
 
 /* ----------------
-- 
2.40.1

