From b0c3e346bf41c6c7d14502814bb0ee327ae68169 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Sat, 9 Dec 2023 00:21:30 +0100
Subject: [PATCH v20231209 2/2] reworks

---
 src/backend/access/heap/heapam_handler.c |  14 +-
 src/backend/access/index/genam.c         |  35 +---
 src/backend/access/index/indexam.c       | 152 ++++------------
 src/backend/executor/execIndexing.c      |  12 +-
 src/backend/executor/execReplication.c   |  18 +-
 src/backend/executor/nodeIndexonlyscan.c | 166 ++++++++---------
 src/backend/executor/nodeIndexscan.c     | 149 ++++++++--------
 src/backend/utils/adt/selfuncs.c         |   5 +-
 src/include/access/genam.h               | 217 ++++++++++++-----------
 src/include/access/relscan.h             |  10 --
 src/include/nodes/execnodes.h            |   4 +
 11 files changed, 332 insertions(+), 450 deletions(-)

diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 89474078951..26d3ec20b63 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -44,7 +44,6 @@
 #include "storage/smgr.h"
 #include "utils/builtins.h"
 #include "utils/rel.h"
-#include "utils/spccache.h"
 
 static void reform_and_rewrite_tuple(HeapTuple tuple,
 									 Relation OldHeap, Relation NewHeap,
@@ -748,14 +747,6 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 			PROGRESS_CLUSTER_INDEX_RELID
 		};
 		int64		ci_val[2];
-		int			prefetch_max;
-
-		/*
-		 * Get the prefetch target for the old tablespace (which is what we'll
-		 * read using the index). We'll use it as a reset value too, although
-		 * there should be no rescans for CLUSTER etc.
-		 */
-		prefetch_max = get_tablespace_io_concurrency(OldHeap->rd_rel->reltablespace);
 
 		/* Set phase and OIDOldIndex to columns */
 		ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP;
@@ -764,8 +755,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 
 		tableScan = NULL;
 		heapScan = NULL;
-		indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0,
-									prefetch_max);
+		indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0);
 		index_rescan(indexScan, NULL, 0, NULL, 0);
 	}
 	else
@@ -802,7 +792,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 
 		if (indexScan != NULL)
 		{
-			if (!index_getnext_slot(indexScan, ForwardScanDirection, slot))
+			if (!index_getnext_slot(indexScan, ForwardScanDirection, slot, NULL))
 				break;
 
 			/* Since we used no scan keys, should never need to recheck */
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index d45a209ee3a..72e7c9f206c 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -126,9 +126,6 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
 	scan->xs_hitup = NULL;
 	scan->xs_hitupdesc = NULL;
 
-	/* Information used for asynchronous prefetching during index scans. */
-	scan->xs_prefetch = NULL;
-
 	return scan;
 }
 
@@ -443,20 +440,8 @@ systable_beginscan(Relation heapRelation,
 				elog(ERROR, "column is not in index");
 		}
 
-		/*
-		 * We don't do any prefetching on system catalogs, for two main reasons.
-		 *
-		 * Firstly, we usually do PK lookups, which makes prefetching pointles,
-		 * or we often don't know how many rows to expect (and the numbers tend
-		 * to be fairly low). So it's not clear it'd help. Furthermore, places
-		 * that are sensitive tend to use syscache anyway.
-		 *
-		 * Secondly, we can't call get_tablespace_io_concurrency() because that
-		 * does a sysscan internally, so it might lead to a cycle. We could use
-		 * use effective_io_concurrency, but it doesn't seem worth it.
-		 */
 		sysscan->iscan = index_beginscan(heapRelation, irel,
-										 snapshot, nkeys, 0, 0);
+										 snapshot, nkeys, 0);
 		index_rescan(sysscan->iscan, key, nkeys, NULL, 0);
 		sysscan->scan = NULL;
 	}
@@ -524,7 +509,7 @@ systable_getnext(SysScanDesc sysscan)
 
 	if (sysscan->irel)
 	{
-		if (index_getnext_slot(sysscan->iscan, ForwardScanDirection, sysscan->slot))
+		if (index_getnext_slot(sysscan->iscan, ForwardScanDirection, sysscan->slot, NULL))
 		{
 			bool		shouldFree;
 
@@ -711,20 +696,8 @@ systable_beginscan_ordered(Relation heapRelation,
 			elog(ERROR, "column is not in index");
 	}
 
-	/*
-	 * We don't do any prefetching on system catalogs, for two main reasons.
-	 *
-	 * Firstly, we usually do PK lookups, which makes prefetching pointles,
-	 * or we often don't know how many rows to expect (and the numbers tend
-	 * to be fairly low). So it's not clear it'd help. Furthermore, places
-	 * that are sensitive tend to use syscache anyway.
-	 *
-	 * Secondly, we can't call get_tablespace_io_concurrency() because that
-	 * does a sysscan internally, so it might lead to a cycle. We could use
-	 * use effective_io_concurrency, but it doesn't seem worth it.
-	 */
 	sysscan->iscan = index_beginscan(heapRelation, indexRelation,
-									 snapshot, nkeys, 0, 0);
+									 snapshot, nkeys, 0);
 	index_rescan(sysscan->iscan, key, nkeys, NULL, 0);
 	sysscan->scan = NULL;
 
@@ -740,7 +713,7 @@ systable_getnext_ordered(SysScanDesc sysscan, ScanDirection direction)
 	HeapTuple	htup = NULL;
 
 	Assert(sysscan->irel);
-	if (index_getnext_slot(sysscan->iscan, direction, sysscan->slot))
+	if (index_getnext_slot(sysscan->iscan, direction, sysscan->slot, NULL))
 		htup = ExecFetchSlotHeapTuple(sysscan->slot, false, NULL);
 
 	/* See notes in systable_getnext */
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index e493548b68a..f96aeba1b39 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -109,12 +109,14 @@ do { \
 
 static IndexScanDesc index_beginscan_internal(Relation indexRelation,
 											  int nkeys, int norderbys, Snapshot snapshot,
-											  ParallelIndexScanDesc pscan, bool temp_snap,
-											  int prefetch_max);
+											  ParallelIndexScanDesc pscan, bool temp_snap);
 
-static void index_prefetch_tids(IndexScanDesc scan, ScanDirection direction);
-static ItemPointer index_prefetch_get_tid(IndexScanDesc scan, ScanDirection direction, bool *all_visible);
-static void index_prefetch(IndexScanDesc scan, ItemPointer tid, bool skip_all_visible, bool *all_visible);
+static void index_prefetch_tids(IndexScanDesc scan, ScanDirection direction,
+								IndexPrefetch *prefetch);
+static ItemPointer index_prefetch_get_tid(IndexScanDesc scan, ScanDirection direction,
+										  IndexPrefetch *prefetch, bool *all_visible);
+static void index_prefetch(IndexScanDesc scan, IndexPrefetch *prefetch,
+						   ItemPointer tid, bool skip_all_visible, bool *all_visible);
 
 
 /* ----------------------------------------------------------------
@@ -223,42 +225,18 @@ index_insert_cleanup(Relation indexRelation,
  * index_beginscan - start a scan of an index with amgettuple
  *
  * Caller must be holding suitable locks on the heap and the index.
- *
- * prefetch_max determines if prefetching is requested for this index scan,
- * and how far ahead we want to prefetch
- *
- * Setting prefetch_max to 0 disables prefetching for the index scan. We do
- * this for two reasons - for scans on system catalogs, and/or for cases where
- * prefetching is expected to be pointless (like IOS).
- *
- * For system catalogs, we usually either scan by a PK value, or we we expect
- * only few rows (or rather we don't know how many rows to expect). Also, we
- * need to prevent infinite in the get_tablespace_io_concurrency() call - it
- * does an index scan internally. So we simply disable prefetching for system
- * catalogs. We could deal with this by picking a conservative static target
- * (e.g. effective_io_concurrency, capped to something), but places that are
- * performance sensitive likely use syscache anyway, and catalogs tend to be
- * very small and hot. So we don't bother.
- *
- * For IOS, we expect to not need most heap pages (that's the whole point of
- * IOS, actually), and prefetching them might lead to a lot of wasted I/O.
- *
- * XXX Not sure the infinite loop can still happen, now that the target lookup
- * moved to callers of index_beginscan.
  */
 IndexScanDesc
 index_beginscan(Relation heapRelation,
 				Relation indexRelation,
 				Snapshot snapshot,
-				int nkeys, int norderbys,
-				int prefetch_max)
+				int nkeys, int norderbys)
 {
 	IndexScanDesc scan;
 
 	Assert(snapshot != InvalidSnapshot);
 
-	scan = index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot,
-									NULL, false, prefetch_max);
+	scan = index_beginscan_internal(indexRelation, nkeys, norderbys, snapshot, NULL, false);
 
 	/*
 	 * Save additional parameters into the scandesc.  Everything else was set
@@ -288,8 +266,7 @@ index_beginscan_bitmap(Relation indexRelation,
 
 	Assert(snapshot != InvalidSnapshot);
 
-	/* No prefetch in bitmap scans, prefetch is done by the heap scan. */
-	scan = index_beginscan_internal(indexRelation, nkeys, 0, snapshot, NULL, false, 0);
+	scan = index_beginscan_internal(indexRelation, nkeys, 0, snapshot, NULL, false);
 
 	/*
 	 * Save additional parameters into the scandesc.  Everything else was set
@@ -306,8 +283,7 @@ index_beginscan_bitmap(Relation indexRelation,
 static IndexScanDesc
 index_beginscan_internal(Relation indexRelation,
 						 int nkeys, int norderbys, Snapshot snapshot,
-						 ParallelIndexScanDesc pscan, bool temp_snap,
-						 int prefetch_max)
+						 ParallelIndexScanDesc pscan, bool temp_snap)
 {
 	IndexScanDesc scan;
 
@@ -330,31 +306,6 @@ index_beginscan_internal(Relation indexRelation,
 	/* Initialize information for parallel scan. */
 	scan->parallel_scan = pscan;
 	scan->xs_temp_snap = temp_snap;
-	scan->indexonly = false;
-
-	/*
-	 * With prefetching requested, initialize the prefetcher state.
-	 *
-	 * FIXME This should really be in the IndexScanState, not IndexScanDesc
-	 * (certainly the queues etc). But index_getnext_tid only gets the scan
-	 * descriptor, so how else would we pass it? Seems like a sign of wrong
-	 * layer doing the prefetching.
-	 */
-	if ((prefetch_max > 0) &&
-		(io_direct_flags & IO_DIRECT_DATA) == 0)	/* no prefetching for direct I/O */
-	{
-		IndexPrefetch prefetcher = palloc0(sizeof(IndexPrefetchData));
-
-		prefetcher->queueIndex = 0;
-		prefetcher->queueStart = 0;
-		prefetcher->queueEnd = 0;
-
-		prefetcher->prefetchTarget = 0;
-		prefetcher->prefetchMaxTarget = prefetch_max;
-		prefetcher->vmBuffer = InvalidBuffer;
-
-		scan->xs_prefetch = prefetcher;
-	}
 
 	return scan;
 }
@@ -391,20 +342,6 @@ index_rescan(IndexScanDesc scan,
 
 	scan->indexRelation->rd_indam->amrescan(scan, keys, nkeys,
 											orderbys, norderbys);
-
-	/* If we're prefetching for this index, maybe reset some of the state. */
-	if (scan->xs_prefetch != NULL)
-	{
-		IndexPrefetch prefetcher = scan->xs_prefetch;
-
-		prefetcher->queueStart = 0;
-		prefetcher->queueEnd = 0;
-		prefetcher->queueIndex = 0;
-		prefetcher->prefetchDone = false;
-
-		/* restart the incremental ramp-up */
-		prefetcher->prefetchTarget = 0;
-	}
 }
 
 /* ----------------
@@ -433,23 +370,6 @@ index_endscan(IndexScanDesc scan)
 	if (scan->xs_temp_snap)
 		UnregisterSnapshot(scan->xs_snapshot);
 
-	/*
-	 * If prefetching was enabled for this scan, log prefetch stats.
-	 *
-	 * FIXME This should really go to EXPLAIN ANALYZE instead.
-	 */
-	if (scan->xs_prefetch)
-	{
-		IndexPrefetch prefetch = scan->xs_prefetch;
-
-		elog(LOG, "index prefetch stats: requests " UINT64_FORMAT " prefetches " UINT64_FORMAT " (%f) skip cached " UINT64_FORMAT " sequential " UINT64_FORMAT,
-			 prefetch->countAll,
-			 prefetch->countPrefetch,
-			 prefetch->countPrefetch * 100.0 / prefetch->countAll,
-			 prefetch->countSkipCached,
-			 prefetch->countSkipSequential);
-	}
-
 	/* Release the scan data structure itself */
 	IndexScanEnd(scan);
 }
@@ -595,8 +515,7 @@ index_parallelrescan(IndexScanDesc scan)
  */
 IndexScanDesc
 index_beginscan_parallel(Relation heaprel, Relation indexrel, int nkeys,
-						 int norderbys, ParallelIndexScanDesc pscan,
-						 int prefetch_max)
+						 int norderbys, ParallelIndexScanDesc pscan)
 {
 	Snapshot	snapshot;
 	IndexScanDesc scan;
@@ -605,7 +524,7 @@ index_beginscan_parallel(Relation heaprel, Relation indexrel, int nkeys,
 	snapshot = RestoreSnapshot(pscan->ps_snapshot_data);
 	RegisterSnapshot(snapshot);
 	scan = index_beginscan_internal(indexrel, nkeys, norderbys, snapshot,
-									pscan, true, prefetch_max);
+									pscan, true);
 
 	/*
 	 * Save additional parameters into the scandesc.  Everything else was set
@@ -727,12 +646,13 @@ index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot)
  * ----------------
  */
 bool
-index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot *slot)
+index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot *slot,
+				   IndexPrefetch *prefetch)
 {
 	for (;;)
 	{
 		/* Do prefetching (if requested/enabled). */
-		index_prefetch_tids(scan, direction);
+		index_prefetch_tids(scan, direction, prefetch);
 
 		if (!scan->xs_heap_continue)
 		{
@@ -740,7 +660,7 @@ index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot *
 			bool		all_visible;
 
 			/* Time to fetch the next TID from the index */
-			tid = index_prefetch_get_tid(scan, direction, &all_visible);
+			tid = index_prefetch_get_tid(scan, direction, prefetch, &all_visible);
 
 			/* If we're out of index entries, we're done */
 			if (tid == NULL)
@@ -1135,7 +1055,7 @@ index_opclass_options(Relation indrel, AttrNumber attnum, Datum attoptions,
  * does not require a sequential pattern).
  */
 static bool
-index_prefetch_is_sequential(IndexPrefetch prefetch, BlockNumber block)
+index_prefetch_is_sequential(IndexPrefetch *prefetch, BlockNumber block)
 {
 	int			idx;
 
@@ -1270,9 +1190,9 @@ index_prefetch_is_sequential(IndexPrefetch prefetch, BlockNumber block)
  * what index_prefetch_is_sequential does.
  */
 static bool
-index_prefetch_add_cache(IndexPrefetch prefetch, BlockNumber block)
+index_prefetch_add_cache(IndexPrefetch *prefetch, BlockNumber block)
 {
-	PrefetchCacheEntry *entry;
+	IndexPrefetchCacheEntry *entry;
 
 	/* map the block number the the LRU */
 	int			lru = hash_uint32(block) % PREFETCH_LRU_COUNT;
@@ -1419,9 +1339,9 @@ index_prefetch_add_cache(IndexPrefetch prefetch, BlockNumber block)
  * value once in a while, and see what happens.
  */
 static void
-index_prefetch(IndexScanDesc scan, ItemPointer tid, bool skip_all_visible, bool *all_visible)
+index_prefetch(IndexScanDesc scan, IndexPrefetch *prefetch,
+			   ItemPointer tid, bool skip_all_visible, bool *all_visible)
 {
-	IndexPrefetch prefetch = scan->xs_prefetch;
 	BlockNumber block;
 
 	/* by default not all visible (or we didn't check) */
@@ -1502,33 +1422,33 @@ index_prefetch(IndexScanDesc scan, ItemPointer tid, bool skip_all_visible, bool
  * ----------------
  */
 ItemPointer
-index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
+index_getnext_tid(IndexScanDesc scan, ScanDirection direction,
+				  IndexPrefetch *prefetch)
 {
 	bool		all_visible;	/* ignored */
 
 	/* Do prefetching (if requested/enabled). */
-	index_prefetch_tids(scan, direction);
+	index_prefetch_tids(scan, direction, prefetch);
 
 	/* Read the TID from the queue (or directly from the index). */
-	return index_prefetch_get_tid(scan, direction, &all_visible);
+	return index_prefetch_get_tid(scan, direction, prefetch, &all_visible);
 }
 
 ItemPointer
-index_getnext_tid_vm(IndexScanDesc scan, ScanDirection direction, bool *all_visible)
+index_getnext_tid_vm(IndexScanDesc scan, ScanDirection direction,
+					 IndexPrefetch *prefetch, bool *all_visible)
 {
 	/* Do prefetching (if requested/enabled). */
-	index_prefetch_tids(scan, direction);
+	index_prefetch_tids(scan, direction, prefetch);
 
 	/* Read the TID from the queue (or directly from the index). */
-	return index_prefetch_get_tid(scan, direction, all_visible);
+	return index_prefetch_get_tid(scan, direction, prefetch, all_visible);
 }
 
 static void
-index_prefetch_tids(IndexScanDesc scan, ScanDirection direction)
+index_prefetch_tids(IndexScanDesc scan, ScanDirection direction,
+					IndexPrefetch *prefetch)
 {
-	/* for convenience */
-	IndexPrefetch prefetch = scan->xs_prefetch;
-
 	/*
 	 * If the prefetching is still active (i.e. enabled and we still
 	 * haven't finished reading TIDs from the scan), read enough TIDs into
@@ -1577,7 +1497,7 @@ index_prefetch_tids(IndexScanDesc scan, ScanDirection direction)
 			 * XXX index_getnext_tid_prefetch is only called for IOS (for now),
 			 * so skip prefetching of all-visible pages.
 			 */
-			index_prefetch(scan, tid, scan->indexonly, &all_visible);
+			index_prefetch(scan, prefetch, tid, prefetch->indexonly, &all_visible);
 
 			prefetch->queueItems[PREFETCH_QUEUE_INDEX(prefetch->queueEnd)].tid = *tid;
 			prefetch->queueItems[PREFETCH_QUEUE_INDEX(prefetch->queueEnd)].all_visible = all_visible;
@@ -1587,11 +1507,9 @@ index_prefetch_tids(IndexScanDesc scan, ScanDirection direction)
 }
 
 static ItemPointer
-index_prefetch_get_tid(IndexScanDesc scan, ScanDirection direction, bool *all_visible)
+index_prefetch_get_tid(IndexScanDesc scan, ScanDirection direction,
+					   IndexPrefetch *prefetch, bool *all_visible)
 {
-	/* for convenience */
-	IndexPrefetch prefetch = scan->xs_prefetch;
-
 	/*
 	 * With prefetching enabled (even if we already finished reading
 	 * all TIDs from the index scan), we need to return a TID from the
diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c
index 15fa3211667..0a136db6712 100644
--- a/src/backend/executor/execIndexing.c
+++ b/src/backend/executor/execIndexing.c
@@ -770,18 +770,18 @@ check_exclusion_or_unique_constraint(Relation heap, Relation index,
 	/*
 	 * May have to restart scan from this point if a potential conflict is
 	 * found.
-	 *
-	 * XXX Should this do index prefetch? Probably not worth it for unique
-	 * constraints, I guess? Otherwise we should calculate prefetch_target
-	 * just like in nodeIndexscan etc.
 	 */
 retry:
 	conflict = false;
 	found_self = false;
-	index_scan = index_beginscan(heap, index, &DirtySnapshot, indnkeyatts, 0, 0);
+	index_scan = index_beginscan(heap, index, &DirtySnapshot, indnkeyatts, 0);
 	index_rescan(index_scan, scankeys, indnkeyatts, NULL, 0);
 
-	while (index_getnext_slot(index_scan, ForwardScanDirection, existing_slot))
+	/*
+	 * XXX Would be nice to also benefit from prefetching here. All we need to
+	 * do is instantiate the prefetcher, I guess.
+	 */
+	while (index_getnext_slot(index_scan, ForwardScanDirection, existing_slot, NULL))
 	{
 		TransactionId xwait;
 		XLTW_Oper	reason_wait;
diff --git a/src/backend/executor/execReplication.c b/src/backend/executor/execReplication.c
index 91676ccff95..9498b00fa64 100644
--- a/src/backend/executor/execReplication.c
+++ b/src/backend/executor/execReplication.c
@@ -204,21 +204,21 @@ RelationFindReplTupleByIndex(Relation rel, Oid idxoid,
 	/* Build scan key. */
 	skey_attoff = build_replindex_scan_key(skey, rel, idxrel, searchslot);
 
-	/* Start an index scan.
-	 *
-	 * XXX Should this do index prefetching? We're looking for a single tuple,
-	 * probably using a PK / UNIQUE index, so does not seem worth it. If we
-	 * reconsider this, calclate prefetch_target like in nodeIndexscan.
-	 */
-	scan = index_beginscan(rel, idxrel, &snap, skey_attoff, 0, 0);
+	/* Start an index scan. */
+	scan = index_beginscan(rel, idxrel, &snap, skey_attoff, 0);
 
 retry:
 	found = false;
 
 	index_rescan(scan, skey, skey_attoff, NULL, 0);
 
-	/* Try to find the tuple */
-	while (index_getnext_slot(scan, ForwardScanDirection, outslot))
+	/*
+	 * Try to find the tuple
+	 *
+	 * XXX Would be nice to also benefit from prefetching here. All we need to
+	 * do is instantiate the prefetcher, I guess.
+	 */
+	while (index_getnext_slot(scan, ForwardScanDirection, outslot, NULL))
 	{
 		/*
 		 * Avoid expensive equality check if the index is primary key or
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index b6660c10a63..a7eadaf3db2 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -65,8 +65,8 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	IndexScanDesc scandesc;
 	TupleTableSlot *slot;
 	ItemPointer tid;
-	Relation	heapRel = node->ss.ss_currentRelation;
-	bool		all_visible;
+	IndexPrefetch  *prefetch;
+	bool			all_visible;
 
 	/*
 	 * extract necessary information from index scan node
@@ -80,52 +80,22 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	direction = ScanDirectionCombine(estate->es_direction,
 									 ((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir);
 	scandesc = node->ioss_ScanDesc;
+	prefetch = node->ioss_prefetch;
 	econtext = node->ss.ps.ps_ExprContext;
 	slot = node->ss.ss_ScanTupleSlot;
 
 	if (scandesc == NULL)
 	{
-		int			prefetch_max;
-
-		/*
-		 * Determine number of heap pages to prefetch for this index. This is
-		 * essentially just effective_io_concurrency for the table (or the
-		 * tablespace it's in).
-		 *
-		 * XXX Should this also look at plan.plan_rows and maybe cap the target
-		 * to that? Pointless to prefetch more than we expect to use. Or maybe
-		 * just reset to that value during prefetching, after reading the next
-		 * index page (or rather after rescan)?
-		 *
-		 * XXX Maybe reduce the value with parallel workers?
-		 */
-		prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace),
-						   node->ss.ps.plan->plan_rows);
-
 		/*
 		 * We reach here if the index only scan is not parallel, or if we're
 		 * serially executing an index only scan that was planned to be
 		 * parallel.
-		 *
-		 * XXX Maybe we should enable prefetching, but prefetch only pages that
-		 * are not all-visible (but checking that from the index code seems like
-		 * a violation of layering etc).
-		 *
-		 * XXX This might lead to IOS being slower than plain index scan, if the
-		 * table has a lot of pages that need recheck.
 		 */
 		scandesc = index_beginscan(node->ss.ss_currentRelation,
 								   node->ioss_RelationDesc,
 								   estate->es_snapshot,
 								   node->ioss_NumScanKeys,
-								   node->ioss_NumOrderByKeys,
-								   prefetch_max);
-
-		/*
-		 * Remember this is index-only scan, because of prefetching. Not the most
-		 * elegant way to pass this info.
-		 */
-		scandesc->indexonly = true;
+								   node->ioss_NumOrderByKeys);
 
 		node->ioss_ScanDesc = scandesc;
 
@@ -149,7 +119,7 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	/*
 	 * OK, now that we have what we need, fetch the next tuple.
 	 */
-	while ((tid = index_getnext_tid_vm(scandesc, direction, &all_visible)) != NULL)
+	while ((tid = index_getnext_tid_vm(scandesc, direction, prefetch, &all_visible)) != NULL)
 	{
 		bool		tuple_from_heap = false;
 
@@ -389,6 +359,16 @@ ExecReScanIndexOnlyScan(IndexOnlyScanState *node)
 					 node->ioss_ScanKeys, node->ioss_NumScanKeys,
 					 node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);
 
+	/* also reset the prefetcher, so that we start from scratch */
+	if (node->ioss_prefetch)
+	{
+		IndexPrefetch *prefetch = node->ioss_prefetch;
+
+		prefetch->queueIndex = 0;
+		prefetch->queueStart = 0;
+		prefetch->queueEnd = 0;
+	}
+
 	ExecScanReScan(&node->ss);
 }
 
@@ -417,14 +397,22 @@ ExecEndIndexOnlyScan(IndexOnlyScanState *node)
 	}
 
 	/* Release VM buffer pin from prefetcher, if any. */
-	if (indexScanDesc && indexScanDesc->xs_prefetch)
+	if (node->ioss_prefetch)
 	{
-		IndexPrefetch indexPrefetch = indexScanDesc->xs_prefetch;
+		IndexPrefetch *prefetch = node->ioss_prefetch;
+
+		/* XXX some debug info */
+		elog(LOG, "index prefetch stats: requests " UINT64_FORMAT " prefetches " UINT64_FORMAT " (%f) skip cached " UINT64_FORMAT " sequential " UINT64_FORMAT,
+			 prefetch->countAll,
+			 prefetch->countPrefetch,
+			 prefetch->countPrefetch * 100.0 / prefetch->countAll,
+			 prefetch->countSkipCached,
+			 prefetch->countSkipSequential);
 
-		if (indexPrefetch->vmBuffer != InvalidBuffer)
+		if (prefetch->vmBuffer != InvalidBuffer)
 		{
-			ReleaseBuffer(indexPrefetch->vmBuffer);
-			indexPrefetch->vmBuffer = InvalidBuffer;
+			ReleaseBuffer(prefetch->vmBuffer);
+			prefetch->vmBuffer = InvalidBuffer;
 		}
 	}
 
@@ -652,6 +640,63 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
 		indexstate->ioss_RuntimeContext = NULL;
 	}
 
+	/*
+	 * Also initialize index prefetcher.
+	 *
+	 * XXX No prefetching for direct I/O.
+	 */
+	if ((io_direct_flags & IO_DIRECT_DATA) == 0)
+	{
+		int			prefetch_max;
+		Relation    heapRel = indexstate->ss.ss_currentRelation;
+
+		/*
+		 * Determine number of heap pages to prefetch for this index. This is
+		 * essentially just effective_io_concurrency for the table (or the
+		 * tablespace it's in).
+		 *
+		 * XXX Should this also look at plan.plan_rows and maybe cap the target
+		 * to that? Pointless to prefetch more than we expect to use. Or maybe
+		 * just reset to that value during prefetching, after reading the next
+		 * index page (or rather after rescan)?
+		 *
+		 * XXX Maybe reduce the value with parallel workers?
+		 */
+		prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace),
+						   indexstate->ss.ps.plan->plan_rows);
+
+		/*
+		 * We reach here if the index only scan is not parallel, or if we're
+		 * serially executing an index only scan that was planned to be
+		 * parallel.
+		 *
+		 * XXX Maybe we should enable prefetching, but prefetch only pages that
+		 * are not all-visible (but checking that from the index code seems like
+		 * a violation of layering etc).
+		 *
+		 * XXX This might lead to IOS being slower than plain index scan, if the
+		 * table has a lot of pages that need recheck.
+		 *
+		 * Remember this is an index-only scan, because of prefetching. Not
+		 * the most elegant way to pass this info.
+		 */
+		if (prefetch_max > 0)
+		{
+			IndexPrefetch *prefetch = palloc0(sizeof(IndexPrefetch));
+
+			prefetch->queueIndex = 0;
+			prefetch->queueStart = 0;
+			prefetch->queueEnd = 0;
+
+			prefetch->prefetchTarget = 0;
+			prefetch->prefetchMaxTarget = prefetch_max;
+			prefetch->vmBuffer = InvalidBuffer;
+			prefetch->indexonly = true;
+
+			indexstate->ioss_prefetch = prefetch;
+		}
+	}
+
 	/*
 	 * all done.
 	 */
@@ -694,24 +739,6 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node,
 {
 	EState	   *estate = node->ss.ps.state;
 	ParallelIndexScanDesc piscan;
-	Relation	heapRel = node->ss.ss_currentRelation;
-	int			prefetch_max;
-
-	/*
-	 * Determine number of heap pages to prefetch for this index. This is
-	 * essentially just effective_io_concurrency for the table (or the
-	 * tablespace it's in).
-	 *
-	 * XXX Should this also look at plan.plan_rows and maybe cap the target
-	 * to that? Pointless to prefetch more than we expect to use. Or maybe
-	 * just reset to that value during prefetching, after reading the next
-	 * index page (or rather after rescan)?
-	 *
-	 * XXX Maybe reduce the value with parallel workers?
-	 */
-
-	prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace),
-					   node->ss.ps.plan->plan_rows);
 
 	piscan = shm_toc_allocate(pcxt->toc, node->ioss_PscanLen);
 	index_parallelscan_initialize(node->ss.ss_currentRelation,
@@ -724,8 +751,7 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node,
 								 node->ioss_RelationDesc,
 								 node->ioss_NumScanKeys,
 								 node->ioss_NumOrderByKeys,
-								 piscan,
-								 prefetch_max);
+								 piscan);
 	node->ioss_ScanDesc->xs_want_itup = true;
 	node->ioss_VMBuffer = InvalidBuffer;
 
@@ -763,23 +789,6 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node,
 								  ParallelWorkerContext *pwcxt)
 {
 	ParallelIndexScanDesc piscan;
-	Relation	heapRel = node->ss.ss_currentRelation;
-	int			prefetch_max;
-
-	/*
-	 * Determine number of heap pages to prefetch for this index. This is
-	 * essentially just effective_io_concurrency for the table (or the
-	 * tablespace it's in).
-	 *
-	 * XXX Should this also look at plan.plan_rows and maybe cap the target
-	 * to that? Pointless to prefetch more than we expect to use. Or maybe
-	 * just reset to that value during prefetching, after reading the next
-	 * index page (or rather after rescan)?
-	 *
-	 * XXX Maybe reduce the value with parallel workers?
-	 */
-	prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace),
-					   node->ss.ps.plan->plan_rows);
 
 	piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
 	node->ioss_ScanDesc =
@@ -787,8 +796,7 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node,
 								 node->ioss_RelationDesc,
 								 node->ioss_NumScanKeys,
 								 node->ioss_NumOrderByKeys,
-								 piscan,
-								 prefetch_max);
+								 piscan);
 	node->ioss_ScanDesc->xs_want_itup = true;
 
 	/*
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
index a5f5394ef49..b3282ec5a75 100644
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@@ -86,7 +86,7 @@ IndexNext(IndexScanState *node)
 	ScanDirection direction;
 	IndexScanDesc scandesc;
 	TupleTableSlot *slot;
-	Relation heapRel = node->ss.ss_currentRelation;
+	IndexPrefetch  *prefetch;
 
 	/*
 	 * extract necessary information from index scan node
@@ -100,26 +100,12 @@ IndexNext(IndexScanState *node)
 	direction = ScanDirectionCombine(estate->es_direction,
 									 ((IndexScan *) node->ss.ps.plan)->indexorderdir);
 	scandesc = node->iss_ScanDesc;
+	prefetch = node->iss_prefetch;
 	econtext = node->ss.ps.ps_ExprContext;
 	slot = node->ss.ss_ScanTupleSlot;
 
 	if (scandesc == NULL)
 	{
-		int	prefetch_max;
-
-		/*
-		 * Determine number of heap pages to prefetch for this index scan. This
-		 * is essentially just effective_io_concurrency for the table (or the
-		 * tablespace it's in).
-		 *
-		 * XXX Should this also look at plan.plan_rows and maybe cap the target
-		 * to that? Pointless to prefetch more than we expect to use. Or maybe
-		 * just reset to that value during prefetching, after reading the next
-		 * index page (or rather after rescan)?
-		 */
-		prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace),
-						   node->ss.ps.plan->plan_rows);
-
 		/*
 		 * We reach here if the index scan is not parallel, or if we're
 		 * serially executing an index scan that was planned to be parallel.
@@ -128,8 +114,7 @@ IndexNext(IndexScanState *node)
 								   node->iss_RelationDesc,
 								   estate->es_snapshot,
 								   node->iss_NumScanKeys,
-								   node->iss_NumOrderByKeys,
-								   prefetch_max);
+								   node->iss_NumOrderByKeys);
 
 		node->iss_ScanDesc = scandesc;
 
@@ -146,7 +131,7 @@ IndexNext(IndexScanState *node)
 	/*
 	 * ok, now that we have what we need, fetch the next tuple.
 	 */
-	while (index_getnext_slot(scandesc, direction, slot))
+	while (index_getnext_slot(scandesc, direction, slot, prefetch))
 	{
 		CHECK_FOR_INTERRUPTS();
 
@@ -195,7 +180,7 @@ IndexNextWithReorder(IndexScanState *node)
 	Datum	   *lastfetched_vals;
 	bool	   *lastfetched_nulls;
 	int			cmp;
-	Relation	heapRel = node->ss.ss_currentRelation;
+	IndexPrefetch *prefetch;
 
 	estate = node->ss.ps.state;
 
@@ -212,26 +197,12 @@ IndexNextWithReorder(IndexScanState *node)
 	Assert(ScanDirectionIsForward(estate->es_direction));
 
 	scandesc = node->iss_ScanDesc;
+	prefetch = node->iss_prefetch;
 	econtext = node->ss.ps.ps_ExprContext;
 	slot = node->ss.ss_ScanTupleSlot;
 
 	if (scandesc == NULL)
 	{
-		int	prefetch_max;
-
-		/*
-		 * Determine number of heap pages to prefetch for this index. This is
-		 * essentially just effective_io_concurrency for the table (or the
-		 * tablespace it's in).
-		 *
-		 * XXX Should this also look at plan.plan_rows and maybe cap the target
-		 * to that? Pointless to prefetch more than we expect to use. Or maybe
-		 * just reset to that value during prefetching, after reading the next
-		 * index page (or rather after rescan)?
-		 */
-		prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace),
-						   node->ss.ps.plan->plan_rows);
-
 		/*
 		 * We reach here if the index scan is not parallel, or if we're
 		 * serially executing an index scan that was planned to be parallel.
@@ -240,8 +211,7 @@ IndexNextWithReorder(IndexScanState *node)
 								   node->iss_RelationDesc,
 								   estate->es_snapshot,
 								   node->iss_NumScanKeys,
-								   node->iss_NumOrderByKeys,
-								   prefetch_max);
+								   node->iss_NumOrderByKeys);
 
 		node->iss_ScanDesc = scandesc;
 
@@ -294,7 +264,7 @@ IndexNextWithReorder(IndexScanState *node)
 		 * Fetch next tuple from the index.
 		 */
 next_indextuple:
-		if (!index_getnext_slot(scandesc, ForwardScanDirection, slot))
+		if (!index_getnext_slot(scandesc, ForwardScanDirection, slot, prefetch))
 		{
 			/*
 			 * No more tuples from the index.  But we still need to drain any
@@ -623,6 +593,16 @@ ExecReScanIndexScan(IndexScanState *node)
 					 node->iss_OrderByKeys, node->iss_NumOrderByKeys);
 	node->iss_ReachedEnd = false;
 
+	/* also reset the prefetcher, so that we start from scratch */
+	if (node->iss_prefetch)
+	{
+		IndexPrefetch *prefetch = node->iss_prefetch;
+
+		prefetch->queueIndex = 0;
+		prefetch->queueStart = 0;
+		prefetch->queueEnd = 0;
+	}
+
 	ExecScanReScan(&node->ss);
 }
 
@@ -829,6 +809,19 @@ ExecEndIndexScan(IndexScanState *node)
 	indexRelationDesc = node->iss_RelationDesc;
 	indexScanDesc = node->iss_ScanDesc;
 
+	/* XXX nothing to free, but print some debug info */
+	if (node->iss_prefetch)
+	{
+		IndexPrefetch *prefetch = node->iss_prefetch;
+
+		elog(LOG, "index prefetch stats: requests " UINT64_FORMAT " prefetches " UINT64_FORMAT " (%f) skip cached " UINT64_FORMAT " sequential " UINT64_FORMAT,
+			 prefetch->countAll,
+			 prefetch->countPrefetch,
+			 prefetch->countPrefetch * 100.0 / prefetch->countAll,
+			 prefetch->countSkipCached,
+			 prefetch->countSkipSequential);
+	}
+
 	/*
 	 * close the index relation (no-op if we didn't open it)
 	 */
@@ -1101,6 +1094,45 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags)
 		indexstate->iss_RuntimeContext = NULL;
 	}
 
+	/*
+	 * Also initialize index prefetcher.
+	 *
+	 * XXX No prefetching for direct I/O.
+	 */
+	if ((io_direct_flags & IO_DIRECT_DATA) == 0)
+	{
+		int	prefetch_max;
+		Relation    heapRel = indexstate->ss.ss_currentRelation;
+
+		/*
+		 * Determine number of heap pages to prefetch for this index scan. This
+		 * is essentially just effective_io_concurrency for the table (or the
+		 * tablespace it's in).
+		 *
+		 * XXX Should this also look at plan.plan_rows and maybe cap the target
+		 * to that? Pointless to prefetch more than we expect to use. Or maybe
+		 * just reset to that value during prefetching, after reading the next
+		 * index page (or rather after rescan)?
+		 */
+		prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace),
+						   indexstate->ss.ps.plan->plan_rows);
+
+		if (prefetch_max > 0)
+		{
+			IndexPrefetch *prefetch = palloc0(sizeof(IndexPrefetch));
+
+			prefetch->queueIndex = 0;
+			prefetch->queueStart = 0;
+			prefetch->queueEnd = 0;
+
+			prefetch->prefetchTarget = 0;
+			prefetch->prefetchMaxTarget = prefetch_max;
+			prefetch->vmBuffer = InvalidBuffer;
+
+			indexstate->iss_prefetch = prefetch;
+		}
+	}
+
 	/*
 	 * all done.
 	 */
@@ -1697,24 +1729,6 @@ ExecIndexScanInitializeDSM(IndexScanState *node,
 {
 	EState	   *estate = node->ss.ps.state;
 	ParallelIndexScanDesc piscan;
-	Relation	heapRel = node->ss.ss_currentRelation;
-	int			prefetch_max;
-
-	/*
-	 * Determine number of heap pages to prefetch for this index. This is
-	 * essentially just effective_io_concurrency for the table (or the
-	 * tablespace it's in).
-	 *
-	 * XXX Should this also look at plan.plan_rows and maybe cap the target
-	 * to that? Pointless to prefetch more than we expect to use. Or maybe
-	 * just reset to that value during prefetching, after reading the next
-	 * index page (or rather after rescan)?
-	 *
-	 * XXX Maybe reduce the value with parallel workers?
-	 */
-
-	prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace),
-					   node->ss.ps.plan->plan_rows);
 
 	piscan = shm_toc_allocate(pcxt->toc, node->iss_PscanLen);
 	index_parallelscan_initialize(node->ss.ss_currentRelation,
@@ -1727,8 +1741,7 @@ ExecIndexScanInitializeDSM(IndexScanState *node,
 								 node->iss_RelationDesc,
 								 node->iss_NumScanKeys,
 								 node->iss_NumOrderByKeys,
-								 piscan,
-								 prefetch_max);
+								 piscan);
 
 	/*
 	 * If no run-time keys to calculate or they are ready, go ahead and pass
@@ -1764,23 +1777,6 @@ ExecIndexScanInitializeWorker(IndexScanState *node,
 							  ParallelWorkerContext *pwcxt)
 {
 	ParallelIndexScanDesc piscan;
-	Relation	heapRel = node->ss.ss_currentRelation;
-	int			prefetch_max;
-
-	/*
-	 * Determine number of heap pages to prefetch for this index. This is
-	 * essentially just effective_io_concurrency for the table (or the
-	 * tablespace it's in).
-	 *
-	 * XXX Should this also look at plan.plan_rows and maybe cap the target
-	 * to that? Pointless to prefetch more than we expect to use. Or maybe
-	 * just reset to that value during prefetching, after reading the next
-	 * index page (or rather after rescan)?
-	 *
-	 * XXX Maybe reduce the value with parallel workers?
-	 */
-	prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace),
-					   node->ss.ps.plan->plan_rows);
 
 	piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
 	node->iss_ScanDesc =
@@ -1788,8 +1784,7 @@ ExecIndexScanInitializeWorker(IndexScanState *node,
 								 node->iss_RelationDesc,
 								 node->iss_NumScanKeys,
 								 node->iss_NumOrderByKeys,
-								 piscan,
-								 prefetch_max);
+								 piscan);
 
 	/*
 	 * If no run-time keys to calculate or they are ready, go ahead and pass
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 8b662d371dd..b5c79359425 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -6289,16 +6289,15 @@ get_actual_variable_endpoint(Relation heapRel,
 	InitNonVacuumableSnapshot(SnapshotNonVacuumable,
 							  GlobalVisTestFor(heapRel));
 
-	/* XXX Maybe should do prefetching using the default prefetch parameters? */
 	index_scan = index_beginscan(heapRel, indexRel,
 								 &SnapshotNonVacuumable,
-								 1, 0, 0);
+								 1, 0);
 	/* Set it up for index-only scan */
 	index_scan->xs_want_itup = true;
 	index_rescan(index_scan, scankeys, 1, NULL, 0);
 
 	/* Fetch first/next tuple in specified direction */
-	while ((tid = index_getnext_tid(index_scan, indexscandir)) != NULL)
+	while ((tid = index_getnext_tid(index_scan, indexscandir, NULL)) != NULL)
 	{
 		BlockNumber block = ItemPointerGetBlockNumber(tid);
 
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index f6882f644d2..c0c46d7a05f 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -129,6 +129,110 @@ typedef struct IndexOrderByDistance
 	bool		isnull;
 } IndexOrderByDistance;
 
+
+
+/*
+ * Cache of recently prefetched blocks, organized as a hash table of
+ * small LRU caches. Doesn't need to be perfectly accurate, but we
+ * aim to make false positives/negatives reasonably low.
+ */
+typedef struct IndexPrefetchCacheEntry {
+	BlockNumber		block;
+	uint64			request;
+} IndexPrefetchCacheEntry;
+
+/*
+ * Size of the cache of recently prefetched blocks - shouldn't be too
+ * small or too large. 1024 seems about right, it covers ~8MB of data.
+ * It's somewhat arbitrary, there's no particular formula saying it
+ * should not be higher/lower.
+ *
+ * The cache is structured as an array of small LRU caches, so the total
+ * size needs to be a multiple of LRU size. The LRU should be tiny to
+ * keep linear search cheap enough.
+ *
+ * XXX Maybe we could consider effective_cache_size or something?
+ */
+#define		PREFETCH_LRU_SIZE		8
+#define		PREFETCH_LRU_COUNT		128
+#define		PREFETCH_CACHE_SIZE		(PREFETCH_LRU_SIZE * PREFETCH_LRU_COUNT)
+
+/*
+ * Used to detect sequential patterns (and disable prefetching).
+ */
+#define		PREFETCH_QUEUE_HISTORY			8
+#define		PREFETCH_SEQ_PATTERN_BLOCKS		4
+
+typedef struct IndexPrefetchEntry
+{
+	ItemPointerData		tid;
+	bool				all_visible;
+} IndexPrefetchEntry;
+
+typedef struct IndexPrefetch
+{
+	/*
+	 * XXX We need to disable this in some cases (e.g. when using index-only
+	 * scans, we don't want to prefetch pages). Or maybe we should prefetch
+	 * only pages that are not all-visible, that'd be even better.
+	 */
+	int			prefetchTarget;	/* how far we should be prefetching */
+	int			prefetchMaxTarget;	/* maximum prefetching distance */
+	int			prefetchReset;	/* reset to this distance on rescan */
+	bool		prefetchDone;	/* did we get all TIDs from the index? */
+
+	/* runtime statistics */
+	uint64		countAll;		/* all prefetch requests */
+	uint64		countPrefetch;	/* actual prefetches */
+	uint64		countSkipSequential;
+	uint64		countSkipCached;
+
+	/* used when prefetching index-only scans */
+	bool		indexonly;
+	Buffer		vmBuffer;
+
+	/*
+	 * Queue of TIDs to prefetch.
+	 *
+	 * XXX Sizing for MAX_IO_CONCURRENCY may be overkill, but it seems simpler
+	 * than dynamically adjusting for custom values.
+	 */
+	IndexPrefetchEntry	queueItems[MAX_IO_CONCURRENCY];
+	uint64			queueIndex;	/* next TID to prefetch */
+	uint64			queueStart;	/* first valid TID in queue */
+	uint64			queueEnd;	/* first invalid (empty) TID in queue */
+
+	/*
+	 * A couple of last prefetched blocks, used to check for a certain access
+	 * pattern and skip prefetching (e.g. for sequential access).
+	 *
+	 * XXX Separate from the main queue, because we only want to compare the
+	 * block numbers, not the whole TID. In sequential access it's likely we
+	 * read many items from each page, and we don't want to check many items
+	 * (as that is much more expensive).
+	 */
+	BlockNumber		blockItems[PREFETCH_QUEUE_HISTORY];
+	uint64			blockIndex;	/* index in the block (points to the first
+								 * empty entry) */
+
+	/*
+	 * Cache of recently prefetched blocks, organized as a hash table of
+	 * small LRU caches.
+	 */
+	uint64				prefetchReqNumber;
+	IndexPrefetchCacheEntry	prefetchCache[PREFETCH_CACHE_SIZE];
+
+} IndexPrefetch;
+
+#define PREFETCH_QUEUE_INDEX(a)	((a) % (MAX_IO_CONCURRENCY))
+#define PREFETCH_QUEUE_EMPTY(p)	((p)->queueEnd == (p)->queueIndex)
+#define PREFETCH_ENABLED(p)		((p) && ((p)->prefetchMaxTarget > 0))
+#define PREFETCH_FULL(p)		((p)->queueEnd - (p)->queueIndex == (p)->prefetchTarget)
+#define PREFETCH_DONE(p)		((p) && ((p)->prefetchDone && PREFETCH_QUEUE_EMPTY(p)))
+#define PREFETCH_ACTIVE(p)		(PREFETCH_ENABLED(p) && !(p)->prefetchDone)
+#define PREFETCH_BLOCK_INDEX(v)	((v) % PREFETCH_QUEUE_HISTORY)
+
+
 /*
  * generalized index_ interface routines (in indexam.c)
  */
@@ -155,8 +259,7 @@ extern void index_insert_cleanup(Relation indexRelation,
 extern IndexScanDesc index_beginscan(Relation heapRelation,
 									 Relation indexRelation,
 									 Snapshot snapshot,
-									 int nkeys, int norderbys,
-									 int prefetch_max);
+									 int nkeys, int norderbys);
 extern IndexScanDesc index_beginscan_bitmap(Relation indexRelation,
 											Snapshot snapshot,
 											int nkeys);
@@ -173,17 +276,19 @@ extern void index_parallelscan_initialize(Relation heapRelation,
 extern void index_parallelrescan(IndexScanDesc scan);
 extern IndexScanDesc index_beginscan_parallel(Relation heaprel,
 											  Relation indexrel, int nkeys, int norderbys,
-											  ParallelIndexScanDesc pscan,
-											  int prefetch_max);
+											  ParallelIndexScanDesc pscan);
 extern ItemPointer index_getnext_tid(IndexScanDesc scan,
-									 ScanDirection direction);
+									 ScanDirection direction,
+									 IndexPrefetch *prefetch);
 extern ItemPointer index_getnext_tid_vm(IndexScanDesc scan,
 										ScanDirection direction,
+										IndexPrefetch *prefetch,
 										bool *all_visible);
 struct TupleTableSlot;
 extern bool index_fetch_heap(IndexScanDesc scan, struct TupleTableSlot *slot);
 extern bool index_getnext_slot(IndexScanDesc scan, ScanDirection direction,
-							   struct TupleTableSlot *slot);
+							   struct TupleTableSlot *slot,
+							   IndexPrefetch *prefetch);
 extern int64 index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap);
 
 extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info,
@@ -238,104 +343,4 @@ extern HeapTuple systable_getnext_ordered(SysScanDesc sysscan,
 										  ScanDirection direction);
 extern void systable_endscan_ordered(SysScanDesc sysscan);
 
-/*
- * Cache of recently prefetched blocks, organized as a hash table of
- * small LRU caches. Doesn't need to be perfectly accurate, but we
- * aim to make false positives/negatives reasonably low.
- */
-typedef struct PrefetchCacheEntry {
-	BlockNumber		block;
-	uint64			request;
-} PrefetchCacheEntry;
-
-/*
- * Size of the cache of recently prefetched blocks - shouldn't be too
- * small or too large. 1024 seems about right, it covers ~8MB of data.
- * It's somewhat arbitrary, there's no particular formula saying it
- * should not be higher/lower.
- *
- * The cache is structured as an array of small LRU caches, so the total
- * size needs to be a multiple of LRU size. The LRU should be tiny to
- * keep linear search cheap enough.
- *
- * XXX Maybe we could consider effective_cache_size or something?
- */
-#define		PREFETCH_LRU_SIZE		8
-#define		PREFETCH_LRU_COUNT		128
-#define		PREFETCH_CACHE_SIZE		(PREFETCH_LRU_SIZE * PREFETCH_LRU_COUNT)
-
-/*
- * Used to detect sequential patterns (and disable prefetching).
- */
-#define		PREFETCH_QUEUE_HISTORY			8
-#define		PREFETCH_SEQ_PATTERN_BLOCKS		4
-
-typedef struct PrefetchEntry
-{
-	ItemPointerData		tid;
-	bool				all_visible;
-} PrefetchEntry;
-
-typedef struct IndexPrefetchData
-{
-	/*
-	 * XXX We need to disable this in some cases (e.g. when using index-only
-	 * scans, we don't want to prefetch pages). Or maybe we should prefetch
-	 * only pages that are not all-visible, that'd be even better.
-	 */
-	int			prefetchTarget;	/* how far we should be prefetching */
-	int			prefetchMaxTarget;	/* maximum prefetching distance */
-	int			prefetchReset;	/* reset to this distance on rescan */
-	bool		prefetchDone;	/* did we get all TIDs from the index? */
-
-	/* runtime statistics */
-	uint64		countAll;		/* all prefetch requests */
-	uint64		countPrefetch;	/* actual prefetches */
-	uint64		countSkipSequential;
-	uint64		countSkipCached;
-
-	/* used when prefetching index-only scans */
-	Buffer		vmBuffer;
-
-	/*
-	 * Queue of TIDs to prefetch.
-	 *
-	 * XXX Sizing for MAX_IO_CONCURRENCY may be overkill, but it seems simpler
-	 * than dynamically adjusting for custom values.
-	 */
-	PrefetchEntry	queueItems[MAX_IO_CONCURRENCY];
-	uint64			queueIndex;	/* next TID to prefetch */
-	uint64			queueStart;	/* first valid TID in queue */
-	uint64			queueEnd;	/* first invalid (empty) TID in queue */
-
-	/*
-	 * A couple of last prefetched blocks, used to check for certain access
-	 * pattern and skip prefetching - e.g. for sequential access).
-	 *
-	 * XXX Separate from the main queue, because we only want to compare the
-	 * block numbers, not the whole TID. In sequential access it's likely we
-	 * read many items from each page, and we don't want to check many items
-	 * (as that is much more expensive).
-	 */
-	BlockNumber		blockItems[PREFETCH_QUEUE_HISTORY];
-	uint64			blockIndex;	/* index in the block (points to the first
-								 * empty entry)*/
-
-	/*
-	 * Cache of recently prefetched blocks, organized as a hash table of
-	 * small LRU caches.
-	 */
-	uint64				prefetchReqNumber;
-	PrefetchCacheEntry	prefetchCache[PREFETCH_CACHE_SIZE];
-
-} IndexPrefetchData;
-
-#define PREFETCH_QUEUE_INDEX(a)	((a) % (MAX_IO_CONCURRENCY))
-#define PREFETCH_QUEUE_EMPTY(p)	((p)->queueEnd == (p)->queueIndex)
-#define PREFETCH_ENABLED(p)		((p) && ((p)->prefetchMaxTarget > 0))
-#define PREFETCH_FULL(p)		((p)->queueEnd - (p)->queueIndex == (p)->prefetchTarget)
-#define PREFETCH_DONE(p)		((p) && ((p)->prefetchDone && PREFETCH_QUEUE_EMPTY(p)))
-#define PREFETCH_ACTIVE(p)		(PREFETCH_ENABLED(p) && !(p)->prefetchDone)
-#define PREFETCH_BLOCK_INDEX(v)	((v) % PREFETCH_QUEUE_HISTORY)
-
 #endif							/* GENAM_H */
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index d5903492c6e..d03360eac04 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -106,12 +106,6 @@ typedef struct IndexFetchTableData
 	Relation	rel;
 } IndexFetchTableData;
 
-/*
- * Forward declarations, defined in genam.h.
- */
-typedef struct IndexPrefetchData IndexPrefetchData;
-typedef struct IndexPrefetchData *IndexPrefetch;
-
 /*
  * We use the same IndexScanDescData structure for both amgettuple-based
  * and amgetbitmap-based index scans.  Some fields are only relevant in
@@ -135,7 +129,6 @@ typedef struct IndexScanDescData
 	bool		ignore_killed_tuples;	/* do not return killed entries */
 	bool		xactStartedInRecovery;	/* prevents killing/seeing killed
 										 * tuples */
-	bool		indexonly;			/* is this index-only scan? */
 
 	/* index access method's private state */
 	void	   *opaque;			/* access-method-specific info */
@@ -169,9 +162,6 @@ typedef struct IndexScanDescData
 	bool	   *xs_orderbynulls;
 	bool		xs_recheckorderby;
 
-	/* prefetching state (or NULL if disabled for this scan) */
-	IndexPrefetchData *xs_prefetch;
-
 	/* parallel index scan information, in shared memory */
 	struct ParallelIndexScanDescData *parallel_scan;
 }			IndexScanDescData;
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 5d7f17dee07..8745453a5b4 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1529,6 +1529,7 @@ typedef struct
 	bool	   *elem_nulls;		/* array of num_elems is-null flags */
 } IndexArrayKeyInfo;
 
+
 /* ----------------
  *	 IndexScanState information
  *
@@ -1580,6 +1581,8 @@ typedef struct IndexScanState
 	bool	   *iss_OrderByTypByVals;
 	int16	   *iss_OrderByTypLens;
 	Size		iss_PscanLen;
+
+	IndexPrefetch *iss_prefetch;
 } IndexScanState;
 
 /* ----------------
@@ -1618,6 +1621,7 @@ typedef struct IndexOnlyScanState
 	TupleTableSlot *ioss_TableSlot;
 	Buffer		ioss_VMBuffer;
 	Size		ioss_PscanLen;
+	IndexPrefetch *ioss_prefetch;
 } IndexOnlyScanState;
 
 /* ----------------
-- 
2.41.0

