From 047eda4fa39e5b37b5beaaa3f24195a8d7aa6a5e Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Sat, 14 Oct 2023 00:13:26 +0200
Subject: [PATCH 4/4] PoC prefetch for IOS

---
 src/backend/access/index/indexam.c       | 140 ++++++++++++++++++++++-
 src/backend/executor/nodeIndexonlyscan.c |  63 +++++++++-
 src/include/access/genam.h               |   2 +
 3 files changed, 195 insertions(+), 10 deletions(-)

diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 8c56acfd638..4ae4e867770 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -49,6 +49,7 @@
 #include "access/relscan.h"
 #include "access/tableam.h"
 #include "access/transam.h"
+#include "access/visibilitymap.h"
 #include "access/xlog.h"
 #include "catalog/index.h"
 #include "catalog/pg_amproc.h"
@@ -111,7 +112,7 @@ static IndexScanDesc index_beginscan_internal(Relation indexRelation,
 											  ParallelIndexScanDesc pscan, bool temp_snap,
 											  int prefetch_max);
 
-static void index_prefetch(IndexScanDesc scan, ItemPointer tid);
+static void index_prefetch(IndexScanDesc scan, ItemPointer tid, bool skip_all_visible);
 
 
 /* ----------------------------------------------------------------
@@ -755,7 +756,7 @@ index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot *
 				 * FIXME For IOS, this should prefetch only pages that are not
 				 * fully visible.
 				 */
-				index_prefetch(scan, tid);
+				index_prefetch(scan, tid, false);
 			}
 		}
 
@@ -1439,7 +1440,7 @@ index_prefetch_add_cache(IndexPrefetch prefetch, BlockNumber block)
  * in BTScanPosData.nextPage.
  */
 static void
-index_prefetch(IndexScanDesc scan, ItemPointer tid)
+index_prefetch(IndexScanDesc scan, ItemPointer tid, bool skip_all_visible)
 {
 	IndexPrefetch prefetch = scan->xs_prefetch;
 	BlockNumber block;
@@ -1462,10 +1463,36 @@ index_prefetch(IndexScanDesc scan, ItemPointer tid)
 	 */
 	Assert(scan->heapRelation);
 
-	prefetch->countAll++;
-
 	block = ItemPointerGetBlockNumber(tid);
 
+	/*
+	 * When prefetching for IOS, we want to only prefetch pages that are not
+	 * marked as all-visible (because not fetching all-visible pages is the
+	 * point of IOS).
+	 *
+	 * XXX This is not great, because it releases the VM buffer for each TID
+	 * we consider to prefetch. We should reuse that somehow, similar to the
+	 * actual IOS code. Ideally, we should use the same ioss_VMBuffer (if
+	 * we can propagate it here). Or at least do it for a bulk of prefetches,
+	 * although that's not very useful - after the ramp-up we will prefetch
+	 * the pages one by one anyway.
+	 */
+	if (skip_all_visible)
+	{
+		bool	all_visible;
+		Buffer	vmbuffer = InvalidBuffer;
+
+		all_visible = VM_ALL_VISIBLE(scan->heapRelation,
+									 block,
+									 &vmbuffer);
+
+		if (vmbuffer != InvalidBuffer)
+			ReleaseBuffer(vmbuffer);
+
+		if (all_visible)
+			return;
+	}
+
 	/*
 	 * Do not prefetch the same block over and over again,
 	 *
@@ -1480,4 +1507,107 @@ index_prefetch(IndexScanDesc scan, ItemPointer tid)
 		PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, block);
 		pgBufferUsage.blks_prefetches++;
 	}
+
+	prefetch->countAll++;
+}
+
+/* ----------------
+ * index_getnext_tid_prefetch - get the next TID from a scan
+ *
+ * The result is the next TID satisfying the scan keys,
+ * or NULL if no more matching tuples exist.
+ *
+ * FIXME not sure this handles xs_heapfetch correctly.
+ * ----------------
+ */
+ItemPointer
+index_getnext_tid_prefetch(IndexScanDesc scan, ScanDirection direction)
+{
+	IndexPrefetch prefetch = scan->xs_prefetch; /* for convenience */
+
+	/*
+	 * If the prefetching is still active (i.e. enabled and we still
+	 * haven't finished reading TIDs from the scan), read enough TIDs into
+	 * the queue until we hit the current target.
+	 */
+	if (PREFETCH_ACTIVE(prefetch))
+	{
+		/*
+		 * Ramp up the prefetch distance incrementally.
+		 *
+		 * Intentionally done first, before reading the TIDs into the
+		 * queue, so that there's always at least one item. Otherwise we
+		 * might get into a situation where we start with target=0 and no
+		 * TIDs loaded.
+		 */
+		prefetch->prefetchTarget = Min(prefetch->prefetchTarget + 1,
+									   prefetch->prefetchMaxTarget);
+
+		/*
+		 * Now read TIDs from the index until the queue is full (with
+		 * respect to the current prefetch target).
+		 */
+		while (!PREFETCH_FULL(prefetch))
+		{
+			ItemPointer tid;
+
+			/* Time to fetch the next TID from the index */
+			tid = index_getnext_tid(scan, direction);
+
+			/*
+			 * If we're out of index entries, we're done (and we mark
+			 * the prefetcher as inactive).
+			 */
+			if (tid == NULL)
+			{
+				prefetch->prefetchDone = true;
+				break;
+			}
+
+			Assert(ItemPointerEquals(tid, &scan->xs_heaptid));
+
+			prefetch->queueItems[PREFETCH_QUEUE_INDEX(prefetch->queueEnd)] = *tid;
+			prefetch->queueEnd++;
+
+			/*
+			 * Issue the actual prefetch request for the new TID.
+			 *
+			 * XXX index_getnext_tid_prefetch is only called for IOS (for now),
+			 * so skip prefetching of all-visible pages.
+			 */
+			index_prefetch(scan, tid, true);
+		}
+	}
+
+	/*
+	 * With prefetching enabled (even if we already finished reading
+	 * all TIDs from the index scan), we need to return a TID from the
+	 * queue. Otherwise, we just get the next TID from the scan
+	 * directly.
+	 */
+	if (PREFETCH_ENABLED(prefetch))
+	{
+		/* Did we reach the end of the scan and the queue is empty? */
+		if (PREFETCH_DONE(prefetch))
+			return NULL;
+
+		scan->xs_heaptid = prefetch->queueItems[PREFETCH_QUEUE_INDEX(prefetch->queueIndex)];
+		prefetch->queueIndex++;
+	}
+	else				/* not prefetching, just do the regular work  */
+	{
+		ItemPointer tid;
+
+		/* Time to fetch the next TID from the index */
+		tid = index_getnext_tid(scan, direction);
+
+		/* If we're out of index entries, we're done */
+		if (tid == NULL)
+			return NULL;
+
+		Assert(ItemPointerEquals(tid, &scan->xs_heaptid));
+	}
+
+	/* Return the TID of the tuple we found. */
+	return &scan->xs_heaptid;
 }
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index 7c27913502c..855afd5ba76 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -43,7 +43,7 @@
 #include "storage/predicate.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
-
+#include "utils/spccache.h"
 
 static TupleTableSlot *IndexOnlyNext(IndexOnlyScanState *node);
 static void StoreIndexTuple(TupleTableSlot *slot, IndexTuple itup,
@@ -65,6 +65,7 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	IndexScanDesc scandesc;
 	TupleTableSlot *slot;
 	ItemPointer tid;
+	Relation	heapRel = node->ss.ss_currentRelation;
 
 	/*
 	 * extract necessary information from index scan node
@@ -83,6 +84,23 @@ IndexOnlyNext(IndexOnlyScanState *node)
 
 	if (scandesc == NULL)
 	{
+		int			prefetch_max;
+
+		/*
+		 * Determine number of heap pages to prefetch for this index. This is
+		 * essentially just effective_io_concurrency for the table (or the
+		 * tablespace it's in).
+		 *
+		 * XXX Should this also look at plan.plan_rows and maybe cap the target
+		 * to that? Pointless to prefetch more than we expect to use. Or maybe
+		 * just reset to that value during prefetching, after reading the next
+		 * index page (or rather after rescan)?
+		 *
+		 * XXX Maybe reduce the value with parallel workers?
+		 */
+		prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace),
+						   node->ss.ps.plan->plan_rows);
+
 		/*
 		 * We reach here if the index only scan is not parallel, or if we're
 		 * serially executing an index only scan that was planned to be
@@ -100,7 +118,7 @@ IndexOnlyNext(IndexOnlyScanState *node)
 								   estate->es_snapshot,
 								   node->ioss_NumScanKeys,
 								   node->ioss_NumOrderByKeys,
-								   0);	/* no prefetching for IOS */
+								   prefetch_max);
 
 		node->ioss_ScanDesc = scandesc;
 
@@ -124,7 +142,7 @@ IndexOnlyNext(IndexOnlyScanState *node)
 	/*
 	 * OK, now that we have what we need, fetch the next tuple.
 	 */
-	while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
+	while ((tid = index_getnext_tid_prefetch(scandesc, direction)) != NULL)
 	{
 		bool		tuple_from_heap = false;
 
@@ -654,6 +672,24 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node,
 {
 	EState	   *estate = node->ss.ps.state;
 	ParallelIndexScanDesc piscan;
+	Relation	heapRel = node->ss.ss_currentRelation;
+	int			prefetch_max;
+
+	/*
+	 * Determine number of heap pages to prefetch for this index. This is
+	 * essentially just effective_io_concurrency for the table (or the
+	 * tablespace it's in).
+	 *
+	 * XXX Should this also look at plan.plan_rows and maybe cap the target
+	 * to that? Pointless to prefetch more than we expect to use. Or maybe
+	 * just reset to that value during prefetching, after reading the next
+	 * index page (or rather after rescan)?
+	 *
+	 * XXX Maybe reduce the value with parallel workers?
+	 */
+
+	prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace),
+					   node->ss.ps.plan->plan_rows);
 
 	piscan = shm_toc_allocate(pcxt->toc, node->ioss_PscanLen);
 	index_parallelscan_initialize(node->ss.ss_currentRelation,
@@ -667,7 +703,7 @@ ExecIndexOnlyScanInitializeDSM(IndexOnlyScanState *node,
 								 node->ioss_NumScanKeys,
 								 node->ioss_NumOrderByKeys,
 								 piscan,
-								 0);	/* no prefetching for IOS */
+								 prefetch_max);
 	node->ioss_ScanDesc->xs_want_itup = true;
 	node->ioss_VMBuffer = InvalidBuffer;
 
@@ -705,6 +741,23 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node,
 								  ParallelWorkerContext *pwcxt)
 {
 	ParallelIndexScanDesc piscan;
+	Relation	heapRel = node->ss.ss_currentRelation;
+	int			prefetch_max;
+
+	/*
+	 * Determine number of heap pages to prefetch for this index. This is
+	 * essentially just effective_io_concurrency for the table (or the
+	 * tablespace it's in).
+	 *
+	 * XXX Should this also look at plan.plan_rows and maybe cap the target
+	 * to that? Pointless to prefetch more than we expect to use. Or maybe
+	 * just reset to that value during prefetching, after reading the next
+	 * index page (or rather after rescan)?
+	 *
+	 * XXX Maybe reduce the value with parallel workers?
+	 */
+	prefetch_max = Min(get_tablespace_io_concurrency(heapRel->rd_rel->reltablespace),
+					   node->ss.ps.plan->plan_rows);
 
 	piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
 	node->ioss_ScanDesc =
@@ -713,7 +766,7 @@ ExecIndexOnlyScanInitializeWorker(IndexOnlyScanState *node,
 								 node->ioss_NumScanKeys,
 								 node->ioss_NumOrderByKeys,
 								 piscan,
-								 0);	/* no prefetching for IOS */
+								 prefetch_max);
 	node->ioss_ScanDesc->xs_want_itup = true;
 
 	/*
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index ceb6279b0b0..6e92758ced5 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -175,6 +175,8 @@ extern IndexScanDesc index_beginscan_parallel(Relation heaprel,
 											  int prefetch_max);
 extern ItemPointer index_getnext_tid(IndexScanDesc scan,
 									 ScanDirection direction);
+extern ItemPointer index_getnext_tid_prefetch(IndexScanDesc scan,
+											  ScanDirection direction);
 struct TupleTableSlot;
 extern bool index_fetch_heap(IndexScanDesc scan, struct TupleTableSlot *slot);
 extern bool index_getnext_slot(IndexScanDesc scan, ScanDirection direction,
-- 
2.41.0

