From 8571e2b0ee705ea7d1b8d2f285c361ea010e4d2d Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Sat, 18 Nov 2023 12:05:53 +0100
Subject: [PATCH v20231124 4/7] reintroduce the LRU cache of recent blocks

Useful for detecting sequential patterns, for which read-ahead works
better than our prefetching, and checking shared buffers and page cache
is not sufficient.
---
 src/backend/access/index/indexam.c | 137 +++++++++++++++++++++++++++++
 src/include/access/genam.h         |  30 +++++++
 2 files changed, 167 insertions(+)

diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 54a704338f1..7456a69ab34 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -1082,6 +1082,125 @@ index_opclass_options(Relation indrel, AttrNumber attnum, Datum attoptions,
 	return build_local_reloptions(&relopts, attoptions, validate);
 }
 
+/*
+ * index_prefetch_is_sequential
+ *		Track the block number and check if the I/O pattern is sequential,
+ *		or if the same block was just prefetched.
+ *
+ * Prefetching is cheap, but for some access patterns the benefits are small
+ * compared to the extra overhead. In particular, for sequential access the
+ * read-ahead performed by the OS is very effective/efficient. Doing more
+ * prefetching is just increasing the costs.
+ *
+ * This tries to identify simple sequential patterns, so that we can skip
+ * the prefetching request. This is implemented by having a small queue
+ * of block numbers, and checking it before prefetching another block.
+ *
+ * We look at the preceding PREFETCH_SEQ_PATTERN_BLOCKS blocks, and see if
+ * they are sequential. We also check if the block is the same as the last
+ * request (which is not sequential).
+ *
+ * Note that the main prefetch queue is not really useful for this, as it
+ * stores TIDs while we care about block numbers. Consider a sorted table,
+ * with a perfectly sequential pattern when accessed through an index. Each
+ * heap page may have dozens of TIDs, but we need to check block numbers.
+ * We could keep enough TIDs to cover enough blocks, but then we also need
+ * to walk those when checking the pattern (in hot path).
+ *
+ * So instead, we maintain a small separate queue of block numbers, and we use
+ * this instead.
+ *
+ * Returns true if the block is in a sequential pattern (and so should not be
+ * prefetched), or false (not sequential, should be prefetched).
+ *
+ * XXX The name is a bit misleading, as it also adds the block number to the
+ * block queue and checks if the block is the same as the last one (which
+ * does not require a sequential pattern).
+ */
+static bool
+index_prefetch_is_sequential(IndexPrefetch prefetch, BlockNumber block)
+{
+	int			idx;
+
+	/*
+	 * If the block queue is empty, just store the block and we're done (it's
+	 * neither a sequential pattern, neither recently prefetched block).
+	 */
+	if (prefetch->blockIndex == 0)
+	{
+		prefetch->blockItems[PREFETCH_BLOCK_INDEX(prefetch->blockIndex)] = block;
+		prefetch->blockIndex++;
+		return false;
+	}
+
+	/*
+	 * Check if it's the same as the immediately preceding block. We don't
+	 * want to prefetch the same block over and over (which would happen for
+	 * well correlated indexes).
+	 *
+	 * In principle we could rely on index_prefetch_add_cache doing this using
+	 * the full cache, but this check is much cheaper and we need to look at
+	 * the preceding block anyway, so we just do it.
+	 *
+	 * XXX Notice we haven't added the block to the block queue yet, and there
+	 * is a preceding block (i.e. blockIndex-1 is valid).
+	 */
+	if (prefetch->blockItems[PREFETCH_BLOCK_INDEX(prefetch->blockIndex - 1)] == block)
+		return true;
+
+	/*
+	 * Add the block number to the queue.
+	 *
+	 * We do this before checking if the pattern, because we want to know
+	 * about the block even if we end up skipping the prefetch. Otherwise we'd
+	 * not be able to detect longer sequential pattens - we'd skip one block
+	 * but then fail to skip the next couple blocks even in a perfect
+	 * sequential pattern. This ocillation might even prevent the OS
+	 * read-ahead from kicking in.
+	 */
+	prefetch->blockItems[PREFETCH_BLOCK_INDEX(prefetch->blockIndex)] = block;
+	prefetch->blockIndex++;
+
+	/*
+	 * Check if the last couple blocks are in a sequential pattern. We look
+	 * for a sequential pattern of PREFETCH_SEQ_PATTERN_BLOCKS (4 by default),
+	 * so we look for patterns of 5 pages (40kB) including the new block.
+	 *
+	 * XXX Perhaps this should be tied to effective_io_concurrency somehow?
+	 *
+	 * XXX Could it be harmful that we read the queue backwards? Maybe memory
+	 * prefetching works better for the forward direction?
+	 */
+	for (int i = 1; i < PREFETCH_SEQ_PATTERN_BLOCKS; i++)
+	{
+		/*
+		 * Are there enough requests to confirm a sequential pattern? We only
+		 * consider something to be sequential after finding a sequence of
+		 * PREFETCH_SEQ_PATTERN_BLOCKS blocks.
+		 *
+		 * FIXME Better to move this outside the loop.
+		 */
+		if (prefetch->blockIndex < i)
+			return false;
+
+		/*
+		 * Calculate index of the earlier block (we need to do -1 as we
+		 * already incremented the index when adding the new block to the
+		 * queue).
+		 */
+		idx = PREFETCH_BLOCK_INDEX(prefetch->blockIndex - i - 1);
+
+		/*
+		 * For a sequential pattern, blocks "k" step ago needs to have block
+		 * number by "k" smaller compared to the current block.
+		 */
+		if (prefetch->blockItems[idx] != (block - i))
+			return false;
+	}
+
+	return true;
+}
+
 /*
  * index_prefetch
  *		Prefetch the TID, unless it's sequential or recently prefetched.
@@ -1172,8 +1291,26 @@ index_prefetch(IndexScanDesc scan, ItemPointer tid, bool skip_all_visible)
 			return;
 	}
 
+	/*
+	 * Now also check if the blocks to prefetch are in a sequential pattern.
+	 * We do it here because we need to do this check before PrefetchBuffer
+	 * initiates the prefetch, and we it can't do this easily (as it doesn't
+	 * know in what context it's called in). So we do it here.
+	 *
+	 * We use a tiny LRU cache and see if the blocks follow a sequential
+	 * pattern - if it's the same as the previous block, or if the last
+	 * couple blocks are a continguous sequence, we don't prefetch it.
+	 */
+	if (index_prefetch_is_sequential(prefetch, block))
+	{
+		prefetch->countSkipSequential++;
+		return;
+	}
+
+	/* XXX shouldn't this be before the VM / sequenqial check? */
 	prefetch->countAll++;
 
+	/* OK, try prefetching the block. */
 	result = PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, block);
 
 	/*
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 9f33796fd29..9e2d77ef23b 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -235,6 +235,22 @@ extern HeapTuple systable_getnext_ordered(SysScanDesc sysscan,
 										  ScanDirection direction);
 extern void systable_endscan_ordered(SysScanDesc sysscan);
 
+/*
+ * Cache of recently prefetched blocks, organized as a hash table of
+ * small LRU caches. Doesn't need to be perfectly accurate, but we
+ * aim to make false positives/negatives reasonably low.
+ */
+typedef struct PrefetchCacheEntry {
+	BlockNumber		block;
+	uint64			request;
+} PrefetchCacheEntry;
+
+/*
+ * Used to detect sequential patterns (to not prefetch in this case).
+ */
+#define		PREFETCH_QUEUE_HISTORY			8
+#define		PREFETCH_SEQ_PATTERN_BLOCKS		4
+
 
 typedef struct IndexPrefetchData
 {
@@ -264,6 +280,20 @@ typedef struct IndexPrefetchData
 	uint64			queueIndex;	/* next TID to prefetch */
 	uint64			queueStart;	/* first valid TID in queue */
 	uint64			queueEnd;	/* first invalid (empty) TID in queue */
+
+	/*
+	 * A couple of last prefetched blocks, used to check for certain access
+	 * pattern and skip prefetching - e.g. for sequential access).
+	 *
+	 * XXX Separate from the main queue, because we only want to compare the
+	 * block numbers, not the whole TID. In sequential access it's likely we
+	 * read many items from each page, and we don't want to check many items
+	 * (as that is much more expensive).
+	 */
+	BlockNumber		blockItems[PREFETCH_QUEUE_HISTORY];
+	uint64			blockIndex;	/* index in the block (points to the first
+								 * empty entry)*/
+
 } IndexPrefetchData;
 
 #define PREFETCH_QUEUE_INDEX(a)	((a) % (MAX_IO_CONCURRENCY))
-- 
2.42.0

