From c0e57e66e59f7b7e98414c280d9e9d1c60c1f768 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Wed, 19 Feb 2025 01:25:40 +1300
Subject: [PATCH v2.8 12/38] Simplify distance heuristics in read_stream.c.

Previously, sequential reads would cause the look-ahead distance to
move towards io_combine_limit, on the basis that we wouldn't be issuing
advice and so wouldn't benefit from looking ahead further.  That's not
really true: we could still suffer avoidable stalls when a random jump
follows a sequential region, for example in Bitmap Heap Scans (with a
proposed patch), and it is also incompatible with planned AIO support,
where you always have to look ahead in order to start I/O.

Simplify the algorithm: now cache hits alone make the look-ahead
distance drop off, and cache misses make it grow rapidly as before.
Random vs sequential heuristics are no longer taken into consideration.
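
In rough pseudocode, the per-block adjustment now amounts to the
following (an illustrative sketch only; the standalone variables stand
in for the corresponding ReadStream fields in read_stream.c):

    if (needed_io)
    {
        /* Cache miss: distance ramps up quickly, capped by pin limit. */
        distance = Min(distance * 2, max_pinned_buffers);
    }
    else if (distance > 1)
    {
        /* Cache hit: distance decays gradually back towards 1. */
        distance--;
    }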

Reviewed-by: Andres Freund <andres@anarazel.de> (earlier version)
Tested-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGK_%3D4CVmMHvsHjOVrK6t4F%3DLBpFzsrr3R%2BaJYN8kcTfWg%40mail.gmail.com
---
 src/backend/storage/aio/read_stream.c | 94 ++++++++++-----------------
 1 file changed, 34 insertions(+), 60 deletions(-)

diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index f991373359a..ef30930b9b2 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -17,30 +17,12 @@
  * pending read.  When that isn't possible, the existing pending read is sent
  * to StartReadBuffers() so that a new one can begin to form.
  *
- * The algorithm for controlling the look-ahead distance tries to classify the
- * stream into three ideal behaviors:
- *
- * A) No I/O is necessary, because the requested blocks are fully cached
- * already.  There is no benefit to looking ahead more than one block, so
- * distance is 1.  This is the default initial assumption.
- *
- * B) I/O is necessary, but read-ahead advice is undesirable because the
- * access is sequential and we can rely on the kernel's read-ahead heuristics,
- * or impossible because direct I/O is enabled, or the system doesn't support
- * read-ahead advice.  There is no benefit in looking ahead more than
- * io_combine_limit, because in this case the only goal is larger read system
- * calls.  Looking further ahead would pin many buffers and perform
- * speculative work for no benefit.
- *
- * C) I/O is necessary, it appears to be random, and this system supports
- * read-ahead advice.  We'll look further ahead in order to reach the
- * configured level of I/O concurrency.
- *
- * The distance increases rapidly and decays slowly, so that it moves towards
- * those levels as different I/O patterns are discovered.  For example, a
- * sequential scan of fully cached data doesn't bother looking ahead, but a
- * sequential scan that hits a region of uncached blocks will start issuing
- * increasingly wide read calls until it plateaus at io_combine_limit.
+ * The algorithm for controlling the look-ahead distance is based on recent
+ * cache hit and miss history.  When no I/O is necessary, there is no benefit
+ * in looking ahead more than one block.  This is the default initial
+ * assumption, but when blocks needing I/O are streamed, the distance is
+ * increased rapidly to try to benefit from I/O combining and concurrency.  It
+ * is reduced gradually when cached blocks are streamed.
  *
  * The main data structure is a circular queue of buffers of size
  * max_pinned_buffers plus some extra space for technical reasons, ready to be
@@ -337,7 +319,7 @@ read_stream_start_pending_read(ReadStream *stream)
 	/* Remember whether we need to wait before returning this buffer. */
 	if (!need_wait)
 	{
-		/* Look-ahead distance decays, no I/O necessary (behavior A). */
+		/* Look-ahead distance decays, no I/O necessary. */
 		if (stream->distance > 1)
 			stream->distance--;
 	}
@@ -518,6 +500,15 @@ read_stream_begin_impl(int flags,
 	else
 		max_ios = get_tablespace_io_concurrency(tablespace_id);
 
+	/*
+	 * XXX Since we don't have asynchronous I/O yet, if direct I/O is enabled
+	 * then just behave as though I/O concurrency is set to 0.  Otherwise we
+	 * would look ahead pinning many buffers for no benefit, for lack of
+	 * advice and AIO.
+	 */
+	if (io_direct_flags & IO_DIRECT_DATA)
+		max_ios = 0;
+
 	/* Cap to INT16_MAX to avoid overflowing below */
 	max_ios = Min(max_ios, PG_INT16_MAX);
 
@@ -638,7 +629,7 @@ read_stream_begin_impl(int flags,
 	/*
 	 * Skip the initial ramp-up phase if the caller says we're going to be
 	 * reading the whole relation.  This way we start out assuming we'll be
-	 * doing full io_combine_limit sized reads (behavior B).
+	 * doing full io_combine_limit sized reads.
 	 */
 	if (flags & READ_STREAM_FULL)
 		stream->distance = Min(max_pinned_buffers, stream->io_combine_limit);
@@ -729,10 +720,10 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 #ifndef READ_STREAM_DISABLE_FAST_PATH
 
 	/*
-	 * A fast path for all-cached scans (behavior A).  This is the same as the
-	 * usual algorithm, but it is specialized for no I/O and no per-buffer
-	 * data, so we can skip the queue management code, stay in the same buffer
-	 * slot and use singular StartReadBuffer().
+	 * A fast path for all-cached scans.  This is the same as the usual
+	 * algorithm, but it is specialized for no I/O and no per-buffer data, so
+	 * we can skip the queue management code, stay in the same buffer slot and
+	 * use singular StartReadBuffer().
 	 */
 	if (likely(stream->fast_path))
 	{
@@ -852,37 +843,20 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 		if (++stream->oldest_io_index == stream->max_ios)
 			stream->oldest_io_index = 0;
 
-		if (stream->ios[io_index].op.flags & READ_BUFFERS_ISSUE_ADVICE)
-		{
-			/* Distance ramps up fast (behavior C). */
-			distance = stream->distance * 2;
-			distance = Min(distance, stream->max_pinned_buffers);
-			stream->distance = distance;
+		/* Look-ahead distance ramps up quickly after we do I/O. */
+		distance = stream->distance * 2;
+		distance = Min(distance, stream->max_pinned_buffers);
+		stream->distance = distance;
 
-			/*
-			 * If we've caught up with the first advice issued for the current
-			 * sequential region, cancel further advice until the next random
-			 * jump.  The kernel should be able to see the pattern now that
-			 * we're actually making sequential preadv() calls.
-			 */
-			if (stream->ios[io_index].op.blocknum == stream->seq_until_processed)
-				stream->seq_until_processed = InvalidBlockNumber;
-		}
-		else
-		{
-			/* No advice; move towards io_combine_limit (behavior B). */
-			if (stream->distance > stream->io_combine_limit)
-			{
-				stream->distance--;
-			}
-			else
-			{
-				distance = stream->distance * 2;
-				distance = Min(distance, stream->io_combine_limit);
-				distance = Min(distance, stream->max_pinned_buffers);
-				stream->distance = distance;
-			}
-		}
+		/*
+		 * If we've caught up with the first advice issued for the current
+		 * sequential region, cancel further advice until the next random
+		 * jump.  The kernel should be able to see the pattern now that we're
+		 * actually making sequential preadv() calls.
+		 */
+		if (stream->advice_enabled &&
+			stream->ios[io_index].op.blocknum == stream->seq_until_processed)
+			stream->seq_until_processed = InvalidBlockNumber;
 	}
 
 #ifdef CLOBBER_FREED_MEMORY
-- 
2.48.1.76.g4e746b1a31.dirty

