From 5457ef3b17fd28be63c1ba31fcfc1d845a3010ca Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Wed, 19 Feb 2025 01:25:40 +1300
Subject: [PATCH v3 4/6] Look ahead more when sequential in read_stream.c.

Previously, sequential reads would cause the look-ahead distance to
fall back to io_combine_limit, on the basis that kernel read-ahead
should start helping.  It also meant that we'd have to ramp the distance
back up when a sequential region was followed by a burst of random
jumps, with little hope of avoiding a stall, which is not a good
trade-off and is incompatible with AIO plans (you have to look ahead if
you have to start real I/O).

Simplify the algorithm: now only cache hits make the look-ahead distance
drop off, and cache misses still make it grow rapidly.  Random vs
sequential heuristics are no longer taken into consideration while
making that decision.

Reviewed-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/CA%2BhUKGK_%3D4CVmMHvsHjOVrK6t4F%3DLBpFzsrr3R%2BaJYN8kcTfWg%40mail.gmail.com
---
 src/backend/storage/aio/read_stream.c | 92 ++++++++++-----------------
 1 file changed, 33 insertions(+), 59 deletions(-)

diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index a8a96baf8c1..57cde89cfdc 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -17,30 +17,12 @@
  * pending read.  When that isn't possible, the existing pending read is sent
  * to StartReadBuffers() so that a new one can begin to form.
  *
- * The algorithm for controlling the look-ahead distance tries to classify the
- * stream into three ideal behaviors:
+ * The algorithm for controlling the look-ahead distance is based on recent
+ * cache hits and misses:
  *
- * A) No I/O is necessary, because the requested blocks are fully cached
- * already.  There is no benefit to looking ahead more than one block, so
- * distance is 1.  This is the default initial assumption.
- *
- * B) I/O is necessary, but read-ahead advice is undesirable because the
- * access is sequential and we can rely on the kernel's read-ahead heuristics,
- * or impossible because direct I/O is enabled, or the system doesn't support
- * read-ahead advice.  There is no benefit in looking ahead more than
- * io_combine_limit, because in this case the only goal is larger read system
- * calls.  Looking further ahead would pin many buffers and perform
- * speculative work for no benefit.
- *
- * C) I/O is necessary, it appears to be random, and this system supports
- * read-ahead advice.  We'll look further ahead in order to reach the
- * configured level of I/O concurrency.
- *
- * The distance increases rapidly and decays slowly, so that it moves towards
- * those levels as different I/O patterns are discovered.  For example, a
- * sequential scan of fully cached data doesn't bother looking ahead, but a
- * sequential scan that hits a region of uncached blocks will start issuing
- * increasingly wide read calls until it plateaus at io_combine_limit.
+ * When no I/O is necessary, there is no point in looking ahead more than one
+ * block.  This is the default initial assumption.  Otherwise rapidly increase
+ * the distance to try to benefit from I/O combining and I/O concurrency.
  *
  * The main data structure is a circular queue of buffers of size
  * max_pinned_buffers plus some extra space for technical reasons, ready to be
@@ -336,7 +318,7 @@ read_stream_start_pending_read(ReadStream *stream)
 	/* Remember whether we need to wait before returning this buffer. */
 	if (!need_wait)
 	{
-		/* Look-ahead distance decays, no I/O necessary (behavior A). */
+		/* Look-ahead distance decays, no I/O necessary. */
 		if (stream->distance > 1)
 			stream->distance--;
 	}
@@ -517,6 +499,15 @@ read_stream_begin_impl(int flags,
 	else
 		max_ios = get_tablespace_io_concurrency(tablespace_id);
 
+	/*
+	 * XXX Since we don't have asynchronous I/O yet, if direct I/O is enabled
+	 * then just behave as though I/O concurrency is set to 0.  Otherwise we
+	 * would look ahead pinning many buffers for no benefit, for lack of
+	 * advice and AIO.
+	 */
+	if (io_direct_flags & IO_DIRECT_DATA)
+		max_ios = 0;
+
 	/* Cap to INT16_MAX to avoid overflowing below */
 	max_ios = Min(max_ios, PG_INT16_MAX);
 
@@ -637,7 +628,7 @@ read_stream_begin_impl(int flags,
 	/*
 	 * Skip the initial ramp-up phase if the caller says we're going to be
 	 * reading the whole relation.  This way we start out assuming we'll be
-	 * doing full io_combine_limit sized reads (behavior B).
+	 * doing full io_combine_limit sized reads.
 	 */
 	if (flags & READ_STREAM_FULL)
 		stream->distance = Min(max_pinned_buffers, stream->io_combine_limit);
@@ -728,10 +719,10 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 #ifndef READ_STREAM_DISABLE_FAST_PATH
 
 	/*
-	 * A fast path for all-cached scans (behavior A).  This is the same as the
-	 * usual algorithm, but it is specialized for no I/O and no per-buffer
-	 * data, so we can skip the queue management code, stay in the same buffer
-	 * slot and use singular StartReadBuffer().
+	 * A fast path for all-cached scans.  This is the same as the usual
+	 * algorithm, but it is specialized for no I/O and no per-buffer data, so
+	 * we can skip the queue management code, stay in the same buffer slot and
+	 * use singular StartReadBuffer().
 	 */
 	if (likely(stream->fast_path))
 	{
@@ -851,37 +842,20 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 		if (++stream->oldest_io_index == stream->max_ios)
 			stream->oldest_io_index = 0;
 
-		if (stream->ios[io_index].op.flags & READ_BUFFERS_ISSUE_ADVICE)
-		{
-			/* Distance ramps up fast (behavior C). */
-			distance = stream->distance * 2;
-			distance = Min(distance, stream->max_pinned_buffers);
-			stream->distance = distance;
+		/* Look-ahead distance ramps up quickly after we do I/O. */
+		distance = stream->distance * 2;
+		distance = Min(distance, stream->max_pinned_buffers);
+		stream->distance = distance;
 
-			/*
-			 * If we've caught up with the first advice issued for the current
-			 * sequential region, cancel further advice until the next random
-			 * jump.  The kernel should be able to see the pattern now that
-			 * we're actually making sequential preadv() calls.
-			 */
-			if (stream->ios[io_index].op.blocknum == stream->seq_until_processed)
-				stream->seq_until_processed = InvalidBlockNumber;
-		}
-		else
-		{
-			/* No advice; move towards io_combine_limit (behavior B). */
-			if (stream->distance > stream->io_combine_limit)
-			{
-				stream->distance--;
-			}
-			else
-			{
-				distance = stream->distance * 2;
-				distance = Min(distance, stream->io_combine_limit);
-				distance = Min(distance, stream->max_pinned_buffers);
-				stream->distance = distance;
-			}
-		}
+		/*
+		 * If we've caught up with the first advice issued for the current
+		 * sequential region, cancel further advice until the next random
+		 * jump.  The kernel should be able to see the pattern now that we're
+		 * actually making sequential preadv() calls.
+		 */
+		if (stream->advice_enabled &&
+			stream->ios[io_index].op.blocknum == stream->seq_until_processed)
+			stream->seq_until_processed = InvalidBlockNumber;
 	}
 
 #ifdef CLOBBER_FREED_MEMORY
-- 
2.39.5

