From 5d23e2b31b60f970884681bd75e1553079d5570b Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Sat, 28 Mar 2026 10:51:44 -0400
Subject: [PATCH v4 02/14] read_stream: Prevent distance from decaying too
 quickly

Until now we reduced the look-ahead distance by 1 on every hit, and doubled it
on every miss. That is problematic because there are very common IO patterns
where this prevents us from ever reaching a sufficiently high distance (e.g.
with alternating misses and hits the distance never grows beyond 2). In many
such cases, if we had ever reached a sufficient look-ahead distance, things
would have been fine, because we grow the distance faster than we decrease it.

One might think that the most obvious answer to this problem would be to never
reduce the distance. However, that would not work well, as, particularly with
upcoming users of read streams, it is reasonably common to at first have a
lot of misses and then to transition to a fully cached workload, e.g. because
the same blocks are needed repeatedly within one stream. Doing unnecessarily
deep readahead can be costly, due to having to pin a lot more buffers, which
increases CPU overhead.

Because the cost of a synchronously handled miss can be very high (multiple
milliseconds for every IO with commonly used storage) compared to the CPU
overhead of keeping the distance too high, we want to err on the side of not
reducing the distance too early.

The insight that a decrease of the distance by 1 at every hit may be ok at
large distances, but not at low distances, shows a way out: If we only allow
decreasing the distance once there were no misses for our maximum look-ahead
distance, we will keep the distance high as long as readahead has a chance to
do IO asynchronously, but will generally let it decay when it does not.

Several folks have written variants of this patch; at least Thomas Munro,
Melanie Plageman and I have each done so.

Author:
Reviewed-by:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/backend/storage/aio/read_stream.c | 36 ++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index c9595ea10c7..31f9e35dee3 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -99,6 +99,7 @@ struct ReadStream
 	int16		forwarded_buffers;
 	int16		pinned_buffers;
 	int16		distance;
+	uint16		distance_decay_holdoff;
 	int16		initialized_buffers;
 	int16		resume_distance;
 	int			read_buffers_flags;
@@ -364,9 +365,22 @@ read_stream_start_pending_read(ReadStream *stream)
 	/* Remember whether we need to wait before returning this buffer. */
 	if (!need_wait)
 	{
-		/* Look-ahead distance decays, no I/O necessary. */
-		if (stream->distance > 1)
-			stream->distance--;
+		/*
+		 * If there currently is no IO in progress, and we have not needed to
+		 * issue IO recently, decay the look-ahead distance.  We detect if we
+		 * had to issue IO recently by having a decay holdoff that's set to
+		 * the max look-ahead distance whenever we need to do IO.  This is
+		 * important to ensure we eventually reach a high enough distance to
+		 * perform IO asynchronously when starting out with a small look-ahead
+		 * distance.
+		 */
+		if (stream->distance > 1 && stream->ios_in_progress == 0)
+		{
+			if (stream->distance_decay_holdoff == 0)
+				stream->distance--;
+			else
+				stream->distance_decay_holdoff--;
+		}
 	}
 	else
 	{
@@ -702,6 +716,7 @@ read_stream_begin_impl(int flags,
 	stream->seq_blocknum = InvalidBlockNumber;
 	stream->seq_until_processed = InvalidBlockNumber;
 	stream->temporary = SmgrIsTemp(smgr);
+	stream->distance_decay_holdoff = 0;
 
 	/*
 	 * Skip the initial ramp-up phase if the caller says we're going to be
@@ -954,6 +969,20 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 		distance = Min(distance, stream->max_pinned_buffers);
 		stream->distance = distance;
 
+		/*
+		 * As we needed IO, prevent distance from being reduced within our
+		 * maximum look-ahead window. This avoids having distance collapse too
+		 * quickly in workloads where most of the required blocks are cached,
+		 * but where the remaining IOs are a significant enough factor to cause
+		 * a substantial slowdown if executed synchronously.
+		 *
+		 * There are valid arguments for preventing decay for max_ios or for
+		 * max_pinned_buffers.  But the argument for max_pinned_buffers seems
+		 * clearer - if we can't see any misses within the maximum look-ahead
+		 * distance, we can't do any useful read-ahead.
+		 */
+		stream->distance_decay_holdoff = stream->max_pinned_buffers;
+
 		/*
 		 * If we've reached the first block of a sequential region we're
 		 * issuing advice for, cancel that until the next jump.  The kernel
@@ -1128,6 +1157,7 @@ read_stream_reset(ReadStream *stream)
 	/* Start off assuming data is cached. */
 	stream->distance = 1;
 	stream->resume_distance = stream->distance;
+	stream->distance_decay_holdoff = 0;
 }
 
 /*
-- 
2.53.0.1.gb2826b52eb

