From f4d0c7c81c1b960e71d7f76f1262279b329eac07 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 18 Feb 2025 15:59:13 +1300
Subject: [PATCH v2 3/4] Improve read stream advice for larger random reads.

read_stream.c tries not to issue advice when it thinks the kernel's
readahead should be active, ie when using buffered I/O and reading
sequential blocks.  It previously gave up a little too easily: it
should issue advice until it has started running sequential pread()
calls, not just when it's planning to.  The simpler strategy worked for
random regions of size <= io_combine_limit and large sequential regions,
but not so well when reading random regions of size
> io_combine limit.  For a 256kB chunk of data far away
from recent access, it would issue advice for the first half (assuming
io_combine_limit=128kB) and then suffer an I/O stall for the second
half.

Discovered by Tomas Vondra's regression testing of many data clustering
patterns using Melanie Plageman's streaming Bitmap Heap Scan patch, with
analysis of the I/O stall-producing pattern from Andres Freund.

Discussion: https://postgr.es/m/CA%2BhUKGK_%3D4CVmMHvsHjOVrK6t4F%3DLBpFzsrr3R%2BaJYN8kcTfWg%40mail.gmail.com
Discussion: https://postgr.es/m/CA%2BhUKGJ3HSWciQCz8ekP1Zn7N213RfA4nbuotQawfpq23%2Bw-5Q%40mail.gmail.com
---
 src/backend/storage/aio/read_stream.c | 44 +++++++++++++++++++++------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index 6c2b4ec011d..a028217a08e 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -132,6 +132,7 @@ struct ReadStream
 
 	/* Next expected block, for detecting sequential access. */
 	BlockNumber seq_blocknum;
+	BlockNumber seq_start;
 
 	/* The read operation we are currently preparing. */
 	BlockNumber pending_read_blocknum;
@@ -260,16 +261,30 @@ read_stream_start_pending_read(ReadStream *stream, bool suppress_advice)
 	else
 		Assert(stream->next_buffer_index == stream->oldest_buffer_index);
 
-	/*
-	 * If advice hasn't been suppressed, this system supports it, and this
-	 * isn't a strictly sequential pattern, then we'll issue advice.
-	 */
-	if (!suppress_advice &&
-		stream->advice_enabled &&
-		stream->pending_read_blocknum != stream->seq_blocknum)
+	/* Do we need to issue read-ahead advice? */
+	flags = 0;
+	if (stream->advice_enabled)
+	{
 		flags = READ_BUFFERS_ISSUE_ADVICE;
-	else
-		flags = 0;
+
+		if (stream->pending_read_blocknum == stream->seq_blocknum)
+		{
+			/*
+			 * Suppress advice if our WaitReadBuffers() calls have caught up
+			 * with the first advice we issued for this sequential run.
+			 */
+			if (stream->seq_start == InvalidBlockNumber)
+				suppress_advice = true;
+		}
+		else
+		{
+			/* Random jump, so start a new sequential run. */
+			stream->seq_start = stream->pending_read_blocknum;
+		}
+
+		if (suppress_advice)
+			flags = 0;
+	}
 
 	/* Compute the remaining portion of the per-backend buffer limit. */
 	if (stream->temporary)
@@ -601,6 +616,8 @@ read_stream_begin_impl(int flags,
 	stream->callback_private_data = callback_private_data;
 	stream->buffered_blocknum = InvalidBlockNumber;
 	stream->temporary = SmgrIsTemp(smgr);
+	stream->seq_blocknum = InvalidBlockNumber;
+	stream->seq_start = InvalidBlockNumber;
 
 	/*
 	 * Skip the initial ramp-up phase if the caller says we're going to be
@@ -825,6 +842,15 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 			distance = stream->distance * 2;
 			distance = Min(distance, stream->max_pinned_buffers);
 			stream->distance = distance;
+
+			/*
+			 * If we've caught up with the first advice issued for the current
+			 * sequential run, cancel further advice until the next random
+			 * jump.  The kernel should be able to see the pattern now that
+			 * we're issuing sequential preadv() calls.
+			 */
+			if (stream->ios[io_index].op.blocknum == stream->seq_start)
+				stream->seq_start = InvalidBlockNumber;
 		}
 		else
 		{
-- 
2.39.5

