From 3dcfa20437c7febf65ef7ad7dbc1a3c4f0bf6980 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 18 Feb 2025 15:59:13 +1300
Subject: [PATCH v2.8 11/38] Improve read_stream.c advice for big random
 chunks.

read_stream.c tries not to issue readahead advice when it thinks the
kernel's own readahead should be active, ie when using buffered I/O and
reading sequential blocks.  It previously gave up too easily, and issued
advice only for the first read of size up to io_combine_limit in a
larger sequential range of blocks after a random jump.  The following
read could suffer an avoidable I/O stall.

Fix, by issuing advice until the corresponding pread() calls catch up
with the start of the region we're currently issuing advice for, if
ever.  That's when the kernel actually sees the sequential pattern and
has any chance of helping.  That won't happen until the sequential
region fills the whole look-ahead window, so advice is now much more
aggressive, while still disabled for purely sequential streams.

While refactoring the advice logic, also get rid of the suppress_advice
argument that was passed around between functions, since
read_stream_start_pending_read() can make that determination itself, and
it's better to keep all that logic in one place.

Revealed by Tomas Vondra's disk access pattern tests with Melanie
Plageman's Bitmap Heap Scan patch.

Reviewed-by: Andres Freund <andres@anarazel.de> (earlier version)
Reported-by: Melanie Plageman <melanieplageman@gmail.com>
Reported-by: Tomas Vondra <tomas@vondra.me>
Reported-by: Andres Freund <andres@anarazel.de>
Tested-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGK_%3D4CVmMHvsHjOVrK6t4F%3DLBpFzsrr3R%2BaJYN8kcTfWg%40mail.gmail.com
Discussion: https://postgr.es/m/CA%2BhUKGJ3HSWciQCz8ekP1Zn7N213RfA4nbuotQawfpq23%2Bw-5Q%40mail.gmail.com
---
 src/backend/storage/aio/read_stream.c | 71 +++++++++++++++++++--------
 1 file changed, 50 insertions(+), 21 deletions(-)

diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index 175f8410baf..f991373359a 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -133,6 +133,7 @@ struct ReadStream
 
 	/* Next expected block, for detecting sequential access. */
 	BlockNumber seq_blocknum;
+	BlockNumber seq_until_processed;
 
 	/* The read operation we are currently preparing. */
 	BlockNumber pending_read_blocknum;
@@ -238,11 +239,11 @@ read_stream_unget_block(ReadStream *stream, BlockNumber blocknum)
  * distance to a level that prevents look-ahead until buffers are released.
  */
 static bool
-read_stream_start_pending_read(ReadStream *stream, bool suppress_advice)
+read_stream_start_pending_read(ReadStream *stream)
 {
 	bool		need_wait;
 	int			nblocks;
-	int			flags;
+	int			flags = 0;
 	int16		io_index;
 	int16		overflow;
 	int16		buffer_index;
@@ -262,16 +263,36 @@ read_stream_start_pending_read(ReadStream *stream, bool suppress_advice)
 	else
 		Assert(stream->next_buffer_index == stream->oldest_buffer_index);
 
-	/*
-	 * If advice hasn't been suppressed, this system supports it, and this
-	 * isn't a strictly sequential pattern, then we'll issue advice.
-	 */
-	if (!suppress_advice &&
-		stream->advice_enabled &&
-		stream->pending_read_blocknum != stream->seq_blocknum)
-		flags = READ_BUFFERS_ISSUE_ADVICE;
-	else
-		flags = 0;
+	/* Do we need to issue read-ahead advice? */
+	if (stream->advice_enabled)
+	{
+		bool		ahead;
+
+		/*
+		 * We only issue advice if we're actually far enough ahead that we
+		 * won't immediately have to call WaitReadBuffers().
+		 */
+		ahead = stream->pinned_buffers > 0 ||
+			stream->pending_read_nblocks < stream->distance;
+
+		if (stream->pending_read_blocknum == stream->seq_blocknum)
+		{
+			/*
+			 * Sequential: issue advice only until the WaitReadBuffers() calls
+			 * catch up with the first advice issued for this sequential
+			 * region, so the kernel can see sequential access.
+			 */
+			if (stream->seq_until_processed != InvalidBlockNumber && ahead)
+				flags = READ_BUFFERS_ISSUE_ADVICE;
+		}
+		else
+		{
+			/* Random jump: start tracking new region. */
+			stream->seq_until_processed = stream->pending_read_blocknum;
+			if (ahead)
+				flags = READ_BUFFERS_ISSUE_ADVICE;
+		}
+	}
 
 	/* How many more buffers is this backend allowed? */
 	if (stream->temporary)
@@ -360,7 +381,7 @@ read_stream_start_pending_read(ReadStream *stream, bool suppress_advice)
 }
 
 static void
-read_stream_look_ahead(ReadStream *stream, bool suppress_advice)
+read_stream_look_ahead(ReadStream *stream)
 {
 	while (stream->ios_in_progress < stream->max_ios &&
 		   stream->pinned_buffers + stream->pending_read_nblocks < stream->distance)
@@ -371,8 +392,7 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice)
 
 		if (stream->pending_read_nblocks == stream->io_combine_limit)
 		{
-			read_stream_start_pending_read(stream, suppress_advice);
-			suppress_advice = false;
+			read_stream_start_pending_read(stream);
 			continue;
 		}
 
@@ -405,15 +425,13 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice)
 		/* We have to start the pending read before we can build another. */
 		while (stream->pending_read_nblocks > 0)
 		{
-			if (!read_stream_start_pending_read(stream, suppress_advice) ||
+			if (!read_stream_start_pending_read(stream) ||
 				stream->ios_in_progress == stream->max_ios)
 			{
 				/* We've hit the buffer or I/O limit.  Rewind and stop here. */
 				read_stream_unget_block(stream, blocknum);
 				return;
 			}
-
-			suppress_advice = false;
 		}
 
 		/* This is the start of a new pending read. */
@@ -437,7 +455,7 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice)
 		  stream->pinned_buffers == 0) ||
 		 stream->distance == 0) &&
 		stream->ios_in_progress < stream->max_ios)
-		read_stream_start_pending_read(stream, suppress_advice);
+		read_stream_start_pending_read(stream);
 
 	/*
 	 * There should always be something pinned when we leave this function,
@@ -613,6 +631,8 @@ read_stream_begin_impl(int flags,
 	stream->callback = callback;
 	stream->callback_private_data = callback_private_data;
 	stream->buffered_blocknum = InvalidBlockNumber;
+	stream->seq_blocknum = InvalidBlockNumber;
+	stream->seq_until_processed = InvalidBlockNumber;
 	stream->temporary = SmgrIsTemp(smgr);
 
 	/*
@@ -793,7 +813,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 		 * space for more, but if we're just starting up we'll need to crank
 		 * the handle to get started.
 		 */
-		read_stream_look_ahead(stream, true);
+		read_stream_look_ahead(stream);
 
 		/* End of stream reached? */
 		if (stream->pinned_buffers == 0)
@@ -838,6 +858,15 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 			distance = stream->distance * 2;
 			distance = Min(distance, stream->max_pinned_buffers);
 			stream->distance = distance;
+
+			/*
+			 * If we've caught up with the first advice issued for the current
+			 * sequential region, cancel further advice until the next random
+			 * jump.  The kernel should be able to see the pattern now that
+			 * we're actually making sequential preadv() calls.
+			 */
+			if (stream->ios[io_index].op.blocknum == stream->seq_until_processed)
+				stream->seq_until_processed = InvalidBlockNumber;
 		}
 		else
 		{
@@ -899,7 +928,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 		stream->oldest_buffer_index = 0;
 
 	/* Prepare for the next call. */
-	read_stream_look_ahead(stream, false);
+	read_stream_look_ahead(stream);
 
 #ifndef READ_STREAM_DISABLE_FAST_PATH
 	/* See if we can take the fast path for all-cached scans next time. */
-- 
2.48.1.76.g4e746b1a31.dirty

