From ab494c6e5ccf93563dcf7059f7eec7d4252294f6 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Sat, 15 Jun 2024 14:37:26 +1200
Subject: [PATCH 1/2] Fix overloaded remit of read_stream_reset().

Some extensions need to examine page contents to find more block numbers
to stream.  That currently means that they have to signal end-of-stream
when data runs out, but later "reset" the stream to resume with new
information.

As an unfortunate by-product of that API economy, the look-ahead
distance would start out at one again, and then need to ramp back up to
find a useful concurrency level.  This created a significant loss of
performance for self-referential block streams with a high degree of
fan-out, as discovered by the pgvector project.

Split out a separate function read_stream_resume() that is explicitly
intended for that usage pattern.  Since distance == 0 is the internal
representation of end-of-stream, all it has to do is restore the
distance recorded at the time end-of-stream was signaled to resume
looking ahead with the same I/O concurrency level after a page jump.

We don't have any self-referential streaming in PostgreSQL itself yet,
but we decided to call this a bug all the same and back-patch a fix.
The intention was to support that exact usage pattern, the API just
missed the mark by trying to handle too many use cases.  Such extensions
should be able to benefit from streaming, and the behavior change is
isolated to code that calls the new variant, so a separation of duties
seems warranted and safe to back-patch.

The problem was reported as extremely poor look-ahead performance in v18
using AIO for prefetching, but it also affects the fadvise-based
prefetching in v17, where read_stream.c arrived.

Reviewed-by:
Tested-by:
Backpatch-through: 17
Discussion: https://postgr.es/m/CA%2BhUKGL-3mBtkA9RTbLFHuSS5cviuv0ko7nBhCg9KM7Q-GSEkw%40mail.gmail.com
---
 src/backend/storage/aio/read_stream.c | 15 +++++++++++++++
 src/include/storage/read_stream.h     |  1 +
 2 files changed, 16 insertions(+)

diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index 031fde9f4cb..beae8ef325a 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -100,6 +100,7 @@ struct ReadStream
 	int16		pinned_buffers;
 	int16		distance;
 	int16		initialized_buffers;
+	int16		resume_distance;
 	int			read_buffers_flags;
 	bool		sync_mode;		/* using io_method=sync */
 	bool		batch_mode;		/* READ_STREAM_USE_BATCHING */
@@ -464,6 +465,7 @@ read_stream_look_ahead(ReadStream *stream)
 		if (blocknum == InvalidBlockNumber)
 		{
 			/* End of stream. */
+			stream->resume_distance = stream->distance;
 			stream->distance = 0;
 			break;
 		}
@@ -711,6 +713,7 @@ read_stream_begin_impl(int flags,
 		stream->distance = Min(max_pinned_buffers, stream->io_combine_limit);
 	else
 		stream->distance = 1;
+	stream->resume_distance = stream->distance;
 
 	/*
 	 * Since we always access the same relation, we can initialize parts of
@@ -862,6 +865,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 		else
 		{
 			/* No more blocks, end of stream. */
+			stream->resume_distance = stream->distance;
 			stream->distance = 0;
 			stream->oldest_buffer_index = stream->next_buffer_index;
 			stream->pinned_buffers = 0;
@@ -1034,6 +1038,17 @@ read_stream_next_block(ReadStream *stream, BufferAccessStrategy *strategy)
 	return read_stream_get_block(stream, NULL);
 }
 
+/*
+ * Resume looking ahead after the block number callback reported end-of-stream.
+ * This is useful for streams of self-referential blocks, after a buffer needed
+ * to be consumed and examined to find more block numbers.
+ */
+void
+read_stream_resume(ReadStream *stream)
+{
+	stream->distance = stream->resume_distance;
+}
+
 /*
  * Reset a read stream by releasing any queued up buffers, allowing the stream
  * to be used again for different blocks.  This can be used to clear an
diff --git a/src/include/storage/read_stream.h b/src/include/storage/read_stream.h
index 9b0d65161d0..e29ac50fc9e 100644
--- a/src/include/storage/read_stream.h
+++ b/src/include/storage/read_stream.h
@@ -99,6 +99,7 @@ extern ReadStream *read_stream_begin_smgr_relation(int flags,
 												   ReadStreamBlockNumberCB callback,
 												   void *callback_private_data,
 												   size_t per_buffer_data_size);
+extern void read_stream_resume(ReadStream *stream);
 extern void read_stream_reset(ReadStream *stream);
 extern void read_stream_end(ReadStream *stream);
 
-- 
2.51.1

