From 63cb731176a62320d296f968b12a5d4d36e703d0 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Wed, 18 Mar 2026 11:17:57 -0400
Subject: [PATCH v6 8/8] AIO: Don't wait for already in-progress IO

When a backend attempts to start a read IO and finds the first buffer
already has I/O in progress, previously it waited for that I/O to
complete before initiating reads for any of the subsequent buffers.

Although the backend must wait for the I/O to finish when acquiring the
buffer, there's no reason for it to wait when setting up the read
operation. Waiting at this point prevents the backend from starting I/O
on subsequent buffers and can significantly reduce concurrency.

This matters in two workloads: when multiple backends scan the same
relation concurrently, and when a single backend requests the same block
multiple times within the readahead distance.

If backends wait each time they encounter an in-progress read,
the access pattern effectively degenerates into synchronous I/O.

To fix this, when encountering an already in-progress IO for the head
buffer, a backend now records the buffer's wait reference and defers
waiting until WaitReadBuffers(), when it actually needs to acquire the
buffer.

In rare cases, a backend may still need to wait synchronously at IO
start time: if another backend has set BM_IO_IN_PROGRESS on the buffer
but has not yet set the wait reference. Such windows should be brief and
uncommon.

Author: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Discussion: https://postgr.es/m/flat/zljergweqti7x67lg5ije2rzjusie37nslsnkjkkby4laqqbfw%403p3zu522yykv
---
 src/backend/storage/buffer/bufmgr.c | 201 +++++++++++++++++++---------
 src/include/storage/bufmgr.h        |   4 +-
 src/tools/pgindent/typedefs.list    |   1 +
 3 files changed, 145 insertions(+), 61 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 2179ade07cc..31d1563a69f 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -185,6 +185,20 @@ typedef struct SMgrSortArray
 	SMgrRelation srel;
 } SMgrSortArray;
 
+/*
+ * In AsyncReadBuffers(), when preparing a buffer for reading and setting
+ * BM_IO_IN_PROGRESS, the buffer may already have I/O in progress or may
+ * already contain the desired block. AsyncReadBuffers() must distinguish
+ * between these cases (and the case where it should initiate I/O) so it can
+ * mark an in-progress buffer as foreign I/O rather than waiting on it.
+ */
+typedef enum PrepareReadBufferStatus
+{
+	READ_BUFFER_ALREADY_DONE,
+	READ_BUFFER_IN_PROGRESS,
+	READ_BUFFER_READY_FOR_IO,
+} PrepareReadBufferStatus;
+
 /* GUC variables */
 bool		zero_damaged_pages = false;
 int			bgwriter_lru_maxpages = 100;
@@ -1663,8 +1677,9 @@ CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
  * Local version of PrepareHeadBufferReadIO(). Here instead of localbuf.c to
  * avoid an external function call.
  */
-static bool
-PrepareHeadLocalBufferReadIO(Buffer buffer)
+static PrepareReadBufferStatus
+PrepareHeadLocalBufferReadIO(ReadBuffersOperation *operation,
+							 Buffer buffer)
 {
 	BufferDesc *desc = GetLocalBufferDescriptor(-buffer - 1);
 	uint64		buf_state = pg_atomic_read_u64(&desc->state);
@@ -1675,49 +1690,60 @@ PrepareHeadLocalBufferReadIO(Buffer buffer)
 	 * handle). Only the owning backend can set BM_VALID on a local buffer.
 	 */
 	if (buf_state & BM_VALID)
-		return false;
+		return READ_BUFFER_ALREADY_DONE;
 
 	/*
 	 * Submit any staged IO before checking for in-progress IO. Without this,
 	 * the wref check below could find IO that this backend staged but hasn't
-	 * submitted yet. Waiting on that would PANIC because the owner can't wait
-	 * on its own staged IO.
+	 * submitted yet. If we returned READ_BUFFER_IN_PROGRESS and
+	 * WaitReadBuffers() then tried to wait on it, we'd PANIC because the
+	 * owner can't wait on its own staged IO.
 	 */
 	pgaio_submit_staged();
 
-	/* Wait for in-progress IO */
+	/* We've already asynchronously started this IO, so join it */
 	if (pgaio_wref_valid(&desc->io_wref))
 	{
-		PgAioWaitRef iow = desc->io_wref;
-
-		pgaio_wref_wait(&iow);
-
-		buf_state = pg_atomic_read_u64(&desc->state);
+		operation->io_wref = desc->io_wref;
+		operation->foreign_io = true;
+		return READ_BUFFER_IN_PROGRESS;
 	}
 
-	/*
-	 * If BM_VALID is set, we waited on IO and it completed successfully.
-	 * Otherwise, we'll initiate IO on the buffer.
-	 */
-	return !(buf_state & BM_VALID);
+	/* Prepare to start IO on this buffer */
+	return READ_BUFFER_READY_FOR_IO;
 }
 
 /*
  * Try to start IO on the first buffer in a new run of blocks. If AIO is in
- * progress, be it in this backend or another backend, we wait for it to
- * finish and then check the result.
+ * progress, be it in this backend or another backend, we just associate the
+ * wait reference with the operation and wait in WaitReadBuffers(). This turns
+ * out to be important for performance in two workloads:
+ *
+ * 1) A read stream that has to read the same block multiple times within the
+ *    readahead distance. This can happen e.g. for the table accesses of an
+ *    index scan.
+ *
+ * 2) Concurrent scans by multiple backends on the same relation.
+ *
+ * If we were to synchronously wait for the in-progress IO, we'd not be able
+ * to keep enough I/O in flight.
+ *
+ * If we do find there is ongoing I/O for the buffer, we set up a 1-block
+ * ReadBuffersOperation that WaitReadBuffers then can wait on.
  *
- * Returns true if the buffer is ready for IO, false if the buffer is already
- * valid.
+ * It's possible that another backend has started IO on the buffer but not yet
+ * set its wait reference. In this case, we have no choice but to wait for
+ * either the wait reference to be valid or the IO to be done.
  */
-static bool
-PrepareHeadBufferReadIO(Buffer buffer)
+static PrepareReadBufferStatus
+PrepareHeadBufferReadIO(ReadBuffersOperation *operation,
+						Buffer buffer)
 {
 	uint64		buf_state;
 	BufferDesc *desc;
 
 	if (BufferIsLocal(buffer))
-		return PrepareHeadLocalBufferReadIO(buffer);
+		return PrepareHeadLocalBufferReadIO(operation, buffer);
 
 	ResourceOwnerEnlarge(CurrentResourceOwner);
 	desc = GetBufferDescriptor(buffer - 1);
@@ -1732,11 +1758,25 @@ PrepareHeadBufferReadIO(Buffer buffer)
 		if (buf_state & BM_VALID)
 		{
 			UnlockBufHdr(desc);
-			return false;
+			return READ_BUFFER_ALREADY_DONE;
 		}
 
 		if (buf_state & BM_IO_IN_PROGRESS)
 		{
+			/* Join existing read */
+			if (pgaio_wref_valid(&desc->io_wref))
+			{
+				operation->io_wref = desc->io_wref;
+				operation->foreign_io = true;
+				UnlockBufHdr(desc);
+				return READ_BUFFER_IN_PROGRESS;
+			}
+
+			/*
+			 * If the wait ref is not valid but the IO is in progress, someone
+			 * else started IO but hasn't set the wait ref yet. We have no
+			 * choice but to wait until the IO completes.
+			 */
 			UnlockBufHdr(desc);
 
 			/*
@@ -1758,7 +1798,7 @@ PrepareHeadBufferReadIO(Buffer buffer)
 		ResourceOwnerRememberBufferIO(CurrentResourceOwner,
 									  BufferDescriptorGetBuffer(desc));
 
-		return true;
+		return READ_BUFFER_READY_FOR_IO;
 	}
 }
 
@@ -1939,8 +1979,11 @@ WaitReadBuffers(ReadBuffersOperation *operation)
 			 * b) reports some time as waiting, even if we never waited
 			 *
 			 * we first check if we already know the IO is complete.
+			 *
+			 * Note that operation->io_return is uninitialized for foreign IO,
+			 * so we cannot use the cheaper PGAIO_RS_UNKNOWN pre-check.
 			 */
-			if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
+			if ((operation->foreign_io || aio_ret->result.status == PGAIO_RS_UNKNOWN) &&
 				!pgaio_wref_check_done(&operation->io_wref))
 			{
 				instr_time	io_start = pgstat_prepare_io_time(track_io_timing);
@@ -1959,11 +2002,45 @@ WaitReadBuffers(ReadBuffersOperation *operation)
 				Assert(pgaio_wref_check_done(&operation->io_wref));
 			}
 
-			/*
-			 * We now are sure the IO completed. Check the results. This
-			 * includes reporting on errors if there were any.
-			 */
-			ProcessReadBuffersResult(operation);
+			if (unlikely(operation->foreign_io))
+			{
+				Buffer		buffer = operation->buffers[operation->nblocks_done];
+				BufferDesc *desc = BufferIsLocal(buffer) ?
+					GetLocalBufferDescriptor(-buffer - 1) :
+					GetBufferDescriptor(buffer - 1);
+				uint64		buf_state = pg_atomic_read_u64(&desc->state);
+
+				if (buf_state & BM_VALID)
+				{
+					BlockNumber blocknum = operation->blocknum + operation->nblocks_done;
+
+					operation->nblocks_done += 1;
+					Assert(operation->nblocks_done <= operation->nblocks);
+
+					/*
+					 * Track this as a 'hit' for this backend. The backend
+					 * performing the IO will track it as a 'read'.
+					 */
+					TrackBufferHit(io_object, io_context,
+								   operation->rel, operation->persistence,
+								   operation->smgr, operation->forknum,
+								   blocknum);
+				}
+
+				/*
+				 * If the foreign IO failed and left the buffer invalid,
+				 * nblocks_done is not incremented. The retry loop below will
+				 * call AsyncReadBuffers() which will attempt the IO itself.
+				 */
+			}
+			else
+			{
+				/*
+				 * We now are sure the IO completed. Check the results. This
+				 * includes reporting on errors if there were any.
+				 */
+				ProcessReadBuffersResult(operation);
+			}
 		}
 
 		/*
@@ -2009,7 +2086,8 @@ WaitReadBuffers(ReadBuffersOperation *operation)
  * affected by the call. If the first buffer is valid, *nblocks_progress is
  * set to 1 and operation->nblocks_done is incremented.
  *
- * Returns true if IO was initiated, false if no IO was necessary.
+ * Returns true if IO was initiated or is already in progress (foreign IO),
+ * false if the buffer was already valid.
  */
 static bool
 AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
@@ -2028,6 +2106,7 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
 	IOContext	io_context;
 	IOObject	io_object;
 	instr_time	io_start;
+	PrepareReadBufferStatus status;
 
 	if (persistence == RELPERSISTENCE_TEMP)
 	{
@@ -2066,40 +2145,42 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
 		ioh = pgaio_io_acquire(CurrentResourceOwner, &operation->io_return);
 	}
 
+	operation->foreign_io = false;
 	pgaio_wref_clear(&operation->io_wref);
 
-	/*
-	 * Check if we can start IO on the first to-be-read buffer.
-	 *
-	 * If an I/O is already in progress in another backend, we want to wait
-	 * for the outcome: either done, or something went wrong and we will
-	 * retry.
-	 */
-	if (!PrepareHeadBufferReadIO(buffers[nblocks_done]))
+	/* Check if we can start IO on the first to-be-read buffer */
+	status = PrepareHeadBufferReadIO(operation, buffers[nblocks_done]);
+	if (status != READ_BUFFER_READY_FOR_IO)
 	{
-		/*
-		 * Someone has already completed this block, we're done.
-		 *
-		 * When IO is necessary, ->nblocks_done is updated in
-		 * ProcessReadBuffersResult(), but that is not called if no IO is
-		 * necessary. Thus update here.
-		 */
-		operation->nblocks_done += 1;
+		pgaio_io_release(ioh);
 		*nblocks_progress = 1;
+		if (status == READ_BUFFER_ALREADY_DONE)
+		{
+			/*
+			 * Someone has already completed this block, we're done.
+			 *
+			 * When IO is necessary, ->nblocks_done is updated in
+			 * ProcessReadBuffersResult(), but that is not called if no IO is
+			 * necessary. Thus update here.
+			 */
+			operation->nblocks_done += 1;
+			Assert(operation->nblocks_done <= operation->nblocks);
 
-		pgaio_io_release(ioh);
-		pgaio_wref_clear(&operation->io_wref);
+			/*
+			 * Report and track this as a 'hit' for this backend, even though
+			 * it must have started out as a miss in PinBufferForBlock(). The
+			 * other backend will track this as a 'read'.
+			 */
+			TrackBufferHit(io_object, io_context,
+						   operation->rel, operation->persistence,
+						   operation->smgr, operation->forknum,
+						   blocknum);
+			return false;
+		}
 
-		/*
-		 * Report and track this as a 'hit' for this backend, even though it
-		 * must have started out as a miss in PinBufferForBlock(). The other
-		 * backend will track this as a 'read'.
-		 */
-		TrackBufferHit(io_object, io_context,
-					   operation->rel, operation->persistence,
-					   operation->smgr, operation->forknum,
-					   blocknum);
-		return false;
+		/* The IO is already in-progress */
+		Assert(status == READ_BUFFER_IN_PROGRESS);
+		return true;
 	}
 
 	/*
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 4017896f951..dd41b92f944 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -144,9 +144,11 @@ struct ReadBuffersOperation
 	 */
 	Buffer	   *buffers;
 	BlockNumber blocknum;
-	int			flags;
+	uint16		flags;
 	int16		nblocks;
 	int16		nblocks_done;
+	/* true if waiting on another backend's IO */
+	bool		foreign_io;
 	PgAioWaitRef io_wref;
 	PgAioReturn io_return;
 };
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 340662cf72c..ffaea427952 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2365,6 +2365,7 @@ PredicateLockData
 PredicateLockTargetType
 PrefetchBufferResult
 PrepParallelRestorePtrType
+PrepareReadBufferStatus
 PrepareStmt
 PreparedStatement
 PresortedKeyData
-- 
2.43.0

