From 6574ac9267fe9938f59ed67c8f0282716d8c28f3 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Sun, 3 Aug 2025 00:15:01 +1200
Subject: [PATCH v1 3/4] aio: Support I/O methods without true vectored I/O.

Currently, we rely on loops in our pg_preadv()/pg_pwritev() functions to
simulate vectored file I/O on Windows.  This change adds PgAioMethodOps
properties that declare whether true vectored I/O is available as an
efficient underlying operation, and teaches StartReadBuffers() to avoid
forming reads that cross into non-adjacent buffers if not.

The information is exposed as pgaio_have_vectored_file_io(direct_io),
which is then exposed via smgrvectorcombine(), and consumed by the I/O
combining logic in StartReadBuffers() to split the operation at a buffer
jump if necessary.

For example, this means that a direct I/O read that requires scattering
data to non-adjacent buffers will now be split into smaller reads
potentially executed concurrently, on Windows, instead of being
"secretly" handled serially inside pg_preadv().  (Independent patches to
add the missing support for actual vectored direct I/O on Windows would
then change the properties to match; as of this commit they describe the
current reality.)

In-development native I/O methods need this as they can't so easy fake
it, they need a 1:1 mapping between PgAioHandle and underlying OS
operation.

XXX find a way to make this determination at compile-time to avoid extra
code in builds where all I/O methods all say yes to everything, eg
Linux, FreeBSD?
---
 src/backend/storage/aio/aio.c             | 17 +++++++++++++++++
 src/backend/storage/aio/method_io_uring.c |  3 +++
 src/backend/storage/aio/method_sync.c     |  5 +++++
 src/backend/storage/aio/method_worker.c   | 17 +++++++++++++++++
 src/backend/storage/buffer/bufmgr.c       | 16 ++++++++++++++++
 src/backend/storage/smgr/md.c             |  9 +++++++++
 src/backend/storage/smgr/smgr.c           | 21 +++++++++++++++++++++
 src/include/storage/aio.h                 |  2 ++
 src/include/storage/aio_internal.h        |  7 +++++++
 src/include/storage/md.h                  |  1 +
 src/include/storage/smgr.h                |  1 +
 11 files changed, 99 insertions(+)

diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c
index 88a1991870b..5886eb4a267 100644
--- a/src/backend/storage/aio/aio.c
+++ b/src/backend/storage/aio/aio.c
@@ -93,6 +93,23 @@ static const IoMethodOps *const pgaio_method_ops_table[] = {
 const IoMethodOps *pgaio_method_ops;
 
 
+/* --------------------------------------------------------------------------------
+ * Public Functions related to feature availability of features
+ * --------------------------------------------------------------------------------
+ */
+
+/*
+ * Can the current io_method perform file operations with iovcnt > 1?
+ */
+bool
+pgaio_have_vectored_file_io(bool direct)
+{
+	if (direct)
+		return pgaio_method_ops->have_vectored_file_io_direct;
+	else
+		return pgaio_method_ops->have_vectored_file_io_buffered;
+}
+
 /* --------------------------------------------------------------------------------
  * Public Functions related to PgAioHandle
  * --------------------------------------------------------------------------------
diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c
index 0a8c054162f..935d4bab87d 100644
--- a/src/backend/storage/aio/method_io_uring.c
+++ b/src/backend/storage/aio/method_io_uring.c
@@ -60,6 +60,9 @@ static void pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe);
 
 
 const IoMethodOps pgaio_uring_ops = {
+	.have_vectored_file_io_buffered = true,
+	.have_vectored_file_io_direct = true,
+
 	/*
 	 * While io_uring mostly is OK with FDs getting closed while the IO is in
 	 * flight, that is not true for IOs submitted with IOSQE_ASYNC.
diff --git a/src/backend/storage/aio/method_sync.c b/src/backend/storage/aio/method_sync.c
index 902c2428d41..7936090c026 100644
--- a/src/backend/storage/aio/method_sync.c
+++ b/src/backend/storage/aio/method_sync.c
@@ -26,6 +26,11 @@ static int	pgaio_sync_submit(uint16 num_staged_ios, PgAioHandle **staged_ios);
 
 
 const IoMethodOps pgaio_sync_ops = {
+#if HAVE_DECL_PREADV && HAVE_DECL_PWRITEV
+	.have_vectored_file_io_buffered = true,
+	.have_vectored_file_io_direct = true,
+#endif
+
 	.needs_synchronous_execution = pgaio_sync_needs_synchronous_execution,
 	.submit = pgaio_sync_submit,
 };
diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c
index ba2d54f2839..782016c2cb9 100644
--- a/src/backend/storage/aio/method_worker.c
+++ b/src/backend/storage/aio/method_worker.c
@@ -82,6 +82,23 @@ static int	pgaio_worker_submit(uint16 num_staged_ios, PgAioHandle **staged_ios);
 
 
 const IoMethodOps pgaio_worker_ops = {
+	/*
+	 * Though Windows doesn't really have vectored I/O for buffered files, the
+	 * loop-based emulation in pg_iovec.h is probably OK in a worker due to
+	 * kernel read-ahead.
+	 */
+	.have_vectored_file_io_buffered = true,
+#if HAVE_DECL_PREADV && HAVE_DECL_PWRITEV
+
+	/*
+	 * We don't yet implement true scatter/gather operations on Windows, and
+	 * until then it seems better to split operations up into chunks that can
+	 * be executed concurrently rather than serially by pg_iovec.h when
+	 * preadv() and pwrite() are missing.
+	 */
+	.have_vectored_file_io_direct = true,
+#endif
+
 	.shmem_size = pgaio_worker_shmem_size,
 	.shmem_init = pgaio_worker_shmem_init,
 
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 67431208e7f..386aa658f8d 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1268,6 +1268,7 @@ StartReadBuffersImpl(ReadBuffersOperation *operation,
 {
 	int			actual_nblocks = *nblocks;
 	int			maxcombine = 0;
+	bool		vectorcombine = false;
 	bool		did_start_io;
 
 	Assert(*nblocks == 1 || allow_forwarding);
@@ -1373,6 +1374,7 @@ StartReadBuffersImpl(ReadBuffersOperation *operation,
 			 */
 			if (i == 0 && actual_nblocks > 1)
 			{
+				vectorcombine = smgrvectorcombine(operation->smgr);
 				maxcombine = smgrmaxcombine(operation->smgr,
 											operation->forknum,
 											blockNum);
@@ -1383,6 +1385,20 @@ StartReadBuffersImpl(ReadBuffersOperation *operation,
 					actual_nblocks = maxcombine;
 				}
 			}
+
+			/*
+			 * If true vectored I/O is not available, check if we've cross
+			 * into a non-consecutive buffer and need to rewind, to avoid
+			 * forming a read that can't be executed efficiently.  If so, the
+			 * discontiguous pinned buffer is forwarded to the next call.
+			 */
+			if (i > 0 && !vectorcombine)
+			{
+				if (BufferIsLocal(buffers[i]) ?
+					-buffers[i] != -buffers[i - 1] + 1 :
+					buffers[i] != buffers[i - 1] + 1)
+					actual_nblocks = i;
+			}
 		}
 	}
 	*nblocks = actual_nblocks;
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 2ccb0faceb5..e057ebb2b11 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -815,6 +815,15 @@ buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks)
 	return iovcnt;
 }
 
+/*
+ * mdvectorcombine() -- Report whether vectored operations are optimal.
+ */
+bool
+mdvectorcombine(SMgrRelation reln)
+{
+	return pgaio_have_vectored_file_io(io_direct_flags & IO_DIRECT_DATA);
+}
+
 /*
  * mdmaxcombine() -- Return the maximum number of total blocks that can be
  *				 combined with an IO starting at blocknum.
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index bce37a36d51..ae3be3616a5 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -102,6 +102,7 @@ typedef struct f_smgr
 									BlockNumber blocknum, int nblocks, bool skipFsync);
 	bool		(*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
 								  BlockNumber blocknum, int nblocks);
+	bool		(*smgr_vectorcombine) (SMgrRelation reln);
 	uint32		(*smgr_maxcombine) (SMgrRelation reln, ForkNumber forknum,
 									BlockNumber blocknum);
 	void		(*smgr_readv) (SMgrRelation reln, ForkNumber forknum,
@@ -138,6 +139,7 @@ static const f_smgr smgrsw[] = {
 		.smgr_extend = mdextend,
 		.smgr_zeroextend = mdzeroextend,
 		.smgr_prefetch = mdprefetch,
+		.smgr_vectorcombine = mdvectorcombine,
 		.smgr_maxcombine = mdmaxcombine,
 		.smgr_readv = mdreadv,
 		.smgr_startreadv = mdstartreadv,
@@ -687,6 +689,25 @@ smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	return ret;
 }
 
+/*
+ * smgrvectorcombine() - Report whether vectored reads/writes are supported.
+ *
+ * If false, then the vectored I/O functions can still be called with
+ * non-adjacent buffers, but may generate internal retries or looping, and the
+ * caller might prefer to avoid that and generate concurrent operations.
+ */
+bool
+smgrvectorcombine(SMgrRelation reln)
+{
+	bool		ret;
+
+	HOLD_INTERRUPTS();
+	ret = smgrsw[reln->smgr_which].smgr_vectorcombine(reln);
+	RESUME_INTERRUPTS();
+
+	return ret;
+}
+
 /*
  * smgrmaxcombine() - Return the maximum number of total blocks that can be
  *				 combined with an IO starting at blocknum.
diff --git a/src/include/storage/aio.h b/src/include/storage/aio.h
index 2933eea0649..f6d93719f0c 100644
--- a/src/include/storage/aio.h
+++ b/src/include/storage/aio.h
@@ -274,6 +274,8 @@ struct PgAioHandleCallbacks
  */
 
 /* functions in aio.c */
+extern bool pgaio_have_vectored_file_io(bool direct);
+
 struct ResourceOwnerData;
 extern PgAioHandle *pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret);
 extern PgAioHandle *pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret);
diff --git a/src/include/storage/aio_internal.h b/src/include/storage/aio_internal.h
index a8e1e442bb3..7662d665082 100644
--- a/src/include/storage/aio_internal.h
+++ b/src/include/storage/aio_internal.h
@@ -255,6 +255,13 @@ typedef struct IoMethodOps
 {
 	/* properties */
 
+	/*
+	 * Whether vectored I/O operations with iovcnt > 1 can be executed for
+	 * buffered I/O and direct I/O file descriptors.
+	 */
+	bool		have_vectored_file_io_buffered;
+	bool		have_vectored_file_io_direct;
+
 	/*
 	 * If an FD is about to be closed, do we need to wait for all in-flight
 	 * IOs referencing that FD?
diff --git a/src/include/storage/md.h b/src/include/storage/md.h
index b563c27abf0..e69ff228312 100644
--- a/src/include/storage/md.h
+++ b/src/include/storage/md.h
@@ -35,6 +35,7 @@ extern void mdzeroextend(SMgrRelation reln, ForkNumber forknum,
 						 BlockNumber blocknum, int nblocks, bool skipFsync);
 extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum, int nblocks);
+extern bool mdvectorcombine(SMgrRelation reln);
 extern uint32 mdmaxcombine(SMgrRelation reln, ForkNumber forknum,
 						   BlockNumber blocknum);
 extern void mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 3964d9334b3..bf836d1f1b5 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -95,6 +95,7 @@ extern void smgrzeroextend(SMgrRelation reln, ForkNumber forknum,
 						   BlockNumber blocknum, int nblocks, bool skipFsync);
 extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum,
 						 BlockNumber blocknum, int nblocks);
+extern bool smgrvectorcombine(SMgrRelation reln);
 extern uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum,
 							 BlockNumber blocknum);
 extern void smgrreadv(SMgrRelation reln, ForkNumber forknum,
-- 
2.39.5 (Apple Git-154)