From 6574ac9267fe9938f59ed67c8f0282716d8c28f3 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Sun, 3 Aug 2025 00:15:01 +1200 Subject: [PATCH v1 3/4] aio: Support I/O methods without true vectored I/O. Currently, we rely on loops in our pg_preadv()/pg_pwritev() functions to simulate vectored file I/O on Windows. This change adds PgAioMethodOps properties that declare whether true vectored I/O is available as an efficient underlying operation, and teaches StartReadBuffers() to avoid forming reads that cross into non-adjacent buffers if not. The information is exposed as pgaio_have_vectored_file_io(direct_io), which is then exposed via smgrvectorcombine(), and consumed by the I/O combining logic in StartReadBuffers() to split the operation at a buffer jump if necessary. For example, this means that a direct I/O read that requires scattering data to non-adjacent buffers will now be split into smaller reads potentially executed concurrently, on Windows, instead of being "secretly" handled serially inside pg_preadv(). (Independent patches to add the missing support for actual vectored direct I/O on Windows would then change the properties to match; as of this commit they describe the current reality.) In-development native I/O methods need this as they can't so easy fake it, they need a 1:1 mapping between PgAioHandle and underlying OS operation. XXX find a way to make this determination at compile-time to avoid extra code in builds where all I/O methods all say yes to everything, eg Linux, FreeBSD? --- src/backend/storage/aio/aio.c | 17 +++++++++++++++++ src/backend/storage/aio/method_io_uring.c | 3 +++ src/backend/storage/aio/method_sync.c | 5 +++++ src/backend/storage/aio/method_worker.c | 17 +++++++++++++++++ src/backend/storage/buffer/bufmgr.c | 16 ++++++++++++++++ src/backend/storage/smgr/md.c | 9 +++++++++ src/backend/storage/smgr/smgr.c | 21 +++++++++++++++++++++ src/include/storage/aio.h | 2 ++ src/include/storage/aio_internal.h | 7 +++++++ src/include/storage/md.h | 1 + src/include/storage/smgr.h | 1 + 11 files changed, 99 insertions(+) diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c index 88a1991870b..5886eb4a267 100644 --- a/src/backend/storage/aio/aio.c +++ b/src/backend/storage/aio/aio.c @@ -93,6 +93,23 @@ static const IoMethodOps *const pgaio_method_ops_table[] = { const IoMethodOps *pgaio_method_ops; +/* -------------------------------------------------------------------------------- + * Public Functions related to feature availability of features + * -------------------------------------------------------------------------------- + */ + +/* + * Can the current io_method perform file operations with iovcnt > 1? + */ +bool +pgaio_have_vectored_file_io(bool direct) +{ + if (direct) + return pgaio_method_ops->have_vectored_file_io_direct; + else + return pgaio_method_ops->have_vectored_file_io_buffered; +} + /* -------------------------------------------------------------------------------- * Public Functions related to PgAioHandle * -------------------------------------------------------------------------------- diff --git a/src/backend/storage/aio/method_io_uring.c b/src/backend/storage/aio/method_io_uring.c index 0a8c054162f..935d4bab87d 100644 --- a/src/backend/storage/aio/method_io_uring.c +++ b/src/backend/storage/aio/method_io_uring.c @@ -60,6 +60,9 @@ static void pgaio_uring_sq_from_io(PgAioHandle *ioh, struct io_uring_sqe *sqe); const IoMethodOps pgaio_uring_ops = { + .have_vectored_file_io_buffered = true, + .have_vectored_file_io_direct = true, + /* * While io_uring mostly is OK with FDs getting closed while the IO is in * flight, that is not true for IOs submitted with IOSQE_ASYNC. diff --git a/src/backend/storage/aio/method_sync.c b/src/backend/storage/aio/method_sync.c index 902c2428d41..7936090c026 100644 --- a/src/backend/storage/aio/method_sync.c +++ b/src/backend/storage/aio/method_sync.c @@ -26,6 +26,11 @@ static int pgaio_sync_submit(uint16 num_staged_ios, PgAioHandle **staged_ios); const IoMethodOps pgaio_sync_ops = { +#if HAVE_DECL_PREADV && HAVE_DECL_PWRITEV + .have_vectored_file_io_buffered = true, + .have_vectored_file_io_direct = true, +#endif + .needs_synchronous_execution = pgaio_sync_needs_synchronous_execution, .submit = pgaio_sync_submit, }; diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c index ba2d54f2839..782016c2cb9 100644 --- a/src/backend/storage/aio/method_worker.c +++ b/src/backend/storage/aio/method_worker.c @@ -82,6 +82,23 @@ static int pgaio_worker_submit(uint16 num_staged_ios, PgAioHandle **staged_ios); const IoMethodOps pgaio_worker_ops = { + /* + * Though Windows doesn't really have vectored I/O for buffered files, the + * loop-based emulation in pg_iovec.h is probably OK in a worker due to + * kernel read-ahead. + */ + .have_vectored_file_io_buffered = true, +#if HAVE_DECL_PREADV && HAVE_DECL_PWRITEV + + /* + * We don't yet implement true scatter/gather operations on Windows, and + * until then it seems better to split operations up into chunks that can + * be executed concurrently rather than serially by pg_iovec.h when + * preadv() and pwrite() are missing. + */ + .have_vectored_file_io_direct = true, +#endif + .shmem_size = pgaio_worker_shmem_size, .shmem_init = pgaio_worker_shmem_init, diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 67431208e7f..386aa658f8d 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -1268,6 +1268,7 @@ StartReadBuffersImpl(ReadBuffersOperation *operation, { int actual_nblocks = *nblocks; int maxcombine = 0; + bool vectorcombine = false; bool did_start_io; Assert(*nblocks == 1 || allow_forwarding); @@ -1373,6 +1374,7 @@ StartReadBuffersImpl(ReadBuffersOperation *operation, */ if (i == 0 && actual_nblocks > 1) { + vectorcombine = smgrvectorcombine(operation->smgr); maxcombine = smgrmaxcombine(operation->smgr, operation->forknum, blockNum); @@ -1383,6 +1385,20 @@ StartReadBuffersImpl(ReadBuffersOperation *operation, actual_nblocks = maxcombine; } } + + /* + * If true vectored I/O is not available, check if we've cross + * into a non-consecutive buffer and need to rewind, to avoid + * forming a read that can't be executed efficiently. If so, the + * discontiguous pinned buffer is forwarded to the next call. + */ + if (i > 0 && !vectorcombine) + { + if (BufferIsLocal(buffers[i]) ? + -buffers[i] != -buffers[i - 1] + 1 : + buffers[i] != buffers[i - 1] + 1) + actual_nblocks = i; + } } } *nblocks = actual_nblocks; diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 2ccb0faceb5..e057ebb2b11 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -815,6 +815,15 @@ buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks) return iovcnt; } +/* + * mdvectorcombine() -- Report whether vectored operations are optimal. + */ +bool +mdvectorcombine(SMgrRelation reln) +{ + return pgaio_have_vectored_file_io(io_direct_flags & IO_DIRECT_DATA); +} + /* * mdmaxcombine() -- Return the maximum number of total blocks that can be * combined with an IO starting at blocknum. diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index bce37a36d51..ae3be3616a5 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -102,6 +102,7 @@ typedef struct f_smgr BlockNumber blocknum, int nblocks, bool skipFsync); bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks); + bool (*smgr_vectorcombine) (SMgrRelation reln); uint32 (*smgr_maxcombine) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); void (*smgr_readv) (SMgrRelation reln, ForkNumber forknum, @@ -138,6 +139,7 @@ static const f_smgr smgrsw[] = { .smgr_extend = mdextend, .smgr_zeroextend = mdzeroextend, .smgr_prefetch = mdprefetch, + .smgr_vectorcombine = mdvectorcombine, .smgr_maxcombine = mdmaxcombine, .smgr_readv = mdreadv, .smgr_startreadv = mdstartreadv, @@ -687,6 +689,25 @@ smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, return ret; } +/* + * smgrvectorcombine() - Report whether vectored reads/writes are supported. + * + * If false, then the vectored I/O functions can still be called with + * non-adjacent buffers, but may generate internal retries or looping, and the + * caller might prefer to avoid that and generate concurrent operations. + */ +bool +smgrvectorcombine(SMgrRelation reln) +{ + bool ret; + + HOLD_INTERRUPTS(); + ret = smgrsw[reln->smgr_which].smgr_vectorcombine(reln); + RESUME_INTERRUPTS(); + + return ret; +} + /* * smgrmaxcombine() - Return the maximum number of total blocks that can be * combined with an IO starting at blocknum. diff --git a/src/include/storage/aio.h b/src/include/storage/aio.h index 2933eea0649..f6d93719f0c 100644 --- a/src/include/storage/aio.h +++ b/src/include/storage/aio.h @@ -274,6 +274,8 @@ struct PgAioHandleCallbacks */ /* functions in aio.c */ +extern bool pgaio_have_vectored_file_io(bool direct); + struct ResourceOwnerData; extern PgAioHandle *pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret); extern PgAioHandle *pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret); diff --git a/src/include/storage/aio_internal.h b/src/include/storage/aio_internal.h index a8e1e442bb3..7662d665082 100644 --- a/src/include/storage/aio_internal.h +++ b/src/include/storage/aio_internal.h @@ -255,6 +255,13 @@ typedef struct IoMethodOps { /* properties */ + /* + * Whether vectored I/O operations with iovcnt > 1 can be executed for + * buffered I/O and direct I/O file descriptors. + */ + bool have_vectored_file_io_buffered; + bool have_vectored_file_io_direct; + /* * If an FD is about to be closed, do we need to wait for all in-flight * IOs referencing that FD? diff --git a/src/include/storage/md.h b/src/include/storage/md.h index b563c27abf0..e69ff228312 100644 --- a/src/include/storage/md.h +++ b/src/include/storage/md.h @@ -35,6 +35,7 @@ extern void mdzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync); extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks); +extern bool mdvectorcombine(SMgrRelation reln); extern uint32 mdmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); extern void mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 3964d9334b3..bf836d1f1b5 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -95,6 +95,7 @@ extern void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync); extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks); +extern bool smgrvectorcombine(SMgrRelation reln); extern uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); extern void smgrreadv(SMgrRelation reln, ForkNumber forknum, -- 2.39.5 (Apple Git-154)