From ec2e1e21054f00918d3b28ce01129bc06de37790 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Sun, 3 Aug 2025 23:07:56 +1200 Subject: [PATCH v2 1/2] aio: Fix pgaio_io_wait() for staged IOs. Previously, pgaio_io_wait()'s cases for PGAIO_HS_DEFINED and PGAIO_HS_STAGED fell through to waiting for completion. The owner only promises to advance it to PGAIO_HS_SUBMITTED. The waiter needs to be prepared to call ->wait_one() itself once the IO is submitted in order to guarantee progress and avoid deadlocks on IO methods that provide ->wait_one(). Introduce a new per-backend condition variable submit_cv, woken by by pgaio_submit_stage(), and use it to wait for the state to advance. The new broadcast doesn't seem to cause any measurable slowdown, so ideas for optimizing the common no-waiters case were abandoned for now. It may not be possible to reach any real deadlock with existing AIO users, but that situation could change. There's also no reason the waiter shouldn't begin to wait via the IO method as soon as possible even without a deadlock. Picked up by testing a proposed IO method that has ->wait_one(), like io_method=io_uring, and code review. Backpatch-through: 18 Reviewed-by: Andres Freund Discussion: https://postgr.es/m/CA%2BhUKG%2BmZYrSdnhk-XrBYO18H829K77S9gMKUsykOiTJtqB43g%40mail.gmail.com --- src/backend/storage/aio/aio.c | 26 ++++++++++++++++--- src/backend/storage/aio/aio_init.c | 1 + .../utils/activity/wait_event_names.txt | 1 + src/include/storage/aio_internal.h | 7 +++++ 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c index 3643f27ad6e..87eb1f71705 100644 --- a/src/backend/storage/aio/aio.c +++ b/src/backend/storage/aio/aio.c @@ -607,6 +607,23 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation) elog(ERROR, "IO in wrong state: %d", state); break; + case PGAIO_HS_DEFINED: + case PGAIO_HS_STAGED: + + /* + * The owner hasn't submitted the IO yet. Wait for it to do + * so. Only then can this backend wait via the IO method if + * required, or otherwise ioh->cv. + */ + ConditionVariablePrepareToSleep(&pgaio_ctl->backend_state[ioh->owner_procno].submit_cv); + while (!pgaio_io_was_recycled(ioh, ref_generation, &state) && + (state == PGAIO_HS_DEFINED || + state == PGAIO_HS_STAGED)) + ConditionVariableSleep(&pgaio_ctl->backend_state[ioh->owner_procno].submit_cv, + WAIT_EVENT_AIO_IO_SUBMIT); + ConditionVariableCancelSleep(); + continue; + case PGAIO_HS_SUBMITTED: /* @@ -621,9 +638,6 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation) } /* fallthrough */ - /* waiting for owner to submit */ - case PGAIO_HS_DEFINED: - case PGAIO_HS_STAGED: /* waiting for reaper to complete */ /* fallthrough */ case PGAIO_HS_COMPLETED_IO: @@ -1139,6 +1153,12 @@ pgaio_submit_staged(void) pgaio_my_backend->num_staged_ios = 0; + /* + * Wake any backend that started waiting for any of these IOs before + * submission, so that it can call ->wait_one() if necessary. + */ + ConditionVariableBroadcast(&pgaio_my_backend->submit_cv); + pgaio_debug(DEBUG4, "aio: submitted %d IOs", total_submitted); diff --git a/src/backend/storage/aio/aio_init.c b/src/backend/storage/aio/aio_init.c index 885c3940c66..a065ad1463f 100644 --- a/src/backend/storage/aio/aio_init.c +++ b/src/backend/storage/aio/aio_init.c @@ -188,6 +188,7 @@ AioShmemInit(void) dclist_init(&bs->idle_ios); memset(bs->staged_ios, 0, sizeof(PgAioHandle *) * PGAIO_SUBMIT_BATCH_SIZE); + ConditionVariableInit(&bs->submit_cv); dclist_init(&bs->in_flight_ios); /* initialize per-backend IOs */ diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 0be307d2ca0..7d1dff2d6da 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -194,6 +194,7 @@ ABI_compatibility: Section: ClassName - WaitEventIO AIO_IO_COMPLETION "Waiting for another process to complete IO." +AIO_IO_SUBMIT "Waiting for another process to submit IO." AIO_IO_URING_SUBMIT "Waiting for IO submission via io_uring." AIO_IO_URING_EXECUTION "Waiting for IO execution via io_uring." BASEBACKUP_READ "Waiting for base backup to read from a file." diff --git a/src/include/storage/aio_internal.h b/src/include/storage/aio_internal.h index 2d37a243abe..3e438b587d9 100644 --- a/src/include/storage/aio_internal.h +++ b/src/include/storage/aio_internal.h @@ -208,6 +208,13 @@ typedef struct PgAioBackend uint16 num_staged_ios; PgAioHandle *staged_ios[PGAIO_SUBMIT_BATCH_SIZE]; + /* + * Other backends sometimes need to wait for the owning backend to submit. + * The per-IO CV would work for that purpose, but a per-backend CV allows + * for just one broadcast per submitted batch. + */ + ConditionVariable submit_cv; + /* * List of in-flight IOs. Also contains IOs that aren't strictly speaking * in-flight anymore, but have been waited-for and completed by another -- 2.39.5 (Apple Git-154)