From fcd741083fdd02b541770ddb728db496102548ca Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Sun, 3 Aug 2025 23:07:56 +1200 Subject: [PATCH] aio: Fix pgaio_io_wait() for DEFINED and STAGED. Previously, pgaio_io_wait()'s cases for PGAIO_HS_DEFINED or PGAIO_HS_STAGED fell through to waiting for completion. The owner only promises to advance the IO to PGAIO_HS_SUBMITTED, so the waiter needs to be prepared to call wait_one() itself. Introduce a per-backend condition variable submit_cv, woken by pgaio_submit_staged(), and use it to wait for progress out of these states. XXX just a sketch XXX expensive and pessimistic... is there a way to avoid the broadcast or at least spinlock when no one is listening? --- src/backend/storage/aio/aio.c | 17 ++++++++++++++--- src/backend/storage/aio/aio_init.c | 1 + src/backend/utils/activity/wait_event_names.txt | 1 + src/include/storage/aio_internal.h | 3 +++ 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c index 3643f27ad6e..2a78f684f84 100644 --- a/src/backend/storage/aio/aio.c +++ b/src/backend/storage/aio/aio.c @@ -607,6 +607,18 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation) elog(ERROR, "IO in wrong state: %d", state); break; + /* waiting for owner to submit */ + case PGAIO_HS_DEFINED: + case PGAIO_HS_STAGED: + ConditionVariablePrepareToSleep(&ioh->cv); + while (!pgaio_io_was_recycled(ioh, ref_generation, &state) && + (state == PGAIO_HS_DEFINED || + state == PGAIO_HS_STAGED)) + ConditionVariableSleep(&pgaio_ctl->backend_state[ioh->owner_procno].submit_cv, + WAIT_EVENT_AIO_IO_SUBMIT); + ConditionVariableCancelSleep(); + continue; + case PGAIO_HS_SUBMITTED: /* @@ -621,9 +633,6 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation) } /* fallthrough */ - /* waiting for owner to submit */ - case PGAIO_HS_DEFINED: - case PGAIO_HS_STAGED: /* waiting for reaper to complete */ /* fallthrough */ case PGAIO_HS_COMPLETED_IO: @@ -1139,6 +1148,8 @@ 
pgaio_submit_staged(void) pgaio_my_backend->num_staged_ios = 0; + ConditionVariableBroadcast(&pgaio_my_backend->submit_cv); + pgaio_debug(DEBUG4, "aio: submitted %d IOs", total_submitted); diff --git a/src/backend/storage/aio/aio_init.c b/src/backend/storage/aio/aio_init.c index 885c3940c66..a065ad1463f 100644 --- a/src/backend/storage/aio/aio_init.c +++ b/src/backend/storage/aio/aio_init.c @@ -188,6 +188,7 @@ AioShmemInit(void) dclist_init(&bs->idle_ios); memset(bs->staged_ios, 0, sizeof(PgAioHandle *) * PGAIO_SUBMIT_BATCH_SIZE); + ConditionVariableInit(&bs->submit_cv); dclist_init(&bs->in_flight_ios); /* initialize per-backend IOs */ diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 0be307d2ca0..7d1dff2d6da 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -194,6 +194,7 @@ ABI_compatibility: Section: ClassName - WaitEventIO AIO_IO_COMPLETION "Waiting for another process to complete IO." +AIO_IO_SUBMIT "Waiting for another process to submit IO." AIO_IO_URING_SUBMIT "Waiting for IO submission via io_uring." AIO_IO_URING_EXECUTION "Waiting for IO execution via io_uring." BASEBACKUP_READ "Waiting for base backup to read from a file." diff --git a/src/include/storage/aio_internal.h b/src/include/storage/aio_internal.h index 2d37a243abe..5447d7c2bcf 100644 --- a/src/include/storage/aio_internal.h +++ b/src/include/storage/aio_internal.h @@ -208,6 +208,9 @@ typedef struct PgAioBackend uint16 num_staged_ios; PgAioHandle *staged_ios[PGAIO_SUBMIT_BATCH_SIZE]; + /* Other backends can wait for this backend's IOs to be submitted. */ + ConditionVariable submit_cv; + /* * List of in-flight IOs. Also contains IOs that aren't strictly speaking * in-flight anymore, but have been waited-for and completed by another -- 2.39.5 (Apple Git-154)