From 88283d2ceb1e17e3acd5f73a3a5f5ac3ffc4bfc4 Mon Sep 17 00:00:00 2001 From: Zhijie Hou Date: Fri, 21 Nov 2025 13:18:13 +0800 Subject: [PATCH v1] Fix the race condition of updating slot minimum LSN Previously, there was a race condition: if a backend creates a new slot and attempts to initialize the slot.restart_lsn during WAL reservation, but meanwhile, another backend invokes ReplicationSlotsComputeRequiredLSN(), the slot minimum LSN may be initially updated by the newly created slot, only to be subsequently overwritten by the backend running ReplicationSlotsComputeRequiredLSN() with a more recent LSN. This scenario could lead to the premature removal of WALs reserved by the new slot during a checkpoint, resulting in the newly created slot being invalidated. This commit closes this race condition by acquiring an exclusive ReplicationSlotControlLock when updating slot.restart_lsn during WAL reservation. Additionally, XLogSetReplicationSlotMinimumLSN() is placed under the protection of the ReplicationSlotControlLock. This serializes the update of slot.restart_lsn and the computation of the minimum LSN in other backends, ensuring that a more recent minimum LSN isn't computed while an older one is still being reserved. --- src/backend/replication/logical/slotsync.c | 10 ++++++++++ src/backend/replication/slot.c | 23 +++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/backend/replication/logical/slotsync.c b/src/backend/replication/logical/slotsync.c index 1c343d03d21..f4d46a63ccb 100644 --- a/src/backend/replication/logical/slotsync.c +++ b/src/backend/replication/logical/slotsync.c @@ -568,6 +568,14 @@ reserve_wal_for_local_slot(XLogRecPtr restart_lsn) */ LWLockAcquire(ReplicationSlotAllocationLock, LW_EXCLUSIVE); + /* + * Acquire an exclusive lock to prevent other backends from concurrently + * updating the minimum slot LSN.
In addition to the reason mentioned for + * this lock in ReplicationSlotReserveWal(), it also ensures the fetched + * minimum slot LSN remains safe when updating the slot.restart_lsn. + */ + LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE); + /* * Determine the minimum non-removable LSN by comparing the redo pointer * with the minimum slot LSN. @@ -593,6 +601,8 @@ reserve_wal_for_local_slot(XLogRecPtr restart_lsn) slot->data.restart_lsn = Max(restart_lsn, min_safe_lsn); SpinLockRelease(&slot->mutex); + LWLockRelease(ReplicationSlotControlLock); + ReplicationSlotsComputeRequiredLSN(); XLByteToSeg(slot->data.restart_lsn, segno, wal_segment_size); diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 4c47261c7f9..48d819ee333 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -1301,6 +1301,13 @@ ReplicationSlotsComputeRequiredLSN(void) Assert(ReplicationSlotCtl != NULL); + /* + * Hold the ReplicationSlotControlLock until after updating the slot's + * minimum LSN value. A detailed reason and analysis for the safety + * concerning xmin is provided in the comments of + * ReplicationSlotsComputeRequiredXmin(), which similarly applies here when + * considering the restart_lsn. + */ LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); for (i = 0; i < max_replication_slots; i++) { @@ -1346,9 +1353,10 @@ ReplicationSlotsComputeRequiredLSN(void) restart_lsn < min_required)) min_required = restart_lsn; } - LWLockRelease(ReplicationSlotControlLock); XLogSetReplicationSlotMinimumLSN(min_required); + + LWLockRelease(ReplicationSlotControlLock); } /* @@ -1738,10 +1746,23 @@ ReplicationSlotReserveWal(void) else restart_lsn = GetXLogInsertRecPtr(); + /* + * Hold the ReplicationSlotControlLock exclusive when updating the slot + * restart_lsn. 
Doing so ensures that other backends either wait for the + * restart_lsn update before computing the minimum LSN or include the + * updated restart_lsn in their minimum LSN computations. This prevents + * other backends from overwriting the minimum LSN with a position more + * recent than the WAL position being reserved, ensuring the WALs required + * by this slot are not prematurely removed during checkpoint. + */ + LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE); + SpinLockAcquire(&slot->mutex); slot->data.restart_lsn = restart_lsn; SpinLockRelease(&slot->mutex); + LWLockRelease(ReplicationSlotControlLock); + /* prevent WAL removal as fast as possible */ ReplicationSlotsComputeRequiredLSN(); -- 2.51.1.windows.1