From ea5df7ebf90900e31e002317a3320cb66a8ef31a Mon Sep 17 00:00:00 2001 From: Zhijie Hou Date: Thu, 17 Apr 2025 17:14:24 +0800 Subject: [PATCH] Fix assertion failure when decoding synced two-phase enabled slots. Currently, slot synchronization skips updating the confirmed_lsn if it detects that the catalog_xmin or restart_lsn of the synced slot has already surpassed those of the remote slot on the primary. This behavior poses a problem when two-phase commit is enabled on the remote slot. The lack of synchronization between the latest confirmed_lsn and two_phase_at may result in transactions prepared between the old confirmed_lsn and two_phase_at being unexpectedly decoded and sent to subscribers following a promotion. To fix this, we keep confirmed_flush updated even if the local catalog_xmin or restart_lsn is more recent. --- src/backend/replication/logical/slotsync.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/backend/replication/logical/slotsync.c b/src/backend/replication/logical/slotsync.c index e22d41891e6..2a12167cdaf 100644 --- a/src/backend/replication/logical/slotsync.c +++ b/src/backend/replication/logical/slotsync.c @@ -219,6 +219,21 @@ update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid, LSN_FORMAT_ARGS(slot->data.restart_lsn), slot->data.catalog_xmin)); + /* + * Keep confirmed_flush updated even if catalog_xmin or restart_lsn + * advances beyond the values of the remote slot. This is necessary + * when two-phase is enabled on the remote slot, as failing to sync the + * latest confirmed_lsn along with two_phase_at can lead to the + * transactions between the old confirmed_lsn and two_phase_at being + * unexpectedly decoded and sent to the subscriber. 
+ */ + if (remote_slot->confirmed_lsn > remote_slot->restart_lsn) + { + SpinLockAcquire(&slot->mutex); + slot->data.confirmed_flush = remote_slot->confirmed_lsn; + SpinLockRelease(&slot->mutex); + } + if (remote_slot_precedes) *remote_slot_precedes = true; } -- 2.30.0.windows.2