From f60a004116bc21440b8769c75f11126ac4ef667f Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Thu, 10 Apr 2025 11:12:41 +0900
Subject: [PATCH v5] Fix sync_standbys_defined race on startup.

WalSndCtl->sync_standbys_defined, can be updated by checkpointer process
only and in backends it can be read. So, on postgres startup, we have
race between checkpointer process and backends.

Notes: blah..
---
 src/include/replication/walsender_private.h | 23 +++++-
 src/backend/replication/syncrep.c           | 85 +++++++++++++++++----
 src/backend/replication/walsender.c         |  6 +-
 3 files changed, 92 insertions(+), 22 deletions(-)

diff --git a/src/include/replication/walsender_private.h b/src/include/replication/walsender_private.h
index 0fc77f1b4af7..92243cce1d04 100644
--- a/src/include/replication/walsender_private.h
+++ b/src/include/replication/walsender_private.h
@@ -96,11 +96,11 @@ typedef struct
 	XLogRecPtr	lsn[NUM_SYNC_REP_WAIT_MODE];
 
 	/*
-	 * Are any sync standbys defined?  Waiting backends can't reload the
-	 * config file safely, so checkpointer updates this value as needed.
-	 * Protected by SyncRepLock.
+	 * Status of data related to the synchronous standbys.  Waiting backends
+	 * can't reload the config file safely, so checkpointer updates this value
+	 * as needed. Protected by SyncRepLock.
 	 */
-	bool		sync_standbys_defined;
+	bits8		sync_standbys_status;
 
 	/* used as a registry of physical / logical walsenders to wake */
 	ConditionVariable wal_flush_cv;
@@ -116,6 +116,21 @@ typedef struct
 	WalSnd		walsnds[FLEXIBLE_ARRAY_MEMBER];
 } WalSndCtlData;
 
+/* Flags for WalSndCtlData->sync_standbys_status */
+
+/*
+ * Is the synchronous standby data initialized from the GUC?  This is flipped
+ * the first time synchronous_standby_names is processed by the checkpointer.
+ */
+#define SYNC_STANDBY_INIT			(1 << 0)
+
+/*
+ * Is the synchronous standby data defined?  This is set when
+ * synchronous_standby_names has some data, after being processed by the
+ * checkpointer.
+ */
+#define SYNC_STANDBY_DEFINED		(1 << 1)
+
 extern PGDLLIMPORT WalSndCtlData *WalSndCtl;
 
 
diff --git a/src/backend/replication/syncrep.c b/src/backend/replication/syncrep.c
index d75e39680355..4c9f550f7dc3 100644
--- a/src/backend/replication/syncrep.c
+++ b/src/backend/replication/syncrep.c
@@ -161,16 +161,23 @@ SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
 	 * sync replication standby names defined.
 	 *
 	 * Since this routine gets called every commit time, it's important to
-	 * exit quickly if sync replication is not requested. So we check
-	 * WalSndCtl->sync_standbys_defined flag without the lock and exit
-	 * immediately if it's false. If it's true, we need to check it again
-	 * later while holding the lock, to check the flag and operate the sync
-	 * rep queue atomically. This is necessary to avoid the race condition
-	 * described in SyncRepUpdateSyncStandbysDefined(). On the other hand, if
-	 * it's false, the lock is not necessary because we don't touch the queue.
+	 * exit quickly if sync replication is not requested.
+	 *
+	 * We check WalSndCtl->sync_standbys_status flag without the lock and exit
+	 * immediately if SYNC_STANDBY_INIT is set (the checkpointer has
+	 * initialized this data) but SYNC_STANDBY_DEFINED is missing (no sync
+	 * replication requested).
+	 *
+	 * If SYNC_STANDBY_DEFINED is set, we need to check the status again later
+	 * while holding the lock, to check the flag and operate the sync rep
+	 * queue atomically.  This is necessary to avoid the race condition
+	 * described in SyncRepUpdateSyncStandbysDefined().  On the other hand, if
+	 * SYNC_STANDBY_DEFINED is not set, the lock is not necessary because we
+	 * don't touch the queue.
 	 */
 	if (!SyncRepRequested() ||
-		!((volatile WalSndCtlData *) WalSndCtl)->sync_standbys_defined)
+		((((volatile WalSndCtlData *) WalSndCtl)->sync_standbys_status) &
+		 (SYNC_STANDBY_INIT | SYNC_STANDBY_DEFINED)) == SYNC_STANDBY_INIT)
 		return;
 
 	/* Cap the level for anything other than commit to remote flush only. */
@@ -186,16 +193,38 @@ SyncRepWaitForLSN(XLogRecPtr lsn, bool commit)
 	Assert(MyProc->syncRepState == SYNC_REP_NOT_WAITING);
 
 	/*
-	 * We don't wait for sync rep if WalSndCtl->sync_standbys_defined is not
-	 * set.  See SyncRepUpdateSyncStandbysDefined.
+	 * We don't wait for sync rep if SYNC_STANDBY_DEFINED is not set.  See
+	 * SyncRepUpdateSyncStandbysDefined().
 	 *
 	 * Also check that the standby hasn't already replied. Unlikely race
 	 * condition but we'll be fetching that cache line anyway so it's likely
 	 * to be a low cost check.
+	 *
+	 * If the sync standby data has not been initialized yet
+	 * (SYNC_STANDBY_INIT is not set), then fall back to a direct GUC check.
 	 */
-	if (!WalSndCtl->sync_standbys_defined ||
-		lsn <= WalSndCtl->lsn[mode])
+	if (WalSndCtl->sync_standbys_status & SYNC_STANDBY_INIT)
 	{
+		if ((WalSndCtl->sync_standbys_status & SYNC_STANDBY_DEFINED) == 0 ||
+			lsn <= WalSndCtl->lsn[mode])
+		{
+			LWLockRelease(SyncRepLock);
+			return;
+		}
+	}
+	else if (!SyncStandbysDefined())
+	{
+		/*
+		 * If we are here, the sync standby data has not been initialized yet,
+		 * so fall back to the best thing we can do: a check on
+		 * SyncStandbysDefined() to see if the GUC is set or not.
+		 *
+		 * If the GUC has a value, we wait until the checkpointer updates the
+		 * status data because we cannot be sure yet if we should wait or not.
+		 * If the GUC has *no* value, we are sure that there is no point to
+		 * wait; this matters for example when initializing a cluster, where
+		 * we should never wait, and no sync standbys is the default behavior.
+		 */
 		LWLockRelease(SyncRepLock);
 		return;
 	}
@@ -912,7 +941,7 @@ SyncRepWakeQueue(bool all, int mode)
 
 /*
  * The checkpointer calls this as needed to update the shared
- * sync_standbys_defined flag, so that backends don't remain permanently wedged
+ * sync_standbys_status flag, so that backends don't remain permanently wedged
  * if synchronous_standby_names is unset.  It's safe to check the current value
  * without the lock, because it's only ever updated by one process.  But we
  * must take the lock to change it.
@@ -922,8 +951,11 @@ SyncRepUpdateSyncStandbysDefined(void)
 {
 	bool		sync_standbys_defined = SyncStandbysDefined();
 
-	if (sync_standbys_defined != WalSndCtl->sync_standbys_defined)
+	if (sync_standbys_defined !=
+		((WalSndCtl->sync_standbys_status & SYNC_STANDBY_DEFINED) != 0))
 	{
+		pg_usleep(2 * 1000 * 1000L);	/* 2s */
+
 		LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
 
 		/*
@@ -946,7 +978,30 @@ SyncRepUpdateSyncStandbysDefined(void)
 		 * backend that hasn't yet reloaded its config might go to sleep on
 		 * the queue (and never wake up).  This prevents that.
 		 */
-		WalSndCtl->sync_standbys_defined = sync_standbys_defined;
+		WalSndCtl->sync_standbys_status = SYNC_STANDBY_INIT |
+			(sync_standbys_defined ? SYNC_STANDBY_DEFINED : 0);
+
+		LWLockRelease(SyncRepLock);
+	}
+	else if ((WalSndCtl->sync_standbys_status & SYNC_STANDBY_INIT) == 0)
+	{
+		LWLockAcquire(SyncRepLock, LW_EXCLUSIVE);
+
+		/*
+		 * Note that there is no need to wake up the queues here.  We would
+		 * reach this path only if SyncStandbysDefined() returns false, or it
+		 * would mean that some backends are waiting with the GUC set.  See
+		 * SyncRepWaitForLSN().
+		 */
+		Assert(!SyncStandbysDefined());
+
+		/*
+		 * Even if there is no sync standby defined, let the readers of this
+		 * information know that the sync standby data has been initialized.
+		 * This can just be done once, hence the previous check on
+		 * SYNC_STANDBY_INIT to avoid useless work.
+		 */
+		WalSndCtl->sync_standbys_status |= SYNC_STANDBY_INIT;
 
 		LWLockRelease(SyncRepLock);
 	}
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index 216baeda5cd2..987a7336ec84 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -1674,13 +1674,13 @@ WalSndUpdateProgress(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId
 	 * When skipping empty transactions in synchronous replication, we send a
 	 * keepalive message to avoid delaying such transactions.
 	 *
-	 * It is okay to check sync_standbys_defined flag without lock here as in
-	 * the worst case we will just send an extra keepalive message when it is
+	 * It is okay to check sync_standbys_status without lock here as in the
+	 * worst case we will just send an extra keepalive message when it is
 	 * really not required.
 	 */
 	if (skipped_xact &&
 		SyncRepRequested() &&
-		((volatile WalSndCtlData *) WalSndCtl)->sync_standbys_defined)
+		(((volatile WalSndCtlData *) WalSndCtl)->sync_standbys_status & SYNC_STANDBY_DEFINED))
 	{
 		WalSndKeepalive(false, lsn);
 
-- 
2.49.0