From 6f32238f119236322dfb16262a88c3a9c5141413 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathandbossart@gmail.com>
Date: Thu, 15 Dec 2022 14:11:43 -0800
Subject: [PATCH v1 1/1] handle race condition when restarting WAL receivers

---
 src/backend/postmaster/postmaster.c | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index eac3450774..9d6f58e0b3 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -1601,9 +1601,9 @@ checkControlFile(void)
  *
  * In normal conditions we wait at most one minute, to ensure that the other
  * background tasks handled by ServerLoop get done even when no requests are
- * arriving.  However, if there are background workers waiting to be started,
- * we don't actually sleep so that they are quickly serviced.  Other exception
- * cases are as shown in the code.
+ * arriving.  However, if there are background workers or a WAL receiver
+ * waiting to be started, we make sure they are quickly serviced.  Other
+ * exception cases are as shown in the code.
  */
 static void
 DetermineSleepTime(struct timeval *timeout)
@@ -1611,11 +1611,12 @@ DetermineSleepTime(struct timeval *timeout)
 	TimestampTz next_wakeup = 0;
 
 	/*
-	 * Normal case: either there are no background workers at all, or we're in
-	 * a shutdown sequence (during which we ignore bgworkers altogether).
+	 * Normal case: either no background worker is waiting to be started or has
+	 * crashed and no WAL receiver has been requested, or we're in a shutdown
+	 * sequence (during which we ignore bgworkers and the WAL receiver).
 	 */
 	if (Shutdown > NoShutdown ||
-		(!StartWorkerNeeded && !HaveCrashedWorker))
+		(!StartWorkerNeeded && !HaveCrashedWorker && !WalReceiverRequested))
 	{
 		if (AbortStartTime != 0)
 		{
@@ -1674,6 +1675,21 @@ DetermineSleepTime(struct timeval *timeout)
 		}
 	}
 
+	/*
+	 * If WalReceiverRequested is set, we're probably waiting on a SIGCHLD to
+	 * arrive to clear WalReceiverPID before starting the new WAL receiver.  We
+	 * don't expect that to take long, so limit the sleep to 100ms so that we
+	 * start the new WAL receiver promptly.
+	 */
+	if (WalReceiverRequested)
+	{
+		TimestampTz walrcv_wakeup;
+
+		walrcv_wakeup = TimestampTzPlusMilliseconds(GetCurrentTimestamp(), 100);
+		if (next_wakeup == 0 || walrcv_wakeup < next_wakeup)
+			next_wakeup = walrcv_wakeup;
+	}
+
 	if (next_wakeup != 0)
 	{
 		long		secs;
-- 
2.25.1

