From 97b4983b1443d03525b0565eb104b359a43044af Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Tue, 14 Jan 2025 00:25:01 -0500
Subject: [PATCH v2.3 05/30] WIP: postmaster: Commonalize FatalError paths

This includes some behavioural changes:

- Previously PM_WAIT_XLOG_ARCHIVAL wasn't handled in HandleFatalError(), that
  doesn't seem quite right.
- Failure to fork checkpointer now transitions through PM_WAIT_BACKENDS, like
  child crashes. That's not necessarily great, but...

Author:
Reviewed-by:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/backend/postmaster/postmaster.c | 61 +++++++++++++++++++++++------
 1 file changed, 49 insertions(+), 12 deletions(-)

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 13d49eecd22..41f2bbc214c 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -2693,12 +2693,47 @@ HandleFatalError(QuitSignalReason reason, bool consider_sigabrt)
 		FatalError = true;
 
 	/* We now transit into a state of waiting for children to die */
-	if (pmState == PM_RECOVERY ||
-		pmState == PM_HOT_STANDBY ||
-		pmState == PM_RUN ||
-		pmState == PM_STOP_BACKENDS ||
-		pmState == PM_WAIT_XLOG_SHUTDOWN)
-		UpdatePMState(PM_WAIT_BACKENDS);
+	switch (pmState)
+	{
+		case PM_INIT:
+			/* shouldn't have any children */
+			Assert(false);
+			break;
+		case PM_STARTUP:
+			/* should have been handled in process_pm_child_exit */
+			Assert(false);
+			break;
+
+			/* wait for children to die */
+		case PM_RECOVERY:
+		case PM_HOT_STANDBY:
+		case PM_RUN:
+		case PM_STOP_BACKENDS:
+			UpdatePMState(PM_WAIT_BACKENDS);
+			break;
+
+		case PM_WAIT_BACKENDS:
+			/* there might be more backends to wait for */
+			break;
+
+		case PM_WAIT_XLOG_SHUTDOWN:
+		case PM_WAIT_XLOG_ARCHIVAL:
+
+			/*
+			 * Note that we switch *back* to PM_WAIT_BACKENDS here. This way
+			 * the PM_WAIT_BACKENDS && FatalError code in
+			 * PostmasterStateMachine does not have to be duplicated.
+			 *
+			 * XXX: This seems rather ugly, but it's not obvious if the
+			 * alternative is better.
+			 */
+			UpdatePMState(PM_WAIT_BACKENDS);
+			break;
+
+		case PM_WAIT_DEAD_END:
+		case PM_NO_CHILDREN:
+			break;
+	}
 
 	/*
 	 * .. and if this doesn't happen quickly enough, now the clock is ticking
@@ -2836,6 +2871,9 @@ PostmasterStateMachine(void)
 	 * PM_WAIT_BACKENDS, but we signal the processes first, before waiting for
 	 * them.  Treating it as a distinct pmState allows us to share this code
 	 * across multiple shutdown code paths.
+	 *
+	 * Note that HandleFatalError() switches to PM_WAIT_BACKENDS even if we
+	 * were, before the fatal error, in a "more advanced" state.
 	 */
 	if (pmState == PM_STOP_BACKENDS || pmState == PM_WAIT_BACKENDS)
 	{
@@ -2967,13 +3005,12 @@ PostmasterStateMachine(void)
 					 * We don't consult send_abort_for_crash here, as it's
 					 * unlikely that dumping cores would illuminate the reason
 					 * for checkpointer fork failure.
+					 *
+					 * XXX: Is it worth inventing a different PMQUIT value
+					 * that signals that the cluster is in a bad state,
+					 * without a process having crashed?
 					 */
-					FatalError = true;
-					UpdatePMState(PM_WAIT_DEAD_END);
-					ConfigurePostmasterWaitSet(false);
-
-					/* Kill the walsenders and archiver too */
-					SignalChildren(SIGQUIT, btmask_all_except(B_LOGGER));
+					HandleFatalError(PMQUIT_FOR_CRASH, false);
 				}
 			}
 		}
-- 
2.48.1.76.g4e746b1a31.dirty