From a425f6f89afcf2bc9cf95b291bb17084bf79d21e Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Thu, 1 Aug 2024 18:40:20 +0300
Subject: [PATCH v3 09/12] Kill dead-end children when there's nothing else
 left

Previously, the postmaster would never try to kill dead-end child
processes, even if there was no other processes left. A dead-end
backend will eventually exit, when authentication_timeout expires, but
if a dead-end backend is the only thing that's preventing the server
from shutting down, it seems better to kill it immediately. It's
particularly important, if there was a bug in the early startup code
that prevented a dead-end child from timing out and exiting normally.

Includes a test for that case where a dead-end backend previously kept
the server from shutting down.
---
 src/backend/postmaster/postmaster.c     | 35 +++++++-------
 src/test/postmaster/meson.build         |  1 +
 src/test/postmaster/t/002_start_stop.pl | 64 +++++++++++++++++++++++++
 3 files changed, 81 insertions(+), 19 deletions(-)
 create mode 100644 src/test/postmaster/t/002_start_stop.pl

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 2e917c5ea9..bfadb995cb 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -409,9 +409,6 @@ static void sigquit_child(pid_t pid);
 static bool SignalSomeChildren(int signal, uint32 targetMask);
 static void TerminateChildren(int signal);
 
-#define SignalChildren(sig)		\
-	SignalSomeChildren(sig, BACKEND_TYPE_ALL & ~(1 << B_DEAD_END_BACKEND))
-
 static int	CountChildren(uint32 targetMask);
 static Backend *assign_backendlist_entry(void);
 static void LaunchMissingBackgroundProcesses(void);
@@ -1963,7 +1960,7 @@ process_pm_reload_request(void)
 		ereport(LOG,
 				(errmsg("received SIGHUP, reloading configuration files")));
 		ProcessConfigFile(PGC_SIGHUP);
-		SignalChildren(SIGHUP);
+		SignalSomeChildren(SIGHUP, BACKEND_TYPE_ALL & ~(1 << B_DEAD_END_BACKEND));
 		if (StartupPID != 0)
 			signal_child(StartupPID, SIGHUP);
 		if (BgWriterPID != 0)
@@ -2382,7 +2379,7 @@ process_pm_child_exit(void)
 				 * Waken walsenders for the last time. No regular backends
 				 * should be around anymore.
 				 */
-				SignalChildren(SIGUSR2);
+				SignalSomeChildren(SIGUSR2, BACKEND_TYPE_ALL & (1 << B_DEAD_END_BACKEND));
 
 				pmState = PM_SHUTDOWN_2;
 			}
@@ -2867,7 +2864,7 @@ PostmasterStateMachine(void)
 		 */
 		ForgetUnstartedBackgroundWorkers();
 
-		/* Signal all backend children except walsenders */
+		/* Signal all backend children except walsenders and dead-end backends */
 		SignalSomeChildren(SIGTERM,
 						   BACKEND_TYPE_ALL & ~(1 << B_WAL_SENDER | 1 << B_DEAD_END_BACKEND));
 		/* and the autovac launcher too */
@@ -2925,10 +2922,11 @@ PostmasterStateMachine(void)
 			if (Shutdown >= ImmediateShutdown || FatalError)
 			{
 				/*
-				 * Start waiting for dead_end children to die.  This state
-				 * change causes ServerLoop to stop creating new ones.
+				 * Stop any dead_end children and stop creating new ones.
 				 */
 				pmState = PM_WAIT_DEAD_END;
+				ConfigurePostmasterWaitSet(false);
+				SignalSomeChildren(SIGQUIT, 1 << B_DEAD_END_BACKEND);
 
 				/*
 				 * We already SIGQUIT'd the archiver and stats processes, if
@@ -2967,9 +2965,10 @@ PostmasterStateMachine(void)
 					 */
 					FatalError = true;
 					pmState = PM_WAIT_DEAD_END;
+					ConfigurePostmasterWaitSet(false);
 
-					/* Kill the walsenders and archiver too */
-					SignalChildren(SIGQUIT);
+					/* Kill the walsenders and archiver, too */
+					SignalSomeChildren(SIGQUIT, BACKEND_TYPE_ALL);
 					if (PgArchPID != 0)
 						signal_child(PgArchPID, SIGQUIT);
 				}
@@ -2987,15 +2986,14 @@ PostmasterStateMachine(void)
 		 */
 		if (PgArchPID == 0 && CountChildren(BACKEND_TYPE_ALL & ~(1 << B_DEAD_END_BACKEND)) == 0)
 		{
+			ConfigurePostmasterWaitSet(false);
+			SignalSomeChildren(SIGTERM, 1 << B_DEAD_END_BACKEND);
 			pmState = PM_WAIT_DEAD_END;
 		}
 	}
 
 	if (pmState == PM_WAIT_DEAD_END)
 	{
-		/* Don't allow any new socket connection events. */
-		ConfigurePostmasterWaitSet(false);
-
 		/*
 		 * PM_WAIT_DEAD_END state ends when the BackendList is entirely empty
 		 * (ie, no dead_end children remain), and the archiver is gone too.
@@ -3281,8 +3279,7 @@ sigquit_child(pid_t pid)
 }
 
 /*
- * Send a signal to the targeted children (but NOT special children;
- * dead_end children are never signaled, either XXX).
+ * Send a signal to the targeted children (but NOT special children).
  */
 static bool
 SignalSomeChildren(int signal, uint32 targetMask)
@@ -3313,8 +3310,8 @@ SignalSomeChildren(int signal, uint32 targetMask)
 		}
 
 		ereport(DEBUG4,
-				(errmsg_internal("sending signal %d to process %d",
-								 signal, (int) bp->pid)));
+				(errmsg_internal("sending signal %d to %s process %d",
+								 signal, GetBackendTypeDesc(bp->bkend_type), (int) bp->pid)));
 		signal_child(bp->pid, signal);
 		signaled = true;
 	}
@@ -3323,12 +3320,12 @@ SignalSomeChildren(int signal, uint32 targetMask)
 
 /*
  * Send a termination signal to children.  This considers all of our children
- * processes, except syslogger and dead_end backends.
+ * processes, except syslogger.
  */
 static void
 TerminateChildren(int signal)
 {
-	SignalChildren(signal);
+	SignalSomeChildren(signal, BACKEND_TYPE_ALL);
 	if (StartupPID != 0)
 	{
 		signal_child(StartupPID, signal);
diff --git a/src/test/postmaster/meson.build b/src/test/postmaster/meson.build
index c2de2e0eb5..2d89adf520 100644
--- a/src/test/postmaster/meson.build
+++ b/src/test/postmaster/meson.build
@@ -7,6 +7,7 @@ tests += {
   'tap': {
     'tests': [
       't/001_connection_limits.pl',
+      't/002_start_stop.pl',
     ],
   },
 }
diff --git a/src/test/postmaster/t/002_start_stop.pl b/src/test/postmaster/t/002_start_stop.pl
new file mode 100644
index 0000000000..6f114659fa
--- /dev/null
+++ b/src/test/postmaster/t/002_start_stop.pl
@@ -0,0 +1,64 @@
+
+# Copyright (c) 2021-2024, PostgreSQL Global Development Group
+
+# XXX
+# XXX
+
+use IO::Socket::INET;
+use IO::Socket::UNIX;
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+use Time::HiRes qw(time);
+
+# Initialize the server with low connection limits, to test dead-end backends
+my $node = PostgreSQL::Test::Cluster->new('primary');
+$node->init;
+$node->append_conf('postgresql.conf', "max_connections = 5");
+$node->append_conf('postgresql.conf', "log_connections = on");
+$node->append_conf('postgresql.conf', "log_min_messages = debug2");
+
+# XX
+$node->append_conf('postgresql.conf', "authentication_timeout = '120 s'");
+
+$node->start;
+
+my @sessions = ();
+my @raw_connections = ();
+
+#for (my $i=0; $i <= 5; $i++) {
+#	push(@sessions, $node->background_psql('postgres', on_error_die => 1));
+#}
+#$node->connect_fails("dbname=postgres", "max_connections reached",
+#					 expected_stderr => qr/FATAL:  sorry, too many clients already/);
+
+# We can still open TCP (or Unix domain socket) connections, but beyond a
+# certain number (roughly 2x max_connections), they will be "dead-end backends"
+for (my $i = 0; $i <= 20; $i++)
+{
+	push(@raw_connections, $node->raw_connect());
+}
+
+# Test that the dead-end backends don't prevent the server from stopping.
+my $before = time();
+$node->stop();
+my $elapsed = time() - $before;
+ok($elapsed < 60);
+
+$node->start();
+
+$node->connect_ok("dbname=postgres", "works after restart");
+
+# Clean up
+foreach my $session (@sessions)
+{
+	$session->quit;
+}
+foreach my $socket (@raw_connections)
+{
+	$socket->close();
+}
+
+done_testing();
-- 
2.39.2

