From ffc1469819b28ff76e3290cd695e60511d94305a Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Fri, 6 May 2022 19:57:54 +0900
Subject: [PATCH v6] Correctly update control file at promotion

If the cluster ends recovery on a promotion (aka the control file shows
a state different from DB_IN_ARCHIVE_RECOVERY) while
CreateRestartPoint() is still processing, this function could miss
updating the control file but still recycle or remove past WAL
segments, causing a follow-up restart doing recovery to fail on a
missing checkpoint record (aka PANIC) because the redo LSN referred to
in the control file was located in a segment already gone.

This commit fixes the control file update so that the checkpoint
location and redo LSN always reflect what gets recycled, with the
minimum recovery point changed only if the cluster is still in archive
recovery.
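
As a toy illustration for reviewers (this is not PostgreSQL code; all
names below are invented), the problem boils down to the control file
falling behind WAL recycling once a concurrent promotion has moved the
state away from DB_IN_ARCHIVE_RECOVERY:

/*
 * Toy model (not PostgreSQL code, all names invented): why the control
 * file must keep its checkpoint/redo information in sync with WAL
 * recycling even after a concurrent promotion has moved the state away
 * from DB_IN_ARCHIVE_RECOVERY.  Segment numbers stand in for LSNs.
 */
#include <stdbool.h>
#include <stdio.h>

enum db_state { IN_ARCHIVE_RECOVERY, IN_PRODUCTION };

struct control_file
{
	enum db_state state;
	int		redo_seg;			/* segment holding the redo record */
	int		min_recovery_seg;	/* minimum recovery point */
};

static void
restartpoint(struct control_file *cf, int new_redo_seg, bool fixed)
{
	/*
	 * Old code: skip the whole control file update when the state is no
	 * longer IN_ARCHIVE_RECOVERY.  Fixed code: always advance the redo
	 * information if it is older, and keep only the minimum recovery
	 * point update conditional on archive recovery.
	 */
	if ((fixed || cf->state == IN_ARCHIVE_RECOVERY) &&
		cf->redo_seg < new_redo_seg)
		cf->redo_seg = new_redo_seg;

	if (cf->state == IN_ARCHIVE_RECOVERY &&
		cf->min_recovery_seg < new_redo_seg)
		cf->min_recovery_seg = new_redo_seg;

	/* WAL recycling happens regardless: segments before new_redo_seg go */
	printf("segments up to %d removed, control file redo in segment %d%s\n",
		   new_redo_seg - 1, cf->redo_seg,
		   cf->redo_seg < new_redo_seg ? " => PANIC at next crash recovery" : "");
}

int
main(void)
{
	/* promotion already happened while the restartpoint was running */
	struct control_file old_cf = {IN_PRODUCTION, 3, 3};
	struct control_file new_cf = {IN_PRODUCTION, 3, 3};

	restartpoint(&old_cf, 7, false);	/* old behavior: redo left behind */
	restartpoint(&new_cf, 7, true);		/* fixed behavior: redo in sync */
	return 0;
}

The real CreateRestartPoint() change below follows the same shape: the
checkpoint location and redo LSN are now advanced whenever they are
older, and only the minRecoveryPoint update remains conditional on
DB_IN_ARCHIVE_RECOVERY.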
---
 src/backend/access/transam/xlog.c             |  48 ++++----
 .../t/027_invalid_checkpoint_after_promote.pl | 113 ++++++++++++++++++
 2 files changed, 141 insertions(+), 20 deletions(-)
 create mode 100644 src/test/recovery/t/027_invalid_checkpoint_after_promote.pl

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 61cda56c6f..c9c0b3f90a 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -6921,6 +6921,9 @@ CreateRestartPoint(int flags)
 	XLogSegNo	_logSegNo;
 	TimestampTz xtime;
 
+	/* Concurrent checkpoint/restartpoint cannot happen */
+	Assert(!IsUnderPostmaster || MyBackendType == B_CHECKPOINTER);
+
 	/* Get a local copy of the last safe checkpoint record. */
 	SpinLockAcquire(&XLogCtl->info_lck);
 	lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
@@ -7013,30 +7016,33 @@ CreateRestartPoint(int flags)
 	 */
 	PriorRedoPtr = ControlFile->checkPointCopy.redo;
 
-	/*
-	 * Update pg_control, using current time.  Check that it still shows
-	 * DB_IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
-	 * this is a quick hack to make sure nothing really bad happens if somehow
-	 * we get here after the end-of-recovery checkpoint.
-	 */
+	/* Update pg_control, using current time */
 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
-		ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
+
+	/*
+	 * Check if the control file still shows an older checkpoint, and update
+	 * its information.
+	 */
+	if (ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
 	{
 		ControlFile->checkPoint = lastCheckPointRecPtr;
 		ControlFile->checkPointCopy = lastCheckPoint;
+	}
 
-		/*
-		 * Ensure minRecoveryPoint is past the checkpoint record.  Normally,
-		 * this will have happened already while writing out dirty buffers,
-		 * but not necessarily - e.g. because no buffers were dirtied.  We do
-		 * this because a backup performed in recovery uses minRecoveryPoint to
-		 * determine which WAL files must be included in the backup, and the
-		 * file (or files) containing the checkpoint record must be included,
-		 * at a minimum. Note that for an ordinary restart of recovery there's
-		 * no value in having the minimum recovery point any earlier than this
-		 * anyway, because redo will begin just after the checkpoint record.
-		 */
+	/*
+	 * Ensure minRecoveryPoint is past the checkpoint record while archive
+	 * recovery is still ongoing.  Normally, this will have happened already
+	 * while writing out dirty buffers, but not necessarily - e.g. because no
+	 * buffers were dirtied.  We do this because a base backup uses
+	 * minRecoveryPoint to determine which WAL files must be included in the
+	 * backup, and the file (or files) containing the checkpoint record must
+	 * be included, at a minimum.  Note that for an ordinary restart of
+	 * recovery there's no value in having the minimum recovery point any
+	 * earlier than this anyway, because redo will begin just after the
+	 * checkpoint record.
+	 */
+	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
+	{
 		if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
 		{
 			ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
@@ -7046,10 +7052,12 @@ CreateRestartPoint(int flags)
 			LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
 			LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
 		}
+
 		if (flags & CHECKPOINT_IS_SHUTDOWN)
 			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
-		UpdateControlFile();
 	}
+
+	UpdateControlFile();
 	LWLockRelease(ControlFileLock);
 
 	/*
diff --git a/src/test/recovery/t/027_invalid_checkpoint_after_promote.pl b/src/test/recovery/t/027_invalid_checkpoint_after_promote.pl
new file mode 100644
index 0000000000..15c8d0a01a
--- /dev/null
+++ b/src/test/recovery/t/027_invalid_checkpoint_after_promote.pl
@@ -0,0 +1,113 @@
+use strict;
+use warnings;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Time::HiRes qw(usleep nanosleep);
+use Test::More tests => 5;
+
+# initialize primary node
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(allows_streaming => 1);
+$node_primary->append_conf(
+	'postgresql.conf', q[
+checkpoint_timeout = 30s
+max_wal_size = 16GB
+log_checkpoints = on
+restart_after_crash = on
+]);
+$node_primary->start;
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+
+# set up a standby
+my $node_standby = PostgreSQL::Test::Cluster->new('standby1');
+$node_standby->init_from_backup($node_primary, $backup_name, has_streaming => 1);
+$node_standby->start;
+
+# dummy table for the upcoming tests.
+$node_primary->safe_psql('postgres', 'checkpoint');
+$node_primary->safe_psql('postgres', 'create table test (a int, b varchar(255))');
+
+# use a background psql to insert a large batch, just to make the checkpoint a little slow.
+my $psql_timeout = IPC::Run::timer(3600);
+my ($stdin, $stdout, $stderr) = ('', '', '');
+my $psql_primary = IPC::Run::start(
+	[ 'psql', '-XAtq', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d', $node_primary->connstr('postgres') ],
+	'<',
+	\$stdin,
+	'>',
+	\$stdout,
+	'2>',
+	\$stderr,
+	$psql_timeout);
+$stdin .= q[
+INSERT INTO test SELECT i, 'aaaaaaaaaaaaaaaaaaaaaaa' from generate_series(1, 100000000) as i;
+];
+$psql_primary->pump_nb();
+
+# wait until a restartpoint starts on the standby
+my $logstart = -s $node_standby->logfile;
+my $checkpoint_start = 0;
+for (my $i = 0; $i < 3000; $i++)
+{
+	my $log = slurp_file($node_standby->logfile, $logstart);
+	if ($log =~ m/restartpoint starting: time/)
+	{
+		$checkpoint_start = 1;
+		last;
+	}
+	usleep(100_000);
+}
+is($checkpoint_start, 1, 'restartpoint has started');
+
+# promote during restartpoint
+$node_primary->stop;
+$node_standby->promote;
+
+# wait until the restartpoint completes on the promoted standby
+$logstart = -s $node_standby->logfile;
+$checkpoint_start = 0;
+for (my $i = 0; $i < 3000; $i++)
+{
+	my $log = slurp_file($node_standby->logfile, $logstart);
+	if ($log =~ m/restartpoint complete/)
+	{
+		$checkpoint_start = 1;
+		last;
+	}
+	usleep(100_000);
+}
+is($checkpoint_start, 1, 'restartpoint has completed');
+
+# kill a backend with SIGKILL, which makes all backends restart. Note that the previous checkpoint has not completed.
+my ($killme_stdin, $killme_stdout, $killme_stderr) = ('', '', '');
+my $killme = IPC::Run::start(
+	[ 'psql', '-XAtq', '-v', 'ON_ERROR_STOP=1', '-f', '-', '-d', $node_standby->connstr('postgres') ],
+	'<',
+	\$killme_stdin,
+	'>',
+	\$killme_stdout,
+	'2>',
+	\$killme_stderr,
+	$psql_timeout);
+$killme_stdin .= q[
+SELECT pg_backend_pid();
+];
+$killme->pump until $killme_stdout =~ /[[:digit:]]+[\r\n]$/;
+my $pid = $killme_stdout;
+chomp($pid);
+my $ret = PostgreSQL::Test::Utils::system_log('pg_ctl', 'kill', 'KILL', $pid);
+is($ret, 0, 'killed process with KILL');
+
+# without the fix, crash recovery fails with PANIC: could not locate a valid checkpoint record; with the fix the server comes back up
+for (my $i = 0; $i < 30; $i++)
+{
+	($ret, $stdout, $stderr) = $node_standby->psql('postgres', 'select 1');
+	last if $ret == 0;
+	sleep(1);
+}
+is($ret, 0, "psql connect success");
+is($stdout, 1, "psql select 1");
+
+$psql_primary->finish;
+$killme->finish;
-- 
2.36.0

