From cbe9310cc2f2e065708e7e9e05cf6a94043d484d Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Mon, 20 Apr 2020 16:30:32 +0900
Subject: [PATCH v8] Fix handling of .ready files during crash recovery

78ea8b5 introduced a fix to correctly recycle WAL segments on a standby
when archive_mode = always, however it has introduced a regression with
the handling of .ready files during crash recovery of a primary, causing
those files to be removed.

A set of TAP tests is added to close the gap here, making sure that
archive status files are correctly handled when a cluster is in archive
or crash recovery with archive_mode set to "on" or "always, for both
standbys and primarys.

Reported-by: ???
Author: Jehan-Guillaume de Rorthais
Reviewed-by: Kyotaro Horiguchi, Fujii Masao, Michael Paquier
Backpatch-through: 9.5
---
 src/include/access/xlog.h                 |   9 +
 src/backend/access/transam/xlog.c         |  57 +++++-
 src/backend/access/transam/xlogarchive.c  |  21 +-
 src/test/recovery/t/020_archive_status.pl | 225 ++++++++++++++++++++++
 src/tools/pgindent/typedefs.list          |   1 +
 5 files changed, 297 insertions(+), 16 deletions(-)
 create mode 100644 src/test/recovery/t/020_archive_status.pl

diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index f60ed2d36c..0a12afb59e 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -166,6 +166,14 @@ typedef enum WalLevel
 	WAL_LEVEL_LOGICAL
 } WalLevel;
 
+/* Recovery states */
+typedef enum RecoveryState
+{
+	RECOVERY_STATE_CRASH = 0,	/* crash recovery */
+	RECOVERY_STATE_ARCHIVE,		/* archive recovery */
+	RECOVERY_STATE_DONE			/* currently in production */
+} RecoveryState;
+
 extern PGDLLIMPORT int wal_level;
 
 /* Is WAL archiving enabled (always or only while server is running normally)? */
@@ -291,6 +299,7 @@ extern const char *xlog_identify(uint8 info);
 extern void issue_xlog_fsync(int fd, XLogSegNo segno);
 
 extern bool RecoveryInProgress(void);
+extern RecoveryState GetRecoveryState(void);
 extern bool HotStandbyActive(void);
 extern bool HotStandbyActiveInReplay(void);
 extern bool XLogInsertAllowed(void);
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 11e32733c4..009a071276 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -221,8 +221,9 @@ static TimeLineID receiveTLI = 0;
 static bool lastFullPageWrites;
 
 /*
- * Local copy of SharedRecoveryInProgress variable. True actually means "not
- * known, need to check the shared state".
+ * Local copy of the state tracked by SharedRecoveryState in shared memory,
+ * It is false if SharedRecoveryState is RECOVERY_STATE_DONE.  True actually
+ * means "not known, need to check the shared state".
  */
 static bool LocalRecoveryInProgress = true;
 
@@ -653,10 +654,10 @@ typedef struct XLogCtlData
 	TimeLineID	PrevTimeLineID;
 
 	/*
-	 * SharedRecoveryInProgress indicates if we're still in crash or archive
+	 * SharedRecoveryState indicates if we're still in crash or archive
 	 * recovery.  Protected by info_lck.
 	 */
-	bool		SharedRecoveryInProgress;
+	RecoveryState SharedRecoveryState;
 
 	/*
 	 * SharedHotStandbyActive indicates if we allow hot standby queries to be
@@ -4434,6 +4435,16 @@ ReadRecord(XLogReaderState *xlogreader, int emode,
 				updateMinRecoveryPoint = true;
 
 				UpdateControlFile();
+
+				/*
+				 * We update SharedRecoveryState while holding the lock on
+				 * ControlFileLock so both states are consistent in shared
+				 * memory.
+				 */
+				SpinLockAcquire(&XLogCtl->info_lck);
+				XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
+				SpinLockRelease(&XLogCtl->info_lck);
+
 				LWLockRelease(ControlFileLock);
 
 				CheckRecoveryConsistency();
@@ -5166,7 +5177,7 @@ XLOGShmemInit(void)
 	 * in additional info.)
 	 */
 	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
-	XLogCtl->SharedRecoveryInProgress = true;
+	XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
 	XLogCtl->SharedHotStandbyActive = false;
 	XLogCtl->SharedPromoteIsTriggered = false;
 	XLogCtl->WalWriterSleeping = false;
@@ -6871,7 +6882,13 @@ StartupXLOG(void)
 		 */
 		dbstate_at_startup = ControlFile->state;
 		if (InArchiveRecovery)
+		{
 			ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
+
+			SpinLockAcquire(&XLogCtl->info_lck);
+			XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
+			SpinLockRelease(&XLogCtl->info_lck);
+		}
 		else
 		{
 			ereport(LOG,
@@ -6884,6 +6901,10 @@ StartupXLOG(void)
 								ControlFile->checkPointCopy.ThisTimeLineID,
 								recoveryTargetTLI)));
 			ControlFile->state = DB_IN_CRASH_RECOVERY;
+
+			SpinLockAcquire(&XLogCtl->info_lck);
+			XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
+			SpinLockRelease(&XLogCtl->info_lck);
 		}
 		ControlFile->checkPoint = checkPointLoc;
 		ControlFile->checkPointCopy = checkPoint;
@@ -7911,7 +7932,7 @@ StartupXLOG(void)
 	ControlFile->time = (pg_time_t) time(NULL);
 
 	SpinLockAcquire(&XLogCtl->info_lck);
-	XLogCtl->SharedRecoveryInProgress = false;
+	XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
 	SpinLockRelease(&XLogCtl->info_lck);
 
 	UpdateControlFile();
@@ -8057,7 +8078,7 @@ RecoveryInProgress(void)
 		 */
 		volatile XLogCtlData *xlogctl = XLogCtl;
 
-		LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
+		LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);
 
 		/*
 		 * Initialize TimeLineID and RedoRecPtr when we discover that recovery
@@ -8069,8 +8090,8 @@ RecoveryInProgress(void)
 		{
 			/*
 			 * If we just exited recovery, make sure we read TimeLineID and
-			 * RedoRecPtr after SharedRecoveryInProgress (for machines with
-			 * weak memory ordering).
+			 * RedoRecPtr after SharedRecoveryState (for machines with weak
+			 * memory ordering).
 			 */
 			pg_memory_barrier();
 			InitXLOGAccess();
@@ -8086,6 +8107,24 @@ RecoveryInProgress(void)
 	}
 }
 
+/*
+ * Returns current recovery state from shared memory.
+ *
+ * This returned state is kept consistent with the contents of the control
+ * file.  See details about the possible values of RecoveryState in xlog.h.
+ */
+RecoveryState
+GetRecoveryState(void)
+{
+	RecoveryState retval;
+
+	SpinLockAcquire(&XLogCtl->info_lck);
+	retval = XLogCtl->SharedRecoveryState;
+	SpinLockRelease(&XLogCtl->info_lck);
+
+	return retval;
+}
+
 /*
  * Is HotStandby active yet? This is only important in special backends
  * since normal backends won't ever be able to connect until this returns
diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c
index d62c12310a..55becd65d4 100644
--- a/src/backend/access/transam/xlogarchive.c
+++ b/src/backend/access/transam/xlogarchive.c
@@ -572,18 +572,25 @@ XLogArchiveCheckDone(const char *xlog)
 {
 	char		archiveStatusPath[MAXPGPATH];
 	struct stat stat_buf;
-	bool		inRecovery = RecoveryInProgress();
+
+	/* The file is always deletable if archive_mode is "off". */
+	if (!XLogArchivingActive())
+		return true;
 
 	/*
-	 * The file is always deletable if archive_mode is "off".  On standbys
-	 * archiving is disabled if archive_mode is "on", and enabled with
-	 * "always".  On a primary, archiving is enabled if archive_mode is "on"
-	 * or "always".
+	 * During archive recovery, the file is deletable if archive_mode is not
+	 * "always".
 	 */
-	if (!((XLogArchivingActive() && !inRecovery) ||
-		  (XLogArchivingAlways() && inRecovery)))
+	if (!XLogArchivingAlways() &&
+		GetRecoveryState() == RECOVERY_STATE_ARCHIVE)
 		return true;
 
+	/*
+	 * At this point of the logic, note that we are either a primary with
+	 * archive_mode set to "on" or "always", or a standby with archive_mode
+	 * set to "always".
+	 */
+
 	/* First check for .done --- this means archiver is done with it */
 	StatusFilePath(archiveStatusPath, xlog, ".done");
 	if (stat(archiveStatusPath, &stat_buf) == 0)
diff --git a/src/test/recovery/t/020_archive_status.pl b/src/test/recovery/t/020_archive_status.pl
new file mode 100644
index 0000000000..688014862a
--- /dev/null
+++ b/src/test/recovery/t/020_archive_status.pl
@@ -0,0 +1,225 @@
+#
+# Tests related to WAL archiving and recovery.
+#
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use Test::More;
+use Config;
+
+if ($Config{osname} eq 'MSWin32')
+{
+
+	# some Windows Perls at least don't like IPC::Run's start/kill_kill regime.
+	plan skip_all => "Test fails on Windows perl";
+}
+else
+{
+	plan tests => 16;
+}
+
+my $primary = get_new_node('master');
+$primary->init(
+	has_archiving    => 1,
+	allows_streaming => 1);
+$primary->append_conf('postgresql.conf', 'autovacuum = off');
+$primary->start;
+my $primary_data = $primary->data_dir;
+
+# Temporarily use an archive_command value to make the archiver fail,
+# knowing that archiving is enabled.  Note that we cannot use a command
+# that does not exist as in this case the archiver process would just exit
+# without reporting the failure to pg_stat_archiver.  This also cannot
+# use a plain "false" as that's unportable on Windows.  So, instead, as
+# a portable solution, use an archive command that is known to work but
+# will fail: a copy with an incorrect original path.
+my $incorrect_command =
+  $TestLib::windows_os
+  ? qq{copy "%p_does_not_exist" "%f_does_not_exist"}
+  : qq{cp "%p_does_not_exist" "%f_does_not_exist"};
+$primary->safe_psql(
+	'postgres', qq{
+    ALTER SYSTEM SET archive_command TO '$incorrect_command';
+    SELECT pg_reload_conf();
+});
+
+# Save the WAL segment currently in use and switch to a new segment.
+# This will be used to track the activity of the archiver.
+my $segment_name_1 = $primary->safe_psql('postgres',
+	q{SELECT pg_walfile_name(pg_current_wal_lsn())});
+my $segment_path_1       = "pg_wal/archive_status/$segment_name_1";
+my $segment_path_1_ready = "$segment_path_1.ready";
+my $segment_path_1_done  = "$segment_path_1.done";
+$primary->safe_psql(
+	'postgres', q{
+	CREATE TABLE mine AS SELECT generate_series(1,10) AS x;
+	SELECT pg_switch_wal();
+	CHECKPOINT;
+});
+
+# Wait for an archive failure.
+$primary->poll_query_until('postgres',
+	q{SELECT failed_count > 0 FROM pg_stat_archiver}, 't')
+  or die "Timed out while waiting for archiving to fail";
+ok( -f "$primary_data/$segment_path_1_ready",
+	".ready file exists for WAL segment $segment_name_1 waiting to be archived"
+);
+ok( !-f "$primary_data/$segment_path_1_done",
+	".done file does not exist for WAL segment $segment_name_1 waiting to be archived"
+);
+
+is( $primary->safe_psql(
+		'postgres', q{
+		SELECT archived_count, last_failed_wal
+		FROM pg_stat_archiver
+	}),
+	"0|$segment_name_1",
+	'pg_stat_archiver failed to archive $segment_name_1');
+
+# We need to crash the cluster because next test checks the crash
+# recovery step do not removes non-archived WAL.
+$primary->stop('immediate');
+
+# Recovery tests for the archiving with a standby partially check
+# the recovery behavior when restoring a backup taken using a
+# snapshot with no pg_start/stop_backup.  In this situation,
+# the recovered standby should enter first crash recovery then
+# switch to regular archive recovery.  Note that the base backup
+# is taken here so as archive_command will fail.  This is necessary
+# for the assumptions of the tests done with the standbys below.
+$primary->backup_fs_cold('backup');
+
+$primary->start;
+ok( -f "$primary_data/$segment_path_1_ready",
+	".ready file for WAL segment $segment_name_1 still exists after crash recovery on primary"
+);
+
+# Allow WAL archiving again and wait for a success.
+$primary->safe_psql(
+	'postgres', q{
+	ALTER SYSTEM RESET archive_command;
+	SELECT pg_reload_conf();
+});
+
+$primary->poll_query_until('postgres',
+	q{SELECT archived_count FROM pg_stat_archiver}, '1')
+  or die "Timed out while waiting for archiving to finish";
+
+ok(!-f "$primary_data/$segment_path_1_ready",
+	".ready file for archived WAL segment $segment_name_1 removed");
+
+ok(-f "$primary_data/$segment_path_1_done",
+	".done file for archived WAL segment $segment_name_1 exists");
+
+is( $primary->safe_psql(
+		'postgres', q{ SELECT last_archived_wal FROM pg_stat_archiver }),
+	$segment_name_1,
+	'archive success reported in pg_stat_archiver for WAL segment $segment_name_1'
+);
+
+# Create some WAL activity and a new checkpoint so as the next standby can
+# create a restartpoint.  As this standby starts in crash recovery because
+# of the cold backup taken previously, it needs a clean restartpoint to deal
+# with existing status files.
+my $segment_name_2 = $primary->safe_psql('postgres',
+	q{SELECT pg_walfile_name(pg_current_wal_lsn())});
+my $segment_path_2       = "pg_wal/archive_status/$segment_name_2";
+my $segment_path_2_ready = "$segment_path_2.ready";
+my $segment_path_2_done  = "$segment_path_2.done";
+$primary->safe_psql(
+	'postgres', q{
+	INSERT INTO mine SELECT generate_series(10,20) AS x;
+	SELECT pg_switch_wal();
+	CHECKPOINT;
+});
+
+$primary->poll_query_until('postgres',
+	q{ SELECT last_archived_wal FROM pg_stat_archiver },
+	$segment_name_2)
+  or die "Timed out while waiting for archiving to finish";
+
+# Test standby with archive_mode = on.
+my $standby1 = get_new_node('standby');
+$standby1->init_from_backup($primary, 'backup', has_restoring => 1);
+$standby1->append_conf('postgresql.conf', "archive_mode = on");
+my $standby1_data = $standby1->data_dir;
+$standby1->start;
+$standby1->safe_psql('postgres', q{CHECKPOINT});
+
+# Recovery with archive_mode=on does not keep .ready signal files inherited
+# from backup.  Note that this WAL segment existed in the backup.
+ok( !-f "$standby1_data/$segment_path_1_ready",
+	".ready file for WAL segment $segment_name_1 present in backup got removed with archive_mode=on on standby"
+);
+
+# Recovery with archive_mode=on should not create .ready files.
+# Note that this segment did not exist in the backup.
+ok( !-f "$standby1_data/$segment_path_2_ready",
+	".ready file for WAL segment $segment_name_2 not created on standby when archive_mode=on on standby"
+);
+
+# Recovery with archive_mode = on creates .done files.
+ok( -f "$standby1_data/$segment_path_2_done",
+	".done file for WAL segment $segment_name_2 created when archive_mode=on on standby"
+);
+
+# Test recovery with archive_mode = always, which should always keep
+# .ready files if archiving is enabled, though here we want the archive
+# command to fail to persist the .ready files.  Note that this node
+# has inherited the archive command of the previous cold backup, that
+# will cause archiving failures.
+my $standby2 = get_new_node('standby2');
+$standby2->init_from_backup($primary, 'backup', has_restoring => 1);
+$standby2->append_conf('postgresql.conf', 'archive_mode = always');
+my $standby2_data = $standby2->data_dir;
+$standby2->start;
+
+$standby2->safe_psql('postgres', q{CHECKPOINT});
+
+ok( -f "$standby2_data/$segment_path_1_ready",
+	".ready file for WAL segment $segment_name_1 existing in backup is kept with archive_mode=always on standby"
+);
+
+ok( -f "$standby2_data/$segment_path_2_ready",
+	".ready file for WAL segment $segment_name_2 created with archive_mode=always on standby"
+);
+
+# Reset statistics of the archiver for the next checks.
+$standby2->safe_psql('postgres', q{SELECT pg_stat_reset_shared('archiver')});
+
+# Now crash the cluster to check that recovery step does not
+# remove non-archived WAL segments on a standby where archiving
+# is enabled.
+$standby2->stop('immediate');
+$standby2->start;
+
+ok( -f "$standby2_data/$segment_path_1_ready",
+	"WAL segment still ready to archive after crash recovery on standby with archive_mode=always"
+);
+
+# Allow WAL archiving again, and wait for the segments to be archived.
+$standby2->safe_psql(
+	'postgres', q{
+	ALTER SYSTEM RESET archive_command;
+	SELECT pg_reload_conf();
+});
+$standby2->poll_query_until('postgres',
+	q{SELECT last_archived_wal FROM pg_stat_archiver},
+	$segment_name_2)
+  or die "Timed out while waiting for archiving to finish";
+
+is( $standby2->safe_psql(
+		'postgres', q{SELECT archived_count FROM pg_stat_archiver}),
+	'2',
+	'correct number of WAL segments archived from standby');
+
+ok( !-f "$standby2_data/$segment_path_1_ready"
+	  && !-f "$standby2_data/$segment_path_2_ready",
+	".ready files removed after archive success with archive_mode=always on standby"
+);
+
+ok( -f "$standby2_data/$segment_path_1_done"
+	  && -f "$standby2_data/$segment_path_2_done",
+	".done files created after archive success with archive_mode=always on standby"
+);
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 525d58e7f0..b319266bb3 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1983,6 +1983,7 @@ RecordCacheEntry
 RecordCompareData
 RecordIOData
 RecoveryLockListsEntry
+RecoveryState
 RecoveryTargetTimeLineGoal
 RecoveryTargetType
 RectBox
-- 
2.26.0

