From 147f616565bc652950bcd928a870cd3aa3dfb805 Mon Sep 17 00:00:00 2001
From: Jehan-Guillaume de Rorthais <jgdr@dalibo.com>
Date: Wed, 1 Apr 2020 16:36:06 +0200
Subject: [PATCH 1/2] Fix WAL retention during production crash recovery

During crash recovery of a production cluster with archive_mode=on,
XLogArchiveCheckDone() was considering the cluster as inRecovery
without archive_mode=always. Because of this non-arcived WAL and
related .ready files were recycled or removed.
---
 src/backend/access/transam/xlog.c        | 45 +++++++++++++++++++-----
 src/backend/access/transam/xlogarchive.c | 21 +++++++----
 src/include/access/xlog.h                | 23 ++++++++++++
 3 files changed, 73 insertions(+), 16 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index abf954ba39..a7a5ce6ef4 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -220,8 +220,8 @@ static TimeLineID receiveTLI = 0;
 static bool lastFullPageWrites;
 
 /*
- * Local copy of SharedRecoveryInProgress variable. True actually means "not
- * known, need to check the shared state".
+ * This is false when SharedRecoveryState is not ARCHIVE_RECOVERING.
+ * True actually means "not known, need to check the shared state".
  */
 static bool LocalRecoveryInProgress = true;
 
@@ -652,10 +652,10 @@ typedef struct XLogCtlData
 	TimeLineID	PrevTimeLineID;
 
 	/*
-	 * SharedRecoveryInProgress indicates if we're still in crash or archive
+	 * SharedRecoveryState indicates if we're either in crash or archive
 	 * recovery.  Protected by info_lck.
 	 */
-	bool		SharedRecoveryInProgress;
+	RecoveryState	SharedRecoveryState;
 
 	/*
 	 * SharedHotStandbyActive indicates if we allow hot standby queries to be
@@ -4407,6 +4407,11 @@ ReadRecord(XLogReaderState *xlogreader, int emode,
 				updateMinRecoveryPoint = true;
 
 				UpdateControlFile();
+
+				SpinLockAcquire(&XLogCtl->info_lck);
+				XLogCtl->SharedRecoveryState = ARCHIVE_RECOVERING;
+				SpinLockRelease(&XLogCtl->info_lck);
+
 				LWLockRelease(ControlFileLock);
 
 				CheckRecoveryConsistency();
@@ -5139,11 +5144,20 @@ XLOGShmemInit(void)
 	 * in additional info.)
 	 */
 	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
-	XLogCtl->SharedRecoveryInProgress = true;
 	XLogCtl->SharedHotStandbyActive = false;
 	XLogCtl->SharedPromoteIsTriggered = false;
 	XLogCtl->WalWriterSleeping = false;
 
+	switch (ControlFile->state)
+	{
+		case DB_SHUTDOWNED:
+		case DB_SHUTDOWNED_IN_RECOVERY:
+			XLogCtl->SharedRecoveryState = ARCHIVE_RECOVERING;
+			break;
+		default:
+			XLogCtl->SharedRecoveryState = CRASH_RECOVERING;
+	}
+
 	SpinLockInit(&XLogCtl->Insert.insertpos_lck);
 	SpinLockInit(&XLogCtl->info_lck);
 	SpinLockInit(&XLogCtl->ulsn_lck);
@@ -7884,7 +7898,7 @@ StartupXLOG(void)
 	ControlFile->time = (pg_time_t) time(NULL);
 
 	SpinLockAcquire(&XLogCtl->info_lck);
-	XLogCtl->SharedRecoveryInProgress = false;
+	XLogCtl->SharedRecoveryState = NOT_IN_RECOVERY;
 	SpinLockRelease(&XLogCtl->info_lck);
 
 	UpdateControlFile();
@@ -8030,7 +8044,7 @@ RecoveryInProgress(void)
 		 */
 		volatile XLogCtlData *xlogctl = XLogCtl;
 
-		LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
+		LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != NOT_IN_RECOVERY);
 
 		/*
 		 * Initialize TimeLineID and RedoRecPtr when we discover that recovery
@@ -8042,8 +8056,8 @@ RecoveryInProgress(void)
 		{
 			/*
 			 * If we just exited recovery, make sure we read TimeLineID and
-			 * RedoRecPtr after SharedRecoveryInProgress (for machines with
-			 * weak memory ordering).
+			 * RedoRecPtr after SharedRecoveryState (for machines with weak
+			 * memory ordering).
 			 */
 			pg_memory_barrier();
 			InitXLOGAccess();
@@ -8059,6 +8073,19 @@ RecoveryInProgress(void)
 	}
 }
 
+/*
+ * Returns current recovery state from shared memory.
+ *
+ * See details about RecoveryState in xlog.h
+ */
+RecoveryState
+GetRecoveryState(void)
+{
+	volatile XLogCtlData *xlogctl = XLogCtl;
+
+	return xlogctl->SharedRecoveryState;
+}
+
 /*
  * Is HotStandby active yet? This is only important in special backends
  * since normal backends won't ever be able to connect until this returns
diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c
index d62c12310a..b207827651 100644
--- a/src/backend/access/transam/xlogarchive.c
+++ b/src/backend/access/transam/xlogarchive.c
@@ -572,18 +572,25 @@ XLogArchiveCheckDone(const char *xlog)
 {
 	char		archiveStatusPath[MAXPGPATH];
 	struct stat stat_buf;
-	bool		inRecovery = RecoveryInProgress();
+	RecoveryState	recoveryState = GetRecoveryState();
+
+	/* The file is always deletable if archive_mode is "off". */
+	if ( ! XLogArchivingActive() )
+		return true;
 
 	/*
-	 * The file is always deletable if archive_mode is "off".  On standbys
-	 * archiving is disabled if archive_mode is "on", and enabled with
-	 * "always".  On a primary, archiving is enabled if archive_mode is "on"
-	 * or "always".
+	 * On standbys, the file is deletable if archive_mode is not
+	 * "always".
 	 */
-	if (!((XLogArchivingActive() && !inRecovery) ||
-		  (XLogArchivingAlways() && inRecovery)))
+	if ( recoveryState == ARCHIVE_RECOVERING && !XLogArchivingAlways() )
 		return true;
 
+	/*
+	 * We are either:
+	 * * a primary with archive_mode set to "on" or "always"
+	 * * a standby with archive_mode set to "always".
+	 */
+
 	/* First check for .done --- this means archiver is done with it */
 	StatusFilePath(archiveStatusPath, xlog, ".done");
 	if (stat(archiveStatusPath, &stat_buf) == 0)
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 7412caa5f2..601ae34dd8 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -165,6 +165,28 @@ typedef enum WalLevel
 	WAL_LEVEL_LOGICAL
 } WalLevel;
 
+/*
+ * Recovery state.
+ *
+ * There's two different reasons to recover WAL: when standby mode is requested
+ * or after a crash to catchup with consistency.
+ *
+ * CRASH_RECOVERING is set when recovering after a crash.
+ * ARCHIVE_RECOVERING is set when archive recovery or standby is requested.
+ * NOT_IN_RECOVERY means the cluster is in production.
+ *
+ * Note that even if archive recovery or standby is requested, recovery state
+ * starts first with CRASH_RECOVERING before switching to ARCHIVE_RECOVERING if:
+ * - the control file shows evidence the cluster crashed based on its previous state
+ * - there's no backup label file
+ */
+typedef enum RecoveryState
+{
+	NOT_IN_RECOVERY = 0, /* currently in production */
+	CRASH_RECOVERING,    /* recovering from a crash */
+	ARCHIVE_RECOVERING   /* recovering archives as requested */
+} RecoveryState;
+
 extern PGDLLIMPORT int wal_level;
 
 /* Is WAL archiving enabled (always or only while server is running normally)? */
@@ -279,6 +301,7 @@ extern const char *xlog_identify(uint8 info);
 extern void issue_xlog_fsync(int fd, XLogSegNo segno);
 
 extern bool RecoveryInProgress(void);
+extern RecoveryState GetRecoveryState(void);
 extern bool HotStandbyActive(void);
 extern bool HotStandbyActiveInReplay(void);
 extern bool XLogInsertAllowed(void);
-- 
2.20.1

