>From 1522d6875e22d226a9cc9f3e73db93a60fefaf45 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Wed, 27 Nov 2013 15:24:20 +0100
Subject: [PATCH 2/3] Truncate pg_multixact/'s contents during crash recovery.

9dc842f0832fd71eda826349a0c17ecf8ae93b84 prevented multixact
truncation during crash recovery because it wasn't guaranteed that
enough state was setup and because it wasn't deemed to be a good idea
anyway due to possible information loss. Since then due to
Hot-Standby, Streaming Rep. and PITR the amount of time a cluster can
spend in crash recovery has increased significantly to the point that
a cluster often will never come out of it. That makes not truncating
the content of pg_multixact/ not really feasible anymore.

So, take care to setup enough state for multixact truncation before
crash recovery starts - easy since checkpoints contain the required
information - and move the current end-of-recovery actions to a new
TrimMultiXact() function, analogous to TrimCLOG().

At some later point this should probably done similarly to the way
clog.c is doing it, which is to just WAL log truncations, but we can't
do that for the backbranches.
---
 src/backend/access/transam/multixact.c | 69 ++++++++++++++++++++++++++++------
 src/backend/access/transam/xlog.c      | 10 +++--
 src/include/access/multixact.h         |  1 +
 3 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 3dff1bb..9e071b4 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -1768,11 +1768,8 @@ MaybeExtendOffsetSlru(void)
  *
  * StartupXLOG has already established nextMXact/nextOffset by calling
  * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact, and the oldestMulti
- * info from pg_control and/or MultiXactAdvanceOldest.	Note that we may
- * already have replayed WAL data into the SLRU files.
- *
- * We don't need any locks here, really; the SLRU locks are taken
- * only because slru.c expects to be called with locks held.
+ * info from pg_control and/or MultiXactAdvanceOldest, but we haven't yet
+ * replayed WAL.
  */
 void
 StartupMultiXact(void)
@@ -1780,12 +1777,40 @@ StartupMultiXact(void)
 	MultiXactId multi = MultiXactState->nextMXact;
 	MultiXactOffset offset = MultiXactState->nextOffset;
 	int			pageno;
+
+	/*
+	 * Initialize our idea of the latest page number.
+	 */
+	pageno = MultiXactIdToOffsetPage(multi);
+	MultiXactOffsetCtl->shared->latest_page_number = pageno;
+
+	/*
+	 * Initialize our idea of the latest page number.
+	 */
+	pageno = MXOffsetToMemberPage(offset);
+	MultiXactMemberCtl->shared->latest_page_number = pageno;
+}
+
+/*
+ * This must be called ONCE at the end of startup/recovery.
+ *
+ * We don't need any locks here, really; the SLRU locks are taken only because
+ * slru.c expects to be called with locks held.
+ */
+void
+TrimMultiXact(void)
+{
+	MultiXactId multi = MultiXactState->nextMXact;
+	MultiXactOffset offset = MultiXactState->nextOffset;
+	int			pageno;
 	int			entryno;
 	int			flagsoff;
 
 	/*
 	 * During a binary upgrade, make sure that the offsets SLRU is large
-	 * enough to contain the next value that would be created.
+	 * enough to contain the next value that would be created. It's fine to do
+	 * this here and not in StartupMultiXact() since binary upgrades should
+	 * never need crash recovery.
 	 */
 	if (IsBinaryUpgrade)
 		MaybeExtendOffsetSlru();
@@ -1794,7 +1819,7 @@ StartupMultiXact(void)
 	LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
 
 	/*
-	 * Initialize our idea of the latest page number.
+	 * (Re-)Initialize our idea of the latest page number.
 	 */
 	pageno = MultiXactIdToOffsetPage(multi);
 	MultiXactOffsetCtl->shared->latest_page_number = pageno;
@@ -1824,7 +1849,7 @@ StartupMultiXact(void)
 	LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
 
 	/*
-	 * Initialize our idea of the latest page number.
+	 * (Re-)Initialize our idea of the latest page number.
 	 */
 	pageno = MXOffsetToMemberPage(offset);
 	MultiXactMemberCtl->shared->latest_page_number = pageno;
@@ -1906,6 +1931,24 @@ CheckPointMultiXact(void)
 	SimpleLruFlush(MultiXactOffsetCtl, true);
 	SimpleLruFlush(MultiXactMemberCtl, true);
 
+	/*
+	 * Due to an historical accident multixact truncations are not WAL-logged,
+	 * but just performed everytime the mxact horizon is increased. So, unless
+	 * we explicitly execute truncations on a standby it will never clean out
+	 * /pg_multixact which obviously is bad, both because it uses space and
+	 * because we can wrap around into pre-existing data...
+	 *
+	 * It's probably worth improving this.
+	 */
+	if (RecoveryInProgress())
+	{
+		/*
+		 * ->latest_page_number will already have been initialized by
+		 * StartupMultiXact() and maintained when zero-ing new pages.
+		 */
+		TruncateMultiXact(MultiXactState->oldestMultiXactId);
+	}
+
 	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
 }
 
@@ -2257,9 +2300,13 @@ SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int segpage, void *data)
  * Remove all MultiXactOffset and MultiXactMember segments before the oldest
  * ones still of interest.
  *
- * This is called by vacuum after it has successfully advanced a database's
- * datminmxid value; the cutoff value we're passed is the minimum of all
- * databases' datminmxid values.
+ * On a primary this is called by vacuum after it has successfully advanced a
+ * database's datminmxid value; the cutoff value we're passed is the minimum
+ * of all databases' datminmxid values.
+ *
+ * During crash recovery we call it from CheckPointMultiXact() instead and we
+ * rely on the fact that xlog_redo() will already have called
+ * MultiXactAdvanceOldest().
  */
 void
 TruncateMultiXact(MultiXactId oldestMXact)
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index de19d22..5a261c6 100755
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -6332,6 +6332,9 @@ StartupXLOG(void)
 	XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
 	XLogCtl->ckptXid = checkPoint.nextXid;
 
+	/* startup MultiXact, so we can perform truncations in restartpoints */
+	StartupMultiXact();
+
 	/*
 	 * Initialize unlogged LSN. On a clean shutdown, it's restored from the
 	 * control file. On recovery, all unlogged relations are blown away, so
@@ -6532,8 +6535,9 @@ StartupXLOG(void)
 			ProcArrayInitRecovery(ShmemVariableCache->nextXid);
 
 			/*
-			 * Startup commit log and subtrans only. Other SLRUs are not
-			 * maintained during recovery and need not be started yet.
+			 * Startup commit log and subtrans only. MultiXact has already
+			 * been started up and other SLRUs are not maintained during
+			 * recovery and need not be started yet.
 			 */
 			StartupCLOG();
 			StartupSUBTRANS(oldestActiveXID);
@@ -7197,8 +7201,8 @@ StartupXLOG(void)
 	/*
 	 * Perform end of recovery actions for any SLRUs that need it.
 	 */
-	StartupMultiXact();
 	TrimCLOG();
+	TrimMultiXact();
 
 	/* Reload shared-memory state for prepared transactions */
 	RecoverPreparedTransactions();
diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h
index aa5a78c..348f84e 100644
--- a/src/include/access/multixact.h
+++ b/src/include/access/multixact.h
@@ -97,6 +97,7 @@ extern Size MultiXactShmemSize(void);
 extern void MultiXactShmemInit(void);
 extern void BootStrapMultiXact(void);
 extern void StartupMultiXact(void);
+extern void TrimMultiXact(void);
 extern void ShutdownMultiXact(void);
 extern void SetMultiXactIdLimit(MultiXactId oldest_datminmxid,
 					Oid oldest_datoid);
-- 
1.8.5.rc2.dirty

