From fe0401335e0ac1a9ae36dbdb5c2db82f9081a1a3 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyotaro@lab.ntt.co.jp>
Date: Fri, 6 Apr 2018 13:57:48 +0900
Subject: [PATCH] Change FPW handling

The GUC full_pages_writes currently has an effect immediately. That
makes a race condition between config reload on checkpointer and
StartupXLOG. But since full page images are meaningful only when they
are attached to all WAL records covers a checkpoint, there is no
problem if we update the shared FPW only at REDO point.  On the other
hand, online backup mechanism on standby requires to know if FPW is
turned off before the next checkpoint record comes.

As the result, with this patch, changing of full_page_writes takes
effect at REDO point and additional XLOG_FPW_CHANGE is written only
for turning-off. These are sufficient for standby-backup to work
properly, reduces complexity and prevent the race condition.
---
 doc/src/sgml/config.sgml              |   4 +-
 src/backend/access/transam/xlog.c     | 114 +++++++++-------------------------
 src/backend/postmaster/checkpointer.c |   6 --
 src/include/access/xlog.h             |   1 -
 src/include/catalog/pg_control.h      |   2 +-
 5 files changed, 34 insertions(+), 93 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 5d5f2d23c4..7ea42c25e2 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2598,7 +2598,9 @@ include_dir 'conf.d'
        <para>
         This parameter can only be set in the <filename>postgresql.conf</filename>
         file or on the server command line.
-        The default is <literal>on</literal>.
+
+        The default is <literal>on</literal>. The change of the parmeter takes
+        effect at the next checkpoint time.
        </para>
       </listitem>
      </varlistentry>
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 4a47395174..ba9cf07d38 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -202,14 +202,6 @@ static XLogRecPtr LastRec;
 static XLogRecPtr receivedUpto = 0;
 static TimeLineID receiveTLI = 0;
 
-/*
- * During recovery, lastFullPageWrites keeps track of full_page_writes that
- * the replayed WAL records indicate. It's initialized with full_page_writes
- * that the recovery starting checkpoint record indicates, and then updated
- * each time XLOG_FPW_CHANGE record is replayed.
- */
-static bool lastFullPageWrites;
-
 /*
  * Local copy of SharedRecoveryInProgress variable. True actually means "not
  * known, need to check the shared state".
@@ -6851,11 +6843,7 @@ StartupXLOG(void)
 	 */
 	restoreTwoPhaseData();
 
-	lastFullPageWrites = checkPoint.fullPageWrites;
-
 	RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
-	doPageWrites = lastFullPageWrites;
-
 	if (RecPtr < checkPoint.redo)
 		ereport(PANIC,
 				(errmsg("invalid redo in checkpoint record")));
@@ -7650,16 +7638,6 @@ StartupXLOG(void)
 	/* Pre-scan prepared transactions to find out the range of XIDs present */
 	oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
 
-	/*
-	 * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
-	 * record before resource manager writes cleanup WAL records or checkpoint
-	 * record is written.
-	 */
-	Insert->fullPageWrites = lastFullPageWrites;
-	LocalSetXLogInsertAllowed();
-	UpdateFullPageWrites();
-	LocalXLogInsertAllowed = -1;
-
 	if (InRecovery)
 	{
 		/*
@@ -7893,6 +7871,13 @@ StartupXLOG(void)
 	ControlFile->state = DB_IN_PRODUCTION;
 	ControlFile->time = (pg_time_t) time(NULL);
 
+	/*
+	 * Set the initial value of shared fullPageWrites. Once
+	 * SharedRecoveryInProgress is turned false, checkpointer will update this
+	 * value.
+	 */
+	XLogCtl->Insert.fullPageWrites = checkPoint.fullPageWrites;
+
 	SpinLockAcquire(&XLogCtl->info_lck);
 	XLogCtl->SharedRecoveryInProgress = false;
 	SpinLockRelease(&XLogCtl->info_lck);
@@ -8754,6 +8739,20 @@ CreateCheckPoint(int flags)
 	 */
 	last_important_lsn = GetLastImportantRecPtr();
 
+	/*
+	 * If full_page_writes has been turned off, issue XLOG_FPW_CHANGE before
+	 * the flag actually takes effect. No lock is required since checkpointer
+	 * is the only updator of shared fullPageWrites after recovery is
+	 * finished. Both shared and local fullPageWrites do not change before the
+	 * next reading below.
+	 */
+	if (Insert->fullPageWrites && !fullPageWrites)
+	{
+		XLogBeginInsert();
+		XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));
+		XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
+	}
+
 	/*
 	 * We must block concurrent insertions while examining insert state to
 	 * determine the checkpoint REDO pointer.
@@ -8795,6 +8794,15 @@ CreateCheckPoint(int flags)
 	else
 		checkPoint.PrevTimeLineID = ThisTimeLineID;
 
+	/*
+	 * Update shared flag of fullPageWrites. WALInsertLock ensures that this
+	 * affects all WAL records exactly from REDO point. As the result a
+	 * checkpoint marked as fpw=true is ensured that all WAL records have full
+	 * page image.
+	 */
+	if (fullPageWrites != Insert->fullPageWrites)
+		Insert->fullPageWrites = fullPageWrites;
+
 	checkPoint.fullPageWrites = Insert->fullPageWrites;
 
 	/*
@@ -9642,65 +9650,6 @@ XlogChecksums(ChecksumType new_type)
 	XLogInsert(RM_XLOG_ID, XLOG_CHECKSUMS);
 }
 
-/*
- * Update full_page_writes in shared memory, and write an
- * XLOG_FPW_CHANGE record if necessary.
- *
- * Note: this function assumes there is no other process running
- * concurrently that could update it.
- */
-void
-UpdateFullPageWrites(void)
-{
-	XLogCtlInsert *Insert = &XLogCtl->Insert;
-
-	/*
-	 * Do nothing if full_page_writes has not been changed.
-	 *
-	 * It's safe to check the shared full_page_writes without the lock,
-	 * because we assume that there is no concurrently running process which
-	 * can update it.
-	 */
-	if (fullPageWrites == Insert->fullPageWrites)
-		return;
-
-	START_CRIT_SECTION();
-
-	/*
-	 * It's always safe to take full page images, even when not strictly
-	 * required, but not the other round. So if we're setting full_page_writes
-	 * to true, first set it true and then write the WAL record. If we're
-	 * setting it to false, first write the WAL record and then set the global
-	 * flag.
-	 */
-	if (fullPageWrites)
-	{
-		WALInsertLockAcquireExclusive();
-		Insert->fullPageWrites = true;
-		WALInsertLockRelease();
-	}
-
-	/*
-	 * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
-	 * full_page_writes during archive recovery, if required.
-	 */
-	if (XLogStandbyInfoActive() && !RecoveryInProgress())
-	{
-		XLogBeginInsert();
-		XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));
-
-		XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
-	}
-
-	if (!fullPageWrites)
-	{
-		WALInsertLockAcquireExclusive();
-		Insert->fullPageWrites = false;
-		WALInsertLockRelease();
-	}
-	END_CRIT_SECTION();
-}
-
 /*
  * Check that it's OK to switch to new timeline during recovery.
  *
@@ -10066,9 +10015,6 @@ xlog_redo(XLogReaderState *record)
 				XLogCtl->lastFpwDisableRecPtr = ReadRecPtr;
 			SpinLockRelease(&XLogCtl->info_lck);
 		}
-
-		/* Keep track of full_page_writes */
-		lastFullPageWrites = fpw;
 	}
 	else if (info == XLOG_CHECKSUMS)
 	{
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index 4b452e7cee..8b87d139d0 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -1359,12 +1359,6 @@ UpdateSharedMemoryConfig(void)
 	/* update global shmem state for sync rep */
 	SyncRepUpdateSyncStandbysDefined();
 
-	/*
-	 * If full_page_writes has been changed by SIGHUP, we update it in shared
-	 * memory and write an XLOG_FPW_CHANGE record.
-	 */
-	UpdateFullPageWrites();
-
 	elog(DEBUG2, "checkpointer updated shared memory configuration values");
 }
 
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index f21870c644..6e4648e94b 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -276,7 +276,6 @@ extern void CreateCheckPoint(int flags);
 extern bool CreateRestartPoint(int flags);
 extern void XLogPutNextOid(Oid nextOid);
 extern XLogRecPtr XLogRestorePoint(const char *rpName);
-extern void UpdateFullPageWrites(void);
 extern void GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p);
 extern XLogRecPtr GetRedoRecPtr(void);
 extern XLogRecPtr GetInsertRecPtr(void);
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index 33c59f9a63..1710b8ce1e 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -38,7 +38,7 @@ typedef struct CheckPoint
 	TimeLineID	ThisTimeLineID; /* current TLI */
 	TimeLineID	PrevTimeLineID; /* previous TLI, if this record begins a new
 								 * timeline (equals ThisTimeLineID otherwise) */
-	bool		fullPageWrites; /* current full_page_writes */
+	bool		fullPageWrites; /* true if all covering WALs are having FPI */
 	uint32		nextXidEpoch;	/* higher-order bits of nextXid */
 	TransactionId nextXid;		/* next free XID */
 	Oid			nextOid;		/* next free OID */
-- 
2.16.3

