From 0077a37887d87e8b9e30e22b25d179c5f348dd0a Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Wed, 16 Jun 2021 16:13:41 +0300
Subject: [PATCH 2/2] Split xlog.c into xlog.c and xlogrecovery.c

This moves the functions related to performing WAL recovery into the new
xlogrecovery.c source file, leaving xlog.c responsible for maintaining
the WAL buffers, coordinating the startup and switch from recovery to
normal operations, and other miscellaneous stuff that have always been in
xlog.c.
---
 src/backend/access/heap/heapam.c              |     1 +
 src/backend/access/heap/visibilitymap.c       |     1 +
 src/backend/access/transam/Makefile           |     1 +
 src/backend/access/transam/clog.c             |     1 +
 src/backend/access/transam/commit_ts.c        |     1 +
 src/backend/access/transam/multixact.c        |     1 +
 src/backend/access/transam/slru.c             |     1 +
 src/backend/access/transam/twophase.c         |     1 +
 src/backend/access/transam/varsup.c           |     1 +
 src/backend/access/transam/xact.c             |     1 +
 src/backend/access/transam/xlog.c             | 11296 +++++-----------
 src/backend/access/transam/xlogfuncs.c        |     2 +-
 src/backend/access/transam/xlogrecovery.c     |  4422 ++++++
 src/backend/access/transam/xlogutils.c        |    21 +
 src/backend/commands/dbcommands.c             |     1 +
 src/backend/commands/tablespace.c             |     1 +
 src/backend/postmaster/checkpointer.c         |     1 +
 src/backend/postmaster/postmaster.c           |     1 +
 src/backend/postmaster/startup.c              |     2 +
 .../replication/logical/logicalfuncs.c        |     1 +
 src/backend/replication/slotfuncs.c           |     1 +
 src/backend/replication/walreceiver.c         |     1 +
 src/backend/replication/walreceiverfuncs.c    |     1 +
 src/backend/replication/walsender.c           |     1 +
 src/backend/storage/buffer/bufmgr.c           |     2 +-
 src/backend/storage/ipc/ipci.c                |     3 +
 src/backend/storage/ipc/procarray.c           |     2 +-
 src/backend/storage/ipc/standby.c             |     3 +-
 src/backend/storage/lmgr/lock.c               |     1 +
 src/backend/storage/lmgr/proc.c               |     1 +
 src/backend/storage/smgr/smgr.c               |     1 +
 src/backend/storage/sync/sync.c               |     1 +
 src/backend/utils/misc/guc.c                  |     1 +
 src/include/access/xlog.h                     |   123 +-
 src/include/access/xlogrecovery.h             |   117 +
 src/include/access/xlogutils.h                |    42 +
 36 files changed, 8288 insertions(+), 7771 deletions(-)
 create mode 100644 src/backend/access/transam/xlogrecovery.c
 create mode 100644 src/include/access/xlogrecovery.h

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 2433998f39b..eb487e7173a 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -50,6 +50,7 @@
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "access/xloginsert.h"
+#include "access/xlogrecovery.h"
 #include "access/xlogutils.h"
 #include "catalog/catalog.h"
 #include "miscadmin.h"
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index e198df65d82..c3f2cab37e0 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -89,6 +89,7 @@
 #include "access/heapam_xlog.h"
 #include "access/visibilitymap.h"
 #include "access/xlog.h"
+#include "access/xlogutils.h"
 #include "miscadmin.h"
 #include "port/pg_bitutils.h"
 #include "storage/bufmgr.h"
diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile
index 595e02de722..79314c69abc 100644
--- a/src/backend/access/transam/Makefile
+++ b/src/backend/access/transam/Makefile
@@ -32,6 +32,7 @@ OBJS = \
 	xlogfuncs.o \
 	xloginsert.o \
 	xlogreader.o \
+	xlogrecovery.o \
 	xlogutils.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 6fa4713fb4d..b61826ce82b 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -37,6 +37,7 @@
 #include "access/transam.h"
 #include "access/xlog.h"
 #include "access/xloginsert.h"
+#include "access/xlogrecovery.h"
 #include "access/xlogutils.h"
 #include "miscadmin.h"
 #include "pg_trace.h"
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c
index 0985fa155ca..42ea8e53f2c 100644
--- a/src/backend/access/transam/commit_ts.c
+++ b/src/backend/access/transam/commit_ts.c
@@ -28,6 +28,7 @@
 #include "access/htup_details.h"
 #include "access/slru.h"
 #include "access/transam.h"
+#include "access/xlogutils.h"
 #include "catalog/pg_type.h"
 #include "funcapi.h"
 #include "miscadmin.h"
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index daab546f296..1908f90f5eb 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -76,6 +76,7 @@
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "access/xloginsert.h"
+#include "access/xlogutils.h"
 #include "catalog/pg_type.h"
 #include "commands/dbcommands.h"
 #include "funcapi.h"
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 82149ad7821..7585ae24ce9 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -54,6 +54,7 @@
 #include "access/slru.h"
 #include "access/transam.h"
 #include "access/xlog.h"
+#include "access/xlogutils.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/fd.h"
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index f67d813c564..7d2105b3934 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -86,6 +86,7 @@
 #include "access/xlog.h"
 #include "access/xloginsert.h"
 #include "access/xlogreader.h"
+#include "access/xlogrecovery.h"
 #include "access/xlogutils.h"
 #include "catalog/pg_type.h"
 #include "catalog/storage.h"
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index a22bf375f85..d816fcd48e4 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -19,6 +19,7 @@
 #include "access/transam.h"
 #include "access/xact.h"
 #include "access/xlog.h"
+#include "access/xlogutils.h"
 #include "commands/dbcommands.h"
 #include "miscadmin.h"
 #include "postmaster/autovacuum.h"
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 441445927e8..6fddf744638 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -29,6 +29,7 @@
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "access/xloginsert.h"
+#include "access/xlogrecovery.h"
 #include "access/xlogutils.h"
 #include "catalog/index.h"
 #include "catalog/namespace.h"
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index bf2c6e28402..39b9cc7d804 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -3,6 +3,30 @@
  * xlog.c
  *		PostgreSQL write-ahead log manager
  *
+ * The Write-Ahead Log (WAL) functionality is split into a few source
+ * files, in addition to this one:
+ *
+ * xloginsert.c - Functions for constructing WAL records
+ * xlogrecovery.c - WAL recovery and standby code
+ * xlogreader.c - Facility for reading WAL files and parsing WAL records
+ * xlogutils.c - Helper functions for WAL redo routines
+ *
+ * This file contains functions for coordinating database startup and
+ * checkpointing, and managing the write-ahead log buffers when the
+ * system is running.
+ *
+ * StartupXLOG() is the main entry point of the startup process. It
+ * coordinates database startup, performing WAL recovery, and the
+ * transition from WAL recovery into normal operations.
+ *
+ * XLogInsertRecord() inserts a WAL record into the WAL buffers. Most
+ * callers should not call this directly, but use the functions in
+ * xloginsert.c to construct the WAL record. XLogFlush() can be used
+ * to force the WAL to disk.
+ *
+ * In addition to those, there are many other functions for interrogating
+ * the current system state, and for starting/stopping backups.
+ *
  *
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
@@ -36,12 +60,11 @@
 #include "access/xlogarchive.h"
 #include "access/xloginsert.h"
 #include "access/xlogreader.h"
+#include "access/xlogrecovery.h"
 #include "access/xlogutils.h"
 #include "catalog/catversion.h"
 #include "catalog/pg_control.h"
 #include "catalog/pg_database.h"
-#include "commands/progress.h"
-#include "commands/tablespace.h"
 #include "common/controldata_utils.h"
 #include "executor/instrument.h"
 #include "miscadmin.h"
@@ -72,7 +95,6 @@
 #include "storage/smgr.h"
 #include "storage/spin.h"
 #include "storage/sync.h"
-#include "utils/builtins.h"
 #include "utils/guc.h"
 #include "utils/memutils.h"
 #include "utils/ps_status.h"
@@ -83,10 +105,6 @@
 
 extern uint32 bootstrap_data_checksum_version;
 
-/* Unsupported old recovery command file names (relative to $PGDATA) */
-#define RECOVERY_COMMAND_FILE	"recovery.conf"
-#define RECOVERY_COMMAND_DONE	"recovery.done"
-
 /* User-settable parameters */
 int			max_wal_size_mb = 1024; /* 1 GB */
 int			min_wal_size_mb = 80;	/* 80 MB */
@@ -173,13 +191,6 @@ const struct config_enum_entry archive_mode_options[] = {
 	{NULL, 0, false}
 };
 
-const struct config_enum_entry recovery_target_action_options[] = {
-	{"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
-	{"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
-	{"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
-	{NULL, 0, false}
-};
-
 /*
  * Statistics for current checkpoint are collected in this global struct.
  * Because only the checkpointer or a stand-alone backend can perform
@@ -190,31 +201,17 @@ CheckpointStatsData CheckpointStats;
 /*
  * ThisTimeLineID will be same in all backends --- it identifies current
  * WAL timeline for the database system.
+ *
+ * During normal operation, the only timeline we care about is ThisTimeLineID.
+ * During recovery, however, things are more complicated.  To simplify life
+ * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
+ * scan through the WAL history (that is, it is the line that was active when
+ * the currently-scanned WAL record was generated).  We also need a few other
+ * timeline values to track the recovery target and the historical TLIs that
+ * we might need to recover from. They are in xlogrecovery.c.
  */
 TimeLineID	ThisTimeLineID = 0;
 
-/*
- * Are we doing recovery from XLOG?
- *
- * This is only ever true in the startup process; it should be read as meaning
- * "this process is replaying WAL records", rather than "the system is in
- * recovery mode".  It should be examined primarily by functions that need
- * to act differently when called from a WAL redo function (e.g., to skip WAL
- * logging).  To check whether the system is in recovery regardless of which
- * process you're running in, use RecoveryInProgress() but only after shared
- * memory startup and lock initialization.
- */
-bool		InRecovery = false;
-
-/* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
-HotStandbyState standbyState = STANDBY_DISABLED;
-
-static XLogRecPtr LastRec;
-
-/* Local copy of WalRcv->flushedUpto */
-static XLogRecPtr flushedUpto = 0;
-static TimeLineID receiveTLI = 0;
-
 /*
  * During recovery, lastFullPageWrites keeps track of full_page_writes that
  * the replayed WAL records indicate. It's initialized with full_page_writes
@@ -230,18 +227,6 @@ static bool lastFullPageWrites;
  */
 static bool LocalRecoveryInProgress = true;
 
-/*
- * Local copy of SharedHotStandbyActive variable. False actually means "not
- * known, need to check the shared state".
- */
-static bool LocalHotStandbyActive = false;
-
-/*
- * Local copy of SharedPromoteIsTriggered variable. False actually means "not
- * known, need to check the shared state".
- */
-static bool LocalPromoteIsTriggered = false;
-
 /*
  * Local state for XLogInsertAllowed():
  *		1: unconditionally allowed to insert XLOG
@@ -254,93 +239,6 @@ static bool LocalPromoteIsTriggered = false;
  */
 static int	LocalXLogInsertAllowed = -1;
 
-/*
- * When ArchiveRecoveryRequested is set, archive recovery was requested,
- * ie. signal files were present. When InArchiveRecovery is set, we are
- * currently recovering using offline XLOG archives. These variables are only
- * valid in the startup process.
- *
- * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
- * currently performing crash recovery using only XLOG files in pg_wal, but
- * will switch to using offline XLOG archives as soon as we reach the end of
- * WAL in pg_wal.
-*/
-bool		ArchiveRecoveryRequested = false;
-bool		InArchiveRecovery = false;
-
-static bool standby_signal_file_found = false;
-static bool recovery_signal_file_found = false;
-
-/* Buffers dedicated to consistency checks of size BLCKSZ */
-static char *replay_image_masked = NULL;
-static char *primary_image_masked = NULL;
-
-/* options formerly taken from recovery.conf for archive recovery */
-char	   *recoveryRestoreCommand = NULL;
-char	   *recoveryEndCommand = NULL;
-char	   *archiveCleanupCommand = NULL;
-RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
-bool		recoveryTargetInclusive = true;
-int			recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
-TransactionId recoveryTargetXid;
-char	   *recovery_target_time_string;
-static TimestampTz recoveryTargetTime;
-const char *recoveryTargetName;
-XLogRecPtr	recoveryTargetLSN;
-int			recovery_min_apply_delay = 0;
-
-/* options formerly taken from recovery.conf for XLOG streaming */
-bool		StandbyModeRequested = false;
-char	   *PrimaryConnInfo = NULL;
-char	   *PrimarySlotName = NULL;
-char	   *PromoteTriggerFile = NULL;
-bool		wal_receiver_create_temp_slot = false;
-
-/* are we currently in standby mode? */
-bool		StandbyMode = false;
-
-/*
- * if recoveryStopsBefore/After returns true, it saves information of the stop
- * point here
- */
-static TransactionId recoveryStopXid;
-static TimestampTz recoveryStopTime;
-static XLogRecPtr recoveryStopLSN;
-static char recoveryStopName[MAXFNAMELEN];
-static bool recoveryStopAfter;
-
-/*
- * During normal operation, the only timeline we care about is ThisTimeLineID.
- * During recovery, however, things are more complicated.  To simplify life
- * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
- * scan through the WAL history (that is, it is the line that was active when
- * the currently-scanned WAL record was generated).  We also need these
- * timeline values:
- *
- * recoveryTargetTimeLineGoal: what the user requested, if any
- *
- * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
- *
- * recoveryTargetTLI: the currently understood target timeline; changes
- *
- * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
- * its known parents, newest first (so recoveryTargetTLI is always the
- * first list member).  Only these TLIs are expected to be seen in the WAL
- * segments we read, and indeed only these TLIs will be considered as
- * candidate WAL files to open at all.
- *
- * curFileTLI: the TLI appearing in the name of the current input WAL file.
- * (This is not necessarily the same as ThisTimeLineID, because we could
- * be scanning data that was copied from an ancestor timeline when the current
- * file was created.)  During a sequential scan we do not allow this value
- * to decrease.
- */
-RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
-TimeLineID	recoveryTargetTLIRequested = 0;
-TimeLineID	recoveryTargetTLI = 0;
-static List *expectedTLEs;
-static TimeLineID curFileTLI;
-
 /*
  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
@@ -379,21 +277,6 @@ static XLogRecPtr RedoRecPtr;
  */
 static bool doPageWrites;
 
-/* Has the recovery code requested a walreceiver wakeup? */
-static bool doRequestWalReceiverReply;
-
-/*
- * RedoStartLSN points to the checkpoint's REDO location which is specified
- * in a backup label file, backup history file or control file. In standby
- * mode, XLOG streaming usually starts from the position where an invalid
- * record was found. But if we fail to read even the initial checkpoint
- * record, we use the REDO location instead of the checkpoint location as
- * the start position of XLOG streaming. Otherwise we would have to jump
- * backwards to the REDO location after reading the checkpoint record,
- * because the REDO record can precede the checkpoint record.
- */
-static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
-
 /*----------
  * Shared-memory data structures for XLOG control
  *
@@ -653,18 +536,6 @@ typedef struct XLogCtlData
 	 */
 	RecoveryState SharedRecoveryState;
 
-	/*
-	 * SharedHotStandbyActive indicates if we allow hot standby queries to be
-	 * run.  Protected by info_lck.
-	 */
-	bool		SharedHotStandbyActive;
-
-	/*
-	 * SharedPromoteIsTriggered indicates if a standby promotion has been
-	 * triggered.  Protected by info_lck.
-	 */
-	bool		SharedPromoteIsTriggered;
-
 	/*
 	 * WalWriterSleeping indicates whether the WAL writer is currently in
 	 * low-power mode (and hence should be nudged if an async commit occurs).
@@ -672,23 +543,6 @@ typedef struct XLogCtlData
 	 */
 	bool		WalWriterSleeping;
 
-	/*
-	 * recoveryWakeupLatch is used to wake up the startup process to continue
-	 * WAL replay, if it is waiting for WAL to arrive or failover trigger file
-	 * to appear.
-	 *
-	 * Note that the startup process also uses another latch, its procLatch,
-	 * to wait for recovery conflict. If we get rid of recoveryWakeupLatch for
-	 * signaling the startup process in favor of using its procLatch, which
-	 * comports better with possible generic signal handlers using that latch.
-	 * But we should not do that because the startup process doesn't assume
-	 * that it's waken up by walreceiver process or SIGHUP signal handler
-	 * while it's waiting for recovery conflict. The separate latches,
-	 * recoveryWakeupLatch and procLatch, should be used for inter-process
-	 * communication for WAL replay and recovery conflict, respectively.
-	 */
-	Latch		recoveryWakeupLatch;
-
 	/*
 	 * During recovery, we keep a copy of the latest checkpoint record here.
 	 * lastCheckPointRecPtr points to start of checkpoint record and
@@ -701,28 +555,6 @@ typedef struct XLogCtlData
 	XLogRecPtr	lastCheckPointEndPtr;
 	CheckPoint	lastCheckPoint;
 
-	/*
-	 * lastReplayedEndRecPtr points to end+1 of the last record successfully
-	 * replayed. When we're currently replaying a record, ie. in a redo
-	 * function, replayEndRecPtr points to the end+1 of the record being
-	 * replayed, otherwise it's equal to lastReplayedEndRecPtr.
-	 */
-	XLogRecPtr	lastReplayedEndRecPtr;
-	TimeLineID	lastReplayedTLI;
-	XLogRecPtr	replayEndRecPtr;
-	TimeLineID	replayEndTLI;
-	/* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
-	TimestampTz recoveryLastXTime;
-
-	/*
-	 * timestamp of when we started replaying the current chunk of WAL data,
-	 * only relevant for replication or archive recovery
-	 */
-	TimestampTz currentChunkStartTime;
-	/* Recovery pause state */
-	RecoveryPauseState recoveryPauseState;
-	ConditionVariable recoveryNotPausedCV;
-
 	/*
 	 * lastFpwDisableRecPtr points to the start of the last replayed
 	 * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
@@ -780,21 +612,6 @@ static int	UsableBytesInSegment;
  */
 static XLogwrtResult LogwrtResult = {0, 0};
 
-/*
- * Codes indicating where we got a WAL file from during recovery, or where
- * to attempt to get one.
- */
-typedef enum
-{
-	XLOG_FROM_ANY = 0,			/* request to read WAL from any source */
-	XLOG_FROM_ARCHIVE,			/* restored using restore_command */
-	XLOG_FROM_PG_WAL,			/* existing file in pg_wal */
-	XLOG_FROM_STREAM			/* streamed from primary */
-} XLogSource;
-
-/* human-readable names for XLogSources, for debugging output */
-static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
-
 /*
  * openLogFile is -1 or a kernel FD for an open log file segment.
  * openLogSegNo identifies the segment.  These variables are only used to
@@ -804,57 +621,6 @@ static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "strea
 static int	openLogFile = -1;
 static XLogSegNo openLogSegNo = 0;
 
-/*
- * These variables are used similarly to the ones above, but for reading
- * the XLOG.  readOff is the offset of the page just read, readLen
- * indicates how much of it has been read into readBuf, and readSource
- * indicates where we got the currently open file from.
- * Note: we could use Reserve/ReleaseExternalFD to track consumption of
- * this FD too; but it doesn't currently seem worthwhile, since the XLOG is
- * not read by general-purpose sessions.
- */
-static int	readFile = -1;
-static XLogSegNo readSegNo = 0;
-static uint32 readOff = 0;
-static uint32 readLen = 0;
-static XLogSource readSource = XLOG_FROM_ANY;
-
-/*
- * Keeps track of which source we're currently reading from. This is
- * different from readSource in that this is always set, even when we don't
- * currently have a WAL file open. If lastSourceFailed is set, our last
- * attempt to read from currentSource failed, and we should try another source
- * next.
- *
- * pendingWalRcvRestart is set when a config change occurs that requires a
- * walreceiver restart.  This is only valid in XLOG_FROM_STREAM state.
- */
-static XLogSource currentSource = XLOG_FROM_ANY;
-static bool lastSourceFailed = false;
-static bool pendingWalRcvRestart = false;
-
-typedef struct XLogPageReadPrivate
-{
-	int			emode;
-	bool		fetching_ckpt;	/* are we fetching a checkpoint record? */
-	bool		randAccess;
-} XLogPageReadPrivate;
-
-/*
- * These variables track when we last obtained some WAL data to process,
- * and where we got it from.  (XLogReceiptSource is initially the same as
- * readSource, but readSource gets reset to zero when we don't have data
- * to process right now.  It is also different from currentSource, which
- * also changes when we try to read from a source and fail, while
- * XLogReceiptSource tracks where we last successfully read some WAL.)
- */
-static TimestampTz XLogReceiptTime = 0;
-static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
-
-/* State information for XLOG reading */
-static XLogRecPtr ReadRecPtr;	/* start of last record read */
-static XLogRecPtr EndRecPtr;	/* end+1 of last record read */
-
 /*
  * Local copies of equivalent fields in the control file.  When running
  * crash recovery, minRecoveryPoint is set to InvalidXLogRecPtr as we
@@ -862,22 +628,10 @@ static XLogRecPtr EndRecPtr;	/* end+1 of last record read */
  * switched to false to prevent any updates while replaying records.
  * Those values are kept consistent as long as crash recovery runs.
  */
-static XLogRecPtr minRecoveryPoint;
-static TimeLineID minRecoveryPointTLI;
+static XLogRecPtr LocalMinRecoveryPoint;
+static TimeLineID LocalMinRecoveryPointTLI;
 static bool updateMinRecoveryPoint = true;
 
-/*
- * Have we reached a consistent database state? In crash recovery, we have
- * to replay all the WAL, so reachedConsistency is never set. During archive
- * recovery, the database is consistent once minRecoveryPoint is reached.
- */
-bool		reachedConsistency = false;
-
-static bool InRedo = false;
-
-/* Have we launched bgwriter during recovery? */
-static bool bgwriterLaunched = false;
-
 /* For WALInsertLockAcquire/Release functions */
 static int	MyLockNo = 0;
 static bool holdingAllLocks = false;
@@ -886,20 +640,8 @@ static bool holdingAllLocks = false;
 static MemoryContext walDebugCxt = NULL;
 #endif
 
-static void readRecoverySignalFile(void);
-static void validateRecoveryParameters(void);
-static void exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog);
-static bool recoveryStopsBefore(XLogReaderState *record);
-static bool recoveryStopsAfter(XLogReaderState *record);
-static void ConfirmRecoveryPaused(void);
-static void recoveryPausesHere(bool endOfRecovery);
-static bool recoveryApplyDelay(XLogReaderState *record);
-static void SetLatestXTime(TimestampTz xtime);
-static void SetCurrentChunkStartTime(TimestampTz xtime);
 static void CheckRequiredParameterValues(void);
 static void XLogReportParameters(void);
-static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
-								TimeLineID prevTLI);
 static void LocalSetXLogInsertAllowed(void);
 static void CreateEndOfRecoveryRecord(void);
 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
@@ -907,19 +649,11 @@ static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
 static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
 
 static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
-static bool XLogCheckpointNeeded(XLogSegNo new_segno);
 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
 static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 								   bool find_free, XLogSegNo max_segno,
 								   bool use_lock);
-static int	XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
-						 XLogSource source, bool notfoundOk);
-static int	XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source);
-static int	XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
-						 int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
-static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
-										bool fetching_ckpt, XLogRecPtr tliRecPtr);
-static int	emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
+static int XLogFileOpen(XLogSegNo segno);
 static void XLogFileClose(void);
 static void PreallocXlogFiles(XLogRecPtr endptr);
 static void RemoveTempXlogFiles(void);
@@ -930,31 +664,18 @@ static void UpdateLastRemovedPtr(char *filename);
 static void ValidateXLOGDirectoryStructure(void);
 static void CleanupBackupHistory(void);
 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
-static XLogRecord *ReadRecord(XLogReaderState *xlogreader,
-							  int emode, bool fetching_ckpt);
-static void CheckRecoveryConsistency(void);
-static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
-										XLogRecPtr RecPtr, int whichChkpt, bool report);
-static bool rescanLatestTimeLine(void);
 static void InitControlFile(uint64 sysidentifier);
 static void WriteControlFile(void);
 static void ReadControlFile(void);
+static void UpdateControlFile(void);
 static char *str_time(pg_time_t tnow);
-static void SetPromoteIsTriggered(void);
-static bool CheckForStandbyTrigger(void);
 
 #ifdef WAL_DEBUG
 static void xlog_outrec(StringInfo buf, XLogReaderState *record);
 #endif
-static void xlog_block_info(StringInfo buf, XLogReaderState *record);
-static void xlog_outdesc(StringInfo buf, XLogReaderState *record);
 static void pg_start_backup_callback(int code, Datum arg);
 static void pg_stop_backup_callback(int code, Datum arg);
-static bool read_backup_label(XLogRecPtr *checkPointLoc,
-							  bool *backupEndRequired, bool *backupFromStandby);
-static bool read_tablespace_map(List **tablespaces);
 
-static void rm_redo_error_callback(void *arg);
 static int	get_sync_bit(int method);
 
 static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
@@ -969,7 +690,6 @@ static char *GetXLogBuffer(XLogRecPtr ptr);
 static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
 static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
 static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
-static void checkXLogConsistency(XLogReaderState *record);
 
 static void WALInsertLockAcquire(void);
 static void WALInsertLockAcquireExclusive(void);
@@ -1399,114 +1119,6 @@ ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
 	return true;
 }
 
-/*
- * Checks whether the current buffer page and backup page stored in the
- * WAL record are consistent or not. Before comparing the two pages, a
- * masking can be applied to the pages to ignore certain areas like hint bits,
- * unused space between pd_lower and pd_upper among other things. This
- * function should be called once WAL replay has been completed for a
- * given record.
- */
-static void
-checkXLogConsistency(XLogReaderState *record)
-{
-	RmgrId		rmid = XLogRecGetRmid(record);
-	RelFileNode rnode;
-	ForkNumber	forknum;
-	BlockNumber blkno;
-	int			block_id;
-
-	/* Records with no backup blocks have no need for consistency checks. */
-	if (!XLogRecHasAnyBlockRefs(record))
-		return;
-
-	Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
-
-	for (block_id = 0; block_id <= record->max_block_id; block_id++)
-	{
-		Buffer		buf;
-		Page		page;
-
-		if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
-		{
-			/*
-			 * WAL record doesn't contain a block reference with the given id.
-			 * Do nothing.
-			 */
-			continue;
-		}
-
-		Assert(XLogRecHasBlockImage(record, block_id));
-
-		if (XLogRecBlockImageApply(record, block_id))
-		{
-			/*
-			 * WAL record has already applied the page, so bypass the
-			 * consistency check as that would result in comparing the full
-			 * page stored in the record with itself.
-			 */
-			continue;
-		}
-
-		/*
-		 * Read the contents from the current buffer and store it in a
-		 * temporary page.
-		 */
-		buf = XLogReadBufferExtended(rnode, forknum, blkno,
-									 RBM_NORMAL_NO_LOG);
-		if (!BufferIsValid(buf))
-			continue;
-
-		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
-		page = BufferGetPage(buf);
-
-		/*
-		 * Take a copy of the local page where WAL has been applied to have a
-		 * comparison base before masking it...
-		 */
-		memcpy(replay_image_masked, page, BLCKSZ);
-
-		/* No need for this page anymore now that a copy is in. */
-		UnlockReleaseBuffer(buf);
-
-		/*
-		 * If the block LSN is already ahead of this WAL record, we can't
-		 * expect contents to match.  This can happen if recovery is
-		 * restarted.
-		 */
-		if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
-			continue;
-
-		/*
-		 * Read the contents from the backup copy, stored in WAL record and
-		 * store it in a temporary page. There is no need to allocate a new
-		 * page here, a local buffer is fine to hold its contents and a mask
-		 * can be directly applied on it.
-		 */
-		if (!RestoreBlockImage(record, block_id, primary_image_masked))
-			elog(ERROR, "failed to restore block image");
-
-		/*
-		 * If masking function is defined, mask both the primary and replay
-		 * images
-		 */
-		if (RmgrTable[rmid].rm_mask != NULL)
-		{
-			RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
-			RmgrTable[rmid].rm_mask(primary_image_masked, blkno);
-		}
-
-		/* Time to compare the primary and replay images. */
-		if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
-		{
-			elog(FATAL,
-				 "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
-				 rnode.spcNode, rnode.dbNode, rnode.relNode,
-				 forknum, blkno);
-		}
-	}
-}
-
 /*
  * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
  * area in the WAL.
@@ -2392,7 +2004,7 @@ XLOGfileslop(XLogRecPtr lastredoptr)
  *
  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
  */
-static bool
+bool
 XLogCheckpointNeeded(XLogSegNo new_segno)
 {
 	XLogSegNo	old_segno;
@@ -2773,6 +2385,77 @@ XLogGetReplicationSlotMinimumLSN(void)
 	return retval;
 }
 
+/*
+ * Callback frm PerformWalRecovery(), to tell us that we have switched from
+ * crash recover to archive recovery mode. Updates the control file.
+ */
+void
+SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr)
+{
+	/* initialize minRecoveryPoint to this record */
+	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+	ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
+	if (ControlFile->minRecoveryPoint < EndRecPtr)
+	{
+		ControlFile->minRecoveryPoint = EndRecPtr;
+		ControlFile->minRecoveryPointTLI = ThisTimeLineID;
+	}
+	/* update local copy */
+	LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
+	LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+
+	/*
+	 * The startup process can update its local copy of
+	 * minRecoveryPoint from this point.
+	 */
+	updateMinRecoveryPoint = true;
+
+	UpdateControlFile();
+
+	/*
+	 * We update SharedRecoveryState while holding the lock on
+	 * ControlFileLock so both states are consistent in shared
+	 * memory.
+	 */
+	SpinLockAcquire(&XLogCtl->info_lck);
+	XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
+	SpinLockRelease(&XLogCtl->info_lck);
+
+	LWLockRelease(ControlFileLock);
+}
+
+/*
+ * Callback frm PerformWalRecovery(), to tell us that we reached the end of backup
+ * Updates the control file.
+ */
+void
+ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli)
+{
+	/*
+	 * We have reached the end of base backup, as indicated by pg_control.
+	 * The data on disk is now consistent (unless minRecovery point is further
+	 * ahead, which can happen if we crashed during previous recovery). Reset backupStartPoint and
+	 * backupEndPoint, and update minRecoveryPoint to make sure we don't
+	 * allow starting up at an earlier point even if recovery is stopped
+	 * and restarted soon after this.
+	 */
+	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+
+	if (ControlFile->minRecoveryPoint < EndRecPtr)
+	{
+		ControlFile->minRecoveryPoint = EndRecPtr;
+		ControlFile->minRecoveryPointTLI = tli;
+	}
+
+	ControlFile->backupStartPoint = InvalidXLogRecPtr;
+	ControlFile->backupEndPoint = InvalidXLogRecPtr;
+	ControlFile->backupEndRequired = false;
+	UpdateControlFile();
+
+	LWLockRelease(ControlFileLock);
+}
+
+
 /*
  * Advance minRecoveryPoint in control file.
  *
@@ -2786,7 +2469,7 @@ static void
 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
 {
 	/* Quick check using our local copy of the variable */
-	if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
+	if (!updateMinRecoveryPoint || (!force && lsn <= LocalMinRecoveryPoint))
 		return;
 
 	/*
@@ -2800,7 +2483,7 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
 	 * available is replayed in this case.  This also saves from extra locks
 	 * taken on the control file from the startup process.
 	 */
-	if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
+	if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
 	{
 		updateMinRecoveryPoint = false;
 		return;
@@ -2809,12 +2492,12 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 
 	/* update local copy */
-	minRecoveryPoint = ControlFile->minRecoveryPoint;
-	minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+	LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
+	LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
 
-	if (XLogRecPtrIsInvalid(minRecoveryPoint))
+	if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
 		updateMinRecoveryPoint = false;
-	else if (force || minRecoveryPoint < lsn)
+	else if (force || LocalMinRecoveryPoint < lsn)
 	{
 		XLogRecPtr	newMinRecoveryPoint;
 		TimeLineID	newMinRecoveryPointTLI;
@@ -2832,11 +2515,7 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
 		 * all.  Instead, we just log a warning and continue with recovery.
 		 * (See also the comments about corrupt LSNs in XLogFlush.)
 		 */
-		SpinLockAcquire(&XLogCtl->info_lck);
-		newMinRecoveryPoint = XLogCtl->replayEndRecPtr;
-		newMinRecoveryPointTLI = XLogCtl->replayEndTLI;
-		SpinLockRelease(&XLogCtl->info_lck);
-
+		newMinRecoveryPoint = GetCurrentReplayRecPtr(&newMinRecoveryPointTLI);
 		if (!force && newMinRecoveryPoint < lsn)
 			elog(WARNING,
 				 "xlog min recovery request %X/%X is past current point %X/%X",
@@ -2848,12 +2527,12 @@ UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
 			ControlFile->minRecoveryPoint = newMinRecoveryPoint;
 			ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
 			UpdateControlFile();
-			minRecoveryPoint = newMinRecoveryPoint;
-			minRecoveryPointTLI = newMinRecoveryPointTLI;
+			LocalMinRecoveryPoint = newMinRecoveryPoint;
+			LocalMinRecoveryPointTLI = newMinRecoveryPointTLI;
 
 			ereport(DEBUG2,
 					(errmsg_internal("updated min recovery point to %X/%X on timeline %u",
-									 LSN_FORMAT_ARGS(minRecoveryPoint),
+									 LSN_FORMAT_ARGS(newMinRecoveryPoint),
 									 newMinRecoveryPointTLI)));
 		}
 	}
@@ -3205,11 +2884,11 @@ XLogNeedsFlush(XLogRecPtr record)
 		 * which cannot update its local copy of minRecoveryPoint as long as
 		 * it has not replayed all WAL available when doing crash recovery.
 		 */
-		if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
+		if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint) && InRecovery)
 			updateMinRecoveryPoint = false;
 
 		/* Quick exit if already known to be updated or cannot be updated */
-		if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
+		if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
 			return false;
 
 		/*
@@ -3218,8 +2897,8 @@ XLogNeedsFlush(XLogRecPtr record)
 		 */
 		if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
 			return true;
-		minRecoveryPoint = ControlFile->minRecoveryPoint;
-		minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+		LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
+		LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
 		LWLockRelease(ControlFileLock);
 
 		/*
@@ -3227,11 +2906,11 @@ XLogNeedsFlush(XLogRecPtr record)
 		 * process doing crash recovery, which should not update the control
 		 * file value if crash recovery is still running.
 		 */
-		if (XLogRecPtrIsInvalid(minRecoveryPoint))
+		if (XLogRecPtrIsInvalid(LocalMinRecoveryPoint))
 			updateMinRecoveryPoint = false;
 
 		/* check again */
-		if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
+		if (record <= LocalMinRecoveryPoint || !updateMinRecoveryPoint)
 			return false;
 		else
 			return true;
@@ -3680,7 +3359,7 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 /*
  * Open a pre-existing logfile segment for writing.
  */
-int
+static int
 XLogFileOpen(XLogSegNo segno)
 {
 	char		path[MAXPGPATH];
@@ -3698,246 +3377,61 @@ XLogFileOpen(XLogSegNo segno)
 }
 
 /*
- * Open a logfile segment for reading (during recovery).
- *
- * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
- * Otherwise, it's assumed to be already available in pg_wal.
+ * Close the current logfile segment for writing.
  */
-static int
-XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
-			 XLogSource source, bool notfoundOk)
+static void
+XLogFileClose(void)
 {
-	char		xlogfname[MAXFNAMELEN];
-	char		activitymsg[MAXFNAMELEN + 16];
-	char		path[MAXPGPATH];
-	int			fd;
-
-	XLogFileName(xlogfname, tli, segno, wal_segment_size);
-
-	switch (source)
-	{
-		case XLOG_FROM_ARCHIVE:
-			/* Report recovery progress in PS display */
-			snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
-					 xlogfname);
-			set_ps_display(activitymsg);
-
-			if (!RestoreArchivedFile(path, xlogfname,
-									 "RECOVERYXLOG",
-									 wal_segment_size,
-									 InRedo))
-				return -1;
-			break;
-
-		case XLOG_FROM_PG_WAL:
-		case XLOG_FROM_STREAM:
-			XLogFilePath(path, tli, segno, wal_segment_size);
-			break;
-
-		default:
-			elog(ERROR, "invalid XLogFileRead source %d", source);
-	}
+	Assert(openLogFile >= 0);
 
 	/*
-	 * If the segment was fetched from archival storage, replace the existing
-	 * xlog segment (if any) with the archival version.
+	 * WAL segment files will not be re-read in normal operation, so we advise
+	 * the OS to release any cached pages.  But do not do so if WAL archiving
+	 * or streaming is active, because archiver and walsender process could
+	 * use the cache to read the WAL segment.
 	 */
-	if (source == XLOG_FROM_ARCHIVE)
-	{
-		KeepFileRestoredFromArchive(path, xlogfname);
-
-		/*
-		 * Set path to point at the new file in pg_wal.
-		 */
-		snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
-	}
+#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+	if (!XLogIsNeeded())
+		(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
+#endif
 
-	fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
-	if (fd >= 0)
+	if (close(openLogFile) != 0)
 	{
-		/* Success! */
-		curFileTLI = tli;
-
-		/* Report recovery progress in PS display */
-		snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
-				 xlogfname);
-		set_ps_display(activitymsg);
-
-		/* Track source of data in assorted state variables */
-		readSource = source;
-		XLogReceiptSource = source;
-		/* In FROM_STREAM case, caller tracks receipt time, not me */
-		if (source != XLOG_FROM_STREAM)
-			XLogReceiptTime = GetCurrentTimestamp();
+		char		xlogfname[MAXFNAMELEN];
+		int			save_errno = errno;
 
-		return fd;
-	}
-	if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
+		XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo, wal_segment_size);
+		errno = save_errno;
 		ereport(PANIC,
 				(errcode_for_file_access(),
-				 errmsg("could not open file \"%s\": %m", path)));
-	return -1;
+				 errmsg("could not close file \"%s\": %m", xlogfname)));
+	}
+
+	openLogFile = -1;
+	ReleaseExternalFD();
 }
 
 /*
- * Open a logfile segment for reading (during recovery).
+ * Preallocate log files beyond the specified log endpoint.
  *
- * This version searches for the segment with any TLI listed in expectedTLEs.
+ * XXX this is currently extremely conservative, since it forces only one
+ * future log segment to exist, and even that only if we are 75% done with
+ * the current one.  This is only appropriate for very low-WAL-volume systems.
+ * High-volume systems will be OK once they've built up a sufficient set of
+ * recycled log segments, but the startup transient is likely to include
+ * a lot of segment creations by foreground processes, which is not so good.
  */
-static int
-XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source)
+static void
+PreallocXlogFiles(XLogRecPtr endptr)
 {
-	char		path[MAXPGPATH];
-	ListCell   *cell;
-	int			fd;
-	List	   *tles;
-
-	/*
-	 * Loop looking for a suitable timeline ID: we might need to read any of
-	 * the timelines listed in expectedTLEs.
-	 *
-	 * We expect curFileTLI on entry to be the TLI of the preceding file in
-	 * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
-	 * to go backwards; this prevents us from picking up the wrong file when a
-	 * parent timeline extends to higher segment numbers than the child we
-	 * want to read.
-	 *
-	 * If we haven't read the timeline history file yet, read it now, so that
-	 * we know which TLIs to scan.  We don't save the list in expectedTLEs,
-	 * however, unless we actually find a valid segment.  That way if there is
-	 * neither a timeline history file nor a WAL segment in the archive, and
-	 * streaming replication is set up, we'll read the timeline history file
-	 * streamed from the primary when we start streaming, instead of
-	 * recovering with a dummy history generated here.
-	 */
-	if (expectedTLEs)
-		tles = expectedTLEs;
-	else
-		tles = readTimeLineHistory(recoveryTargetTLI);
+	XLogSegNo	_logSegNo;
+	int			lf;
+	bool		use_existent;
+	uint64		offset;
 
-	foreach(cell, tles)
-	{
-		TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
-		TimeLineID	tli = hent->tli;
-
-		if (tli < curFileTLI)
-			break;				/* don't bother looking at too-old TLIs */
-
-		/*
-		 * Skip scanning the timeline ID that the logfile segment to read
-		 * doesn't belong to
-		 */
-		if (hent->begin != InvalidXLogRecPtr)
-		{
-			XLogSegNo	beginseg = 0;
-
-			XLByteToSeg(hent->begin, beginseg, wal_segment_size);
-
-			/*
-			 * The logfile segment that doesn't belong to the timeline is
-			 * older or newer than the segment that the timeline started or
-			 * ended at, respectively. It's sufficient to check only the
-			 * starting segment of the timeline here. Since the timelines are
-			 * scanned in descending order in this loop, any segments newer
-			 * than the ending segment should belong to newer timeline and
-			 * have already been read before. So it's not necessary to check
-			 * the ending segment of the timeline here.
-			 */
-			if (segno < beginseg)
-				continue;
-		}
-
-		if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
-		{
-			fd = XLogFileRead(segno, emode, tli,
-							  XLOG_FROM_ARCHIVE, true);
-			if (fd != -1)
-			{
-				elog(DEBUG1, "got WAL segment from archive");
-				if (!expectedTLEs)
-					expectedTLEs = tles;
-				return fd;
-			}
-		}
-
-		if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
-		{
-			fd = XLogFileRead(segno, emode, tli,
-							  XLOG_FROM_PG_WAL, true);
-			if (fd != -1)
-			{
-				if (!expectedTLEs)
-					expectedTLEs = tles;
-				return fd;
-			}
-		}
-	}
-
-	/* Couldn't find it.  For simplicity, complain about front timeline */
-	XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
-	errno = ENOENT;
-	ereport(emode,
-			(errcode_for_file_access(),
-			 errmsg("could not open file \"%s\": %m", path)));
-	return -1;
-}
-
-/*
- * Close the current logfile segment for writing.
- */
-static void
-XLogFileClose(void)
-{
-	Assert(openLogFile >= 0);
-
-	/*
-	 * WAL segment files will not be re-read in normal operation, so we advise
-	 * the OS to release any cached pages.  But do not do so if WAL archiving
-	 * or streaming is active, because archiver and walsender process could
-	 * use the cache to read the WAL segment.
-	 */
-#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
-	if (!XLogIsNeeded())
-		(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
-#endif
-
-	if (close(openLogFile) != 0)
-	{
-		char		xlogfname[MAXFNAMELEN];
-		int			save_errno = errno;
-
-		XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo, wal_segment_size);
-		errno = save_errno;
-		ereport(PANIC,
-				(errcode_for_file_access(),
-				 errmsg("could not close file \"%s\": %m", xlogfname)));
-	}
-
-	openLogFile = -1;
-	ReleaseExternalFD();
-}
-
-/*
- * Preallocate log files beyond the specified log endpoint.
- *
- * XXX this is currently extremely conservative, since it forces only one
- * future log segment to exist, and even that only if we are 75% done with
- * the current one.  This is only appropriate for very low-WAL-volume systems.
- * High-volume systems will be OK once they've built up a sufficient set of
- * recycled log segments, but the startup transient is likely to include
- * a lot of segment creations by foreground processes, which is not so good.
- */
-static void
-PreallocXlogFiles(XLogRecPtr endptr)
-{
-	XLogSegNo	_logSegNo;
-	int			lf;
-	bool		use_existent;
-	uint64		offset;
-
-	XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
-	offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
-	if (offset >= (uint32) (0.75 * wal_segment_size))
+	XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
+	offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
+	if (offset >= (uint32) (0.75 * wal_segment_size))
 	{
 		_logSegNo++;
 		use_existent = true;
@@ -4131,7 +3625,7 @@ RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr)
  * 'switchpoint' is the current point in WAL where we switch to new timeline,
  * and 'newTLI' is the new timeline we switch to.
  */
-static void
+void
 RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
 {
 	DIR		   *xldir;
@@ -4351,249 +3845,6 @@ CleanupBackupHistory(void)
 	FreeDir(xldir);
 }
 
-/*
- * Attempt to read the next XLOG record.
- *
- * Before first call, the reader needs to be positioned to the first record
- * by calling XLogBeginRead().
- *
- * If no valid record is available, returns NULL, or fails if emode is PANIC.
- * (emode must be either PANIC, LOG). In standby mode, retries until a valid
- * record is available.
- */
-static XLogRecord *
-ReadRecord(XLogReaderState *xlogreader, int emode,
-		   bool fetching_ckpt)
-{
-	XLogRecord *record;
-	XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
-
-	/* Pass through parameters to XLogPageRead */
-	private->fetching_ckpt = fetching_ckpt;
-	private->emode = emode;
-	private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
-
-	/* This is the first attempt to read this page. */
-	lastSourceFailed = false;
-
-	for (;;)
-	{
-		char	   *errormsg;
-
-		record = XLogReadRecord(xlogreader, &errormsg);
-		ReadRecPtr = xlogreader->ReadRecPtr;
-		EndRecPtr = xlogreader->EndRecPtr;
-		if (record == NULL)
-		{
-			if (readFile >= 0)
-			{
-				close(readFile);
-				readFile = -1;
-			}
-
-			/*
-			 * We only end up here without a message when XLogPageRead()
-			 * failed - in that case we already logged something. In
-			 * StandbyMode that only happens if we have been triggered, so we
-			 * shouldn't loop anymore in that case.
-			 */
-			if (errormsg)
-				ereport(emode_for_corrupt_record(emode, EndRecPtr),
-						(errmsg_internal("%s", errormsg) /* already translated */ ));
-		}
-
-		/*
-		 * Check page TLI is one of the expected values.
-		 */
-		else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
-		{
-			char		fname[MAXFNAMELEN];
-			XLogSegNo	segno;
-			int32		offset;
-
-			XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
-			offset = XLogSegmentOffset(xlogreader->latestPagePtr,
-									   wal_segment_size);
-			XLogFileName(fname, xlogreader->seg.ws_tli, segno,
-						 wal_segment_size);
-			ereport(emode_for_corrupt_record(emode, EndRecPtr),
-					(errmsg("unexpected timeline ID %u in log segment %s, offset %u",
-							xlogreader->latestPageTLI,
-							fname,
-							offset)));
-			record = NULL;
-		}
-
-		if (record)
-		{
-			/* Great, got a record */
-			return record;
-		}
-		else
-		{
-			/* No valid record available from this source */
-			lastSourceFailed = true;
-
-			/*
-			 * If archive recovery was requested, but we were still doing
-			 * crash recovery, switch to archive recovery and retry using the
-			 * offline archive. We have now replayed all the valid WAL in
-			 * pg_wal, so we are presumably now consistent.
-			 *
-			 * We require that there's at least some valid WAL present in
-			 * pg_wal, however (!fetching_ckpt).  We could recover using the
-			 * WAL from the archive, even if pg_wal is completely empty, but
-			 * we'd have no idea how far we'd have to replay to reach
-			 * consistency.  So err on the safe side and give up.
-			 */
-			if (!InArchiveRecovery && ArchiveRecoveryRequested &&
-				!fetching_ckpt)
-			{
-				ereport(DEBUG1,
-						(errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
-				InArchiveRecovery = true;
-				if (StandbyModeRequested)
-					StandbyMode = true;
-
-				/* initialize minRecoveryPoint to this record */
-				LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-				ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
-				if (ControlFile->minRecoveryPoint < EndRecPtr)
-				{
-					ControlFile->minRecoveryPoint = EndRecPtr;
-					ControlFile->minRecoveryPointTLI = ThisTimeLineID;
-				}
-				/* update local copy */
-				minRecoveryPoint = ControlFile->minRecoveryPoint;
-				minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
-
-				/*
-				 * The startup process can update its local copy of
-				 * minRecoveryPoint from this point.
-				 */
-				updateMinRecoveryPoint = true;
-
-				UpdateControlFile();
-
-				/*
-				 * We update SharedRecoveryState while holding the lock on
-				 * ControlFileLock so both states are consistent in shared
-				 * memory.
-				 */
-				SpinLockAcquire(&XLogCtl->info_lck);
-				XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
-				SpinLockRelease(&XLogCtl->info_lck);
-
-				LWLockRelease(ControlFileLock);
-
-				CheckRecoveryConsistency();
-
-				/*
-				 * Before we retry, reset lastSourceFailed and currentSource
-				 * so that we will check the archive next.
-				 */
-				lastSourceFailed = false;
-				currentSource = XLOG_FROM_ANY;
-
-				continue;
-			}
-
-			/* In standby mode, loop back to retry. Otherwise, give up. */
-			if (StandbyMode && !CheckForStandbyTrigger())
-				continue;
-			else
-				return NULL;
-		}
-	}
-}
-
-/*
- * Scan for new timelines that might have appeared in the archive since we
- * started recovery.
- *
- * If there are any, the function changes recovery target TLI to the latest
- * one and returns 'true'.
- */
-static bool
-rescanLatestTimeLine(void)
-{
-	List	   *newExpectedTLEs;
-	bool		found;
-	ListCell   *cell;
-	TimeLineID	newtarget;
-	TimeLineID	oldtarget = recoveryTargetTLI;
-	TimeLineHistoryEntry *currentTle = NULL;
-
-	newtarget = findNewestTimeLine(recoveryTargetTLI);
-	if (newtarget == recoveryTargetTLI)
-	{
-		/* No new timelines found */
-		return false;
-	}
-
-	/*
-	 * Determine the list of expected TLIs for the new TLI
-	 */
-
-	newExpectedTLEs = readTimeLineHistory(newtarget);
-
-	/*
-	 * If the current timeline is not part of the history of the new timeline,
-	 * we cannot proceed to it.
-	 */
-	found = false;
-	foreach(cell, newExpectedTLEs)
-	{
-		currentTle = (TimeLineHistoryEntry *) lfirst(cell);
-
-		if (currentTle->tli == recoveryTargetTLI)
-		{
-			found = true;
-			break;
-		}
-	}
-	if (!found)
-	{
-		ereport(LOG,
-				(errmsg("new timeline %u is not a child of database system timeline %u",
-						newtarget,
-						ThisTimeLineID)));
-		return false;
-	}
-
-	/*
-	 * The current timeline was found in the history file, but check that the
-	 * next timeline was forked off from it *after* the current recovery
-	 * location.
-	 */
-	if (currentTle->end < EndRecPtr)
-	{
-		ereport(LOG,
-				(errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
-						newtarget,
-						ThisTimeLineID,
-						LSN_FORMAT_ARGS(EndRecPtr))));
-		return false;
-	}
-
-	/* The new timeline history seems valid. Switch target */
-	recoveryTargetTLI = newtarget;
-	list_free_deep(expectedTLEs);
-	expectedTLEs = newExpectedTLEs;
-
-	/*
-	 * As in StartupXLOG(), try to ensure we have all the history files
-	 * between the old target and new target in pg_wal.
-	 */
-	restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
-
-	ereport(LOG,
-			(errmsg("new target timeline is %u",
-					recoveryTargetTLI)));
-
-	return true;
-}
-
 /*
  * I/O routines for pg_control
  *
@@ -5214,15 +4465,11 @@ XLOGShmemInit(void)
 	 */
 	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
 	XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
-	XLogCtl->SharedHotStandbyActive = false;
-	XLogCtl->SharedPromoteIsTriggered = false;
 	XLogCtl->WalWriterSleeping = false;
 
 	SpinLockInit(&XLogCtl->Insert.insertpos_lck);
 	SpinLockInit(&XLogCtl->info_lck);
 	SpinLockInit(&XLogCtl->ulsn_lck);
-	InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
-	ConditionVariableInit(&XLogCtl->recoveryNotPausedCV);
 }
 
 /*
@@ -5408,7507 +4655,4267 @@ str_time(pg_time_t tnow)
 }
 
 /*
- * See if there are any recovery signal files and if so, set state for
- * recovery.
+ * Check to see if required parameters are set high enough on this server
+ * for various aspects of recovery operation.
  *
- * See if there is a recovery command file (recovery.conf), and if so
- * throw an ERROR since as of PG12 we no longer recognize that.
+ * Note that all the parameters which this function tests need to be
+ * listed in Administrator's Overview section in high-availability.sgml.
+ * If you change them, don't forget to update the list.
  */
 static void
-readRecoverySignalFile(void)
+CheckRequiredParameterValues(void)
 {
-	struct stat stat_buf;
-
-	if (IsBootstrapProcessingMode())
-		return;
-
 	/*
-	 * Check for old recovery API file: recovery.conf
+	 * For archive recovery, the WAL must be generated with at least 'replica'
+	 * wal_level.
 	 */
-	if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
+	if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
+	{
 		ereport(FATAL,
-				(errcode_for_file_access(),
-				 errmsg("using recovery command file \"%s\" is not supported",
-						RECOVERY_COMMAND_FILE)));
-
-	/*
-	 * Remove unused .done file, if present. Ignore if absent.
-	 */
-	unlink(RECOVERY_COMMAND_DONE);
+				(errmsg("WAL was generated with wal_level=minimal, cannot continue recovering"),
+				 errdetail("This happens if you temporarily set wal_level=minimal on the server."),
+				 errhint("Use a backup taken after setting wal_level to higher than minimal.")));
+	}
 
 	/*
-	 * Check for recovery signal files and if found, fsync them since they
-	 * represent server state information.  We don't sweat too much about the
-	 * possibility of fsync failure, however.
-	 *
-	 * If present, standby signal file takes precedence. If neither is present
-	 * then we won't enter archive recovery.
+	 * For Hot Standby, the WAL must be generated with 'replica' mode, and we
+	 * must have at least as many backend slots as the primary.
 	 */
-	if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
-	{
-		int			fd;
-
-		fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
-							   S_IRUSR | S_IWUSR);
-		if (fd >= 0)
-		{
-			(void) pg_fsync(fd);
-			close(fd);
-		}
-		standby_signal_file_found = true;
-	}
-	else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
+	if (ArchiveRecoveryRequested && EnableHotStandby)
 	{
-		int			fd;
-
-		fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
-							   S_IRUSR | S_IWUSR);
-		if (fd >= 0)
-		{
-			(void) pg_fsync(fd);
-			close(fd);
-		}
-		recovery_signal_file_found = true;
+		/* We ignore autovacuum_max_workers when we make this test. */
+		RecoveryRequiresIntParameter("max_connections",
+									 MaxConnections,
+									 ControlFile->MaxConnections);
+		RecoveryRequiresIntParameter("max_worker_processes",
+									 max_worker_processes,
+									 ControlFile->max_worker_processes);
+		RecoveryRequiresIntParameter("max_wal_senders",
+									 max_wal_senders,
+									 ControlFile->max_wal_senders);
+		RecoveryRequiresIntParameter("max_prepared_transactions",
+									 max_prepared_xacts,
+									 ControlFile->max_prepared_xacts);
+		RecoveryRequiresIntParameter("max_locks_per_transaction",
+									 max_locks_per_xact,
+									 ControlFile->max_locks_per_xact);
 	}
+}
 
-	StandbyModeRequested = false;
-	ArchiveRecoveryRequested = false;
-	if (standby_signal_file_found)
-	{
-		StandbyModeRequested = true;
-		ArchiveRecoveryRequested = true;
-	}
-	else if (recovery_signal_file_found)
-	{
-		StandbyModeRequested = false;
-		ArchiveRecoveryRequested = true;
-	}
-	else
-		return;
+/*
+ * This must be called ONCE during postmaster or standalone-backend startup
+ */
+void
+StartupXLOG(void)
+{
+	XLogCtlInsert *Insert;
+	CheckPoint	checkPoint;
+	bool		wasShutdown;
+	XLogRecPtr	EndOfLog;
+	TimeLineID	EndOfLogTLI;
+	TimeLineID	PrevTimeLineID;
+	TransactionId oldestActiveXID;
+	bool		promoted = false;
+	XLogRecPtr	LastRec;
+	bool		haveTblspcMap;
+	bool		haveBackupLabel;
+	char	   *lastPage;
+	XLogRecPtr	lastPageBeginPtr;
+	char	   *reason;
+	bool		bgwriterLaunched;
+	bool		standby_signal_file_found;
+	bool		recovery_signal_file_found;
 
 	/*
-	 * We don't support standby mode in standalone backends; that requires
-	 * other processes such as the WAL receiver to be alive.
+	 * We should have an aux process resource owner to use, and we should not
+	 * be in a transaction that's installed some other resowner.
 	 */
-	if (StandbyModeRequested && !IsUnderPostmaster)
-		ereport(FATAL,
-				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-				 errmsg("standby mode is not supported by single-user servers")));
-}
-
-static void
-validateRecoveryParameters(void)
-{
-	if (!ArchiveRecoveryRequested)
-		return;
+	Assert(AuxProcessResourceOwner != NULL);
+	Assert(CurrentResourceOwner == NULL ||
+		   CurrentResourceOwner == AuxProcessResourceOwner);
+	CurrentResourceOwner = AuxProcessResourceOwner;
 
 	/*
-	 * Check for compulsory parameters
+	 * Check that contents look valid.
 	 */
-	if (StandbyModeRequested)
-	{
-		if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
-			(recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
-			ereport(WARNING,
-					(errmsg("specified neither primary_conninfo nor restore_command"),
-					 errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
-	}
-	else
+	if (!XRecOffIsValid(ControlFile->checkPoint))
+		ereport(FATAL,
+				(errmsg("control file contains invalid checkpoint location")));
+
+	switch (ControlFile->state)
 	{
-		if (recoveryRestoreCommand == NULL ||
-			strcmp(recoveryRestoreCommand, "") == 0)
+		case DB_SHUTDOWNED:
+
+			/*
+			 * This is the expected case, so don't be chatty in standalone
+			 * mode
+			 */
+			ereport(IsPostmasterEnvironment ? LOG : NOTICE,
+					(errmsg("database system was shut down at %s",
+							str_time(ControlFile->time))));
+			break;
+
+		case DB_SHUTDOWNED_IN_RECOVERY:
+			ereport(LOG,
+					(errmsg("database system was shut down in recovery at %s",
+							str_time(ControlFile->time))));
+			break;
+
+		case DB_SHUTDOWNING:
+			ereport(LOG,
+					(errmsg("database system shutdown was interrupted; last known up at %s",
+							str_time(ControlFile->time))));
+			break;
+
+		case DB_IN_CRASH_RECOVERY:
+			ereport(LOG,
+					(errmsg("database system was interrupted while in recovery at %s",
+							str_time(ControlFile->time)),
+					 errhint("This probably means that some data is corrupted and"
+							 " you will have to use the last backup for recovery.")));
+			break;
+
+		case DB_IN_ARCHIVE_RECOVERY:
+			ereport(LOG,
+					(errmsg("database system was interrupted while in recovery at log time %s",
+							str_time(ControlFile->checkPointCopy.time)),
+					 errhint("If this has occurred more than once some data might be corrupted"
+							 " and you might need to choose an earlier recovery target.")));
+			break;
+
+		case DB_IN_PRODUCTION:
+			ereport(LOG,
+					(errmsg("database system was interrupted; last known up at %s",
+							str_time(ControlFile->time))));
+			break;
+
+		default:
 			ereport(FATAL,
-					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-					 errmsg("must specify restore_command when standby mode is not enabled")));
+					(errmsg("control file contains invalid database cluster state")));
 	}
 
+	/* This is just to allow attaching to startup process with a debugger */
+#ifdef XLOG_REPLAY_DELAY
+	if (ControlFile->state != DB_SHUTDOWNED)
+		pg_usleep(60000000L);
+#endif
+
 	/*
-	 * Override any inconsistent requests. Note that this is a change of
-	 * behaviour in 9.5; prior to this we simply ignored a request to pause if
-	 * hot_standby = off, which was surprising behaviour.
+	 * Verify that pg_wal and pg_wal/archive_status exist.  In cases where
+	 * someone has performed a copy for PITR, these directories may have been
+	 * excluded and need to be re-created.
 	 */
-	if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
-		!EnableHotStandby)
-		recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
+	ValidateXLOGDirectoryStructure();
 
-	/*
-	 * Final parsing of recovery_target_time string; see also
-	 * check_recovery_target_time().
+	/*----------
+	 * If we previously crashed, perform a couple of actions:
+	 *
+	 * - The pg_wal directory may still include some temporary WAL segments
+	 *   used when creating a new segment, so perform some clean up to not
+	 *   bloat this path.  This is done first as there is no point to sync
+	 *   this temporary data.
+	 *
+	 * - There might be data which we had written, intending to fsync it, but
+	 *   which we had not actually fsync'd yet.  Therefore, a power failure in
+	 *   the near future might cause earlier unflushed writes to be lost, even
+	 *   though more recent data written to disk from here on would be
+	 *   persisted.  To avoid that, fsync the entire data directory.
 	 */
-	if (recoveryTarget == RECOVERY_TARGET_TIME)
+	if (ControlFile->state != DB_SHUTDOWNED &&
+		ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
 	{
-		recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
-																	 CStringGetDatum(recovery_target_time_string),
-																	 ObjectIdGetDatum(InvalidOid),
-																	 Int32GetDatum(-1)));
+		RemoveTempXlogFiles();
+		SyncDataDirectory();
 	}
 
 	/*
-	 * If user specified recovery_target_timeline, validate it or compute the
-	 * "latest" value.  We can't do this until after we've gotten the restore
-	 * command and set InArchiveRecovery, because we need to fetch timeline
-	 * history files from the archive.
+	 * Read checkpoint record and backup label and prepare for WAL recovery if needed
+	 *
+	 * - Sets InRecovery is recovery is needed
+	 * - Applies the tablespace map file, if any
+	 * - Updates ControlFile with values from the backup label
+	 * - Sets ArchiveRecoveryRequested
 	 */
-	if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
-	{
-		TimeLineID	rtli = recoveryTargetTLIRequested;
-
-		/* Timeline 1 does not have a history file, all else should */
-		if (rtli != 1 && !existsTimeLineHistory(rtli))
-			ereport(FATAL,
-					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-					 errmsg("recovery target timeline %u does not exist",
-							rtli)));
-		recoveryTargetTLI = rtli;
-	}
-	else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
-	{
-		/* We start the "latest" search from pg_control's timeline */
-		recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
-	}
-	else
-	{
-		/*
-		 * else we just use the recoveryTargetTLI as already read from
-		 * ControlFile
-		 */
-		Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
-	}
-}
+	InitWalRecovery(ControlFile, &wasShutdown,
+					&haveBackupLabel, &haveTblspcMap);
+	checkPoint = ControlFile->checkPointCopy;
 
-/*
- * Exit archive-recovery state
- */
-static void
-exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
-{
-	char		xlogfname[MAXFNAMELEN];
-	XLogSegNo	endLogSegNo;
-	XLogSegNo	startLogSegNo;
+	/*
+	 * Clear out any old relcache cache files.  This is *necessary* if we do
+	 * any WAL replay, since that would probably result in the cache files
+	 * being out of sync with database reality.  In theory we could leave them
+	 * in place if the database had been cleanly shut down, but it seems
+	 * safest to just remove them always and let them be rebuilt during the
+	 * first backend startup.  These files needs to be removed from all
+	 * directories including pg_tblspc, however the symlinks are created only
+	 * after reading tablespace_map file in case of archive recovery from
+	 * backup, so needs to clear old relcache files here after creating
+	 * symlinks.
+	 */
+	RelationCacheInitFileRemove();
 
-	/* we always switch to a new timeline after archive recovery */
-	Assert(endTLI != ThisTimeLineID);
+	/* initialize shared memory variables from the checkpoint record */
+	ShmemVariableCache->nextXid = checkPoint.nextXid;
+	ShmemVariableCache->nextOid = checkPoint.nextOid;
+	ShmemVariableCache->oidCount = 0;
+	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
+	AdvanceOldestClogXid(checkPoint.oldestXid);
+	SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
+	SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
+	SetCommitTsLimit(checkPoint.oldestCommitTsXid,
+					 checkPoint.newestCommitTsXid);
+	XLogCtl->ckptFullXid = checkPoint.nextXid;
 
 	/*
-	 * We are no longer in archive recovery state.
+	 * Initialize replication slots, before there's a chance to remove
+	 * required resources.
 	 */
-	InArchiveRecovery = false;
+	StartupReplicationSlots();
 
 	/*
-	 * Update min recovery point one last time.
+	 * Startup logical state, needs to be setup now so we have proper data
+	 * during crash recovery.
 	 */
-	UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
+	StartupReorderBuffer();
 
 	/*
-	 * If the ending log segment is still open, close it (to avoid problems on
-	 * Windows with trying to rename or delete an open file).
+	 * Startup CLOG. This must be done after ShmemVariableCache->nextXid has
+	 * been initialized and before we accept connections or begin WAL replay.
 	 */
-	if (readFile >= 0)
-	{
-		close(readFile);
-		readFile = -1;
-	}
+	StartupCLOG();
 
 	/*
-	 * Calculate the last segment on the old timeline, and the first segment
-	 * on the new timeline. If the switch happens in the middle of a segment,
-	 * they are the same, but if the switch happens exactly at a segment
-	 * boundary, startLogSegNo will be endLogSegNo + 1.
+	 * Startup MultiXact. We need to do this early to be able to replay
+	 * truncations.
 	 */
-	XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
-	XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);
+	StartupMultiXact();
 
 	/*
-	 * Initialize the starting WAL segment for the new timeline. If the switch
-	 * happens in the middle of a segment, copy data from the last WAL segment
-	 * of the old timeline up to the switch point, to the starting WAL segment
-	 * on the new timeline.
+	 * Ditto for commit timestamps.  Activate the facility if the setting is
+	 * enabled in the control file, as there should be no tracking of commit
+	 * timestamps done when the setting was disabled.  This facility can be
+	 * started or stopped when replaying a XLOG_PARAMETER_CHANGE record.
 	 */
-	if (endLogSegNo == startLogSegNo)
-	{
-		/*
-		 * Make a copy of the file on the new timeline.
-		 *
-		 * Writing WAL isn't allowed yet, so there are no locking
-		 * considerations. But we should be just as tense as XLogFileInit to
-		 * avoid emplacing a bogus file.
-		 */
-		XLogFileCopy(endLogSegNo, endTLI, endLogSegNo,
-					 XLogSegmentOffset(endOfLog, wal_segment_size));
-	}
-	else
-	{
-		/*
-		 * The switch happened at a segment boundary, so just create the next
-		 * segment on the new timeline.
-		 */
-		bool		use_existent = true;
-		int			fd;
-
-		fd = XLogFileInit(startLogSegNo, &use_existent, true);
-
-		if (close(fd) != 0)
-		{
-			char		xlogfname[MAXFNAMELEN];
-			int			save_errno = errno;
-
-			XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo,
-						 wal_segment_size);
-			errno = save_errno;
-			ereport(ERROR,
-					(errcode_for_file_access(),
-					 errmsg("could not close file \"%s\": %m", xlogfname)));
-		}
-	}
+	if (ControlFile->track_commit_timestamp)
+		StartupCommitTs();
 
 	/*
-	 * Let's just make real sure there are not .ready or .done flags posted
-	 * for the new segment.
+	 * Recover knowledge about replay progress of known replication partners.
 	 */
-	XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo, wal_segment_size);
-	XLogArchiveCleanup(xlogfname);
+	StartupReplicationOrigin();
 
 	/*
-	 * Remove the signal files out of the way, so that we don't accidentally
-	 * re-enter archive recovery mode in a subsequent crash.
+	 * Initialize unlogged LSN. On a clean shutdown, it's restored from the
+	 * control file. On recovery, all unlogged relations are blown away, so
+	 * the unlogged LSN counter can be reset too.
 	 */
-	if (standby_signal_file_found)
-		durable_unlink(STANDBY_SIGNAL_FILE, FATAL);
-
-	if (recovery_signal_file_found)
-		durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);
-
-	ereport(LOG,
-			(errmsg("archive recovery complete")));
-}
-
-/*
- * Extract timestamp from WAL record.
- *
- * If the record contains a timestamp, returns true, and saves the timestamp
- * in *recordXtime. If the record type has no timestamp, returns false.
- * Currently, only transaction commit/abort records and restore points contain
- * timestamps.
- */
-static bool
-getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
-{
-	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
-	uint8		xact_info = info & XLOG_XACT_OPMASK;
-	uint8		rmid = XLogRecGetRmid(record);
-
-	if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
-	{
-		*recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
-		return true;
-	}
-	if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
-							   xact_info == XLOG_XACT_COMMIT_PREPARED))
-	{
-		*recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
-		return true;
-	}
-	if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
-							   xact_info == XLOG_XACT_ABORT_PREPARED))
-	{
-		*recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
-		return true;
-	}
-	return false;
-}
-
-/*
- * For point-in-time recovery, this function decides whether we want to
- * stop applying the XLOG before the current record.
- *
- * Returns true if we are stopping, false otherwise. If stopping, some
- * information is saved in recoveryStopXid et al for use in annotating the
- * new timeline's history file.
- */
-static bool
-recoveryStopsBefore(XLogReaderState *record)
-{
-	bool		stopsHere = false;
-	uint8		xact_info;
-	bool		isCommit;
-	TimestampTz recordXtime = 0;
-	TransactionId recordXid;
+	if (ControlFile->state == DB_SHUTDOWNED)
+		XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
+	else
+		XLogCtl->unloggedLSN = FirstNormalUnloggedLSN;
 
 	/*
-	 * Ignore recovery target settings when not in archive recovery (meaning
-	 * we are in crash recovery).
+	 * We must replay WAL entries using the same TimeLineID they were created
+	 * under, so temporarily adopt the TLI indicated by the checkpoint (see
+	 * also xlog_redo()).
 	 */
-	if (!ArchiveRecoveryRequested)
-		return false;
-
-	/* Check if we should stop as soon as reaching consistency */
-	if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
-	{
-		ereport(LOG,
-				(errmsg("recovery stopping after reaching consistency")));
+	ThisTimeLineID = checkPoint.ThisTimeLineID;
 
-		recoveryStopAfter = false;
-		recoveryStopXid = InvalidTransactionId;
-		recoveryStopLSN = InvalidXLogRecPtr;
-		recoveryStopTime = 0;
-		recoveryStopName[0] = '\0';
-		return true;
-	}
+	/*
+	 * Copy any missing timeline history files between 'now' and the recovery
+	 * target timeline from archive to pg_wal. While we don't need those files
+	 * ourselves - the history file of the recovery target timeline covers all
+	 * the previous timelines in the history too - a cascading standby server
+	 * might be interested in them. Or, if you archive the WAL from this
+	 * server to a different archive than the primary, it'd be good for all
+	 * the history files to get archived there after failover, so that you can
+	 * use one of the old timelines as a PITR target. Timeline history files
+	 * are small, so it's better to copy them unnecessarily than not copy them
+	 * and regret later.
+	 */
+	restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
 
-	/* Check if target LSN has been reached */
-	if (recoveryTarget == RECOVERY_TARGET_LSN &&
-		!recoveryTargetInclusive &&
-		record->ReadRecPtr >= recoveryTargetLSN)
-	{
-		recoveryStopAfter = false;
-		recoveryStopXid = InvalidTransactionId;
-		recoveryStopLSN = record->ReadRecPtr;
-		recoveryStopTime = 0;
-		recoveryStopName[0] = '\0';
-		ereport(LOG,
-				(errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
-						LSN_FORMAT_ARGS(recoveryStopLSN))));
-		return true;
-	}
+	/*
+	 * Before running in recovery, scan pg_twophase and fill in its status to
+	 * be able to work on entries generated by redo.  Doing a scan before
+	 * taking any recovery action has the merit to discard any 2PC files that
+	 * are newer than the first record to replay, saving from any conflicts at
+	 * replay.  This avoids as well any subsequent scans when doing recovery
+	 * of the on-disk two-phase data.
+	 */
+	restoreTwoPhaseData();
 
-	/* Otherwise we only consider stopping before COMMIT or ABORT records. */
-	if (XLogRecGetRmid(record) != RM_XACT_ID)
-		return false;
+	lastFullPageWrites = checkPoint.fullPageWrites;
 
-	xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
+	RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
+	doPageWrites = lastFullPageWrites;
 
-	if (xact_info == XLOG_XACT_COMMIT)
-	{
-		isCommit = true;
-		recordXid = XLogRecGetXid(record);
-	}
-	else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
-	{
-		xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
-		xl_xact_parsed_commit parsed;
-
-		isCommit = true;
-		ParseCommitRecord(XLogRecGetInfo(record),
-						  xlrec,
-						  &parsed);
-		recordXid = parsed.twophase_xid;
-	}
-	else if (xact_info == XLOG_XACT_ABORT)
-	{
-		isCommit = false;
-		recordXid = XLogRecGetXid(record);
-	}
-	else if (xact_info == XLOG_XACT_ABORT_PREPARED)
-	{
-		xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
-		xl_xact_parsed_abort parsed;
-
-		isCommit = true;
-		ParseAbortRecord(XLogRecGetInfo(record),
-						 xlrec,
-						 &parsed);
-		recordXid = parsed.twophase_xid;
-	}
-	else
-		return false;
-
-	if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
-	{
-		/*
-		 * There can be only one transaction end record with this exact
-		 * transactionid
-		 *
-		 * when testing for an xid, we MUST test for equality only, since
-		 * transactions are numbered in the order they start, not the order
-		 * they complete. A higher numbered xid will complete before you about
-		 * 50% of the time...
-		 */
-		stopsHere = (recordXid == recoveryTargetXid);
-	}
-
-	if (recoveryTarget == RECOVERY_TARGET_TIME &&
-		getRecordTimestamp(record, &recordXtime))
+	/* REDO */
+	if (InRecovery)
 	{
 		/*
-		 * There can be many transactions that share the same commit time, so
-		 * we stop after the last one, if we are inclusive, or stop at the
-		 * first one if we are exclusive
+		 * Update the control file to indicate that we are in recovery. It's updated
+		 * with the values on disk we read from the backup label.
 		 */
-		if (recoveryTargetInclusive)
-			stopsHere = (recordXtime > recoveryTargetTime);
+		SpinLockAcquire(&XLogCtl->info_lck);
+		if (InArchiveRecovery)
+			XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
 		else
-			stopsHere = (recordXtime >= recoveryTargetTime);
-	}
+			XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
+		SpinLockRelease(&XLogCtl->info_lck);
 
-	if (stopsHere)
-	{
-		recoveryStopAfter = false;
-		recoveryStopXid = recordXid;
-		recoveryStopTime = recordXtime;
-		recoveryStopLSN = InvalidXLogRecPtr;
-		recoveryStopName[0] = '\0';
+		UpdateControlFile();
 
-		if (isCommit)
-		{
-			ereport(LOG,
-					(errmsg("recovery stopping before commit of transaction %u, time %s",
-							recoveryStopXid,
-							timestamptz_to_str(recoveryStopTime))));
-		}
-		else
+		/*
+		 * If there was a backup label file, it's done its job and the info
+		 * has now been propagated into pg_control.  We must get rid of the
+		 * label file so that if we crash during recovery, we'll pick up at
+		 * the latest recovery restartpoint instead of going all the way back
+		 * to the backup start point.  It seems prudent though to just rename
+		 * the file out of the way rather than delete it completely.
+		 */
+		if (haveBackupLabel)
 		{
-			ereport(LOG,
-					(errmsg("recovery stopping before abort of transaction %u, time %s",
-							recoveryStopXid,
-							timestamptz_to_str(recoveryStopTime))));
+			unlink(BACKUP_LABEL_OLD);
+			durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
 		}
-	}
-
-	return stopsHere;
-}
-
-/*
- * Same as recoveryStopsBefore, but called after applying the record.
- *
- * We also track the timestamp of the latest applied COMMIT/ABORT
- * record in XLogCtl->recoveryLastXTime.
- */
-static bool
-recoveryStopsAfter(XLogReaderState *record)
-{
-	uint8		info;
-	uint8		xact_info;
-	uint8		rmid;
-	TimestampTz recordXtime;
-
-	/*
-	 * Ignore recovery target settings when not in archive recovery (meaning
-	 * we are in crash recovery).
-	 */
-	if (!ArchiveRecoveryRequested)
-		return false;
-
-	info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
-	rmid = XLogRecGetRmid(record);
 
-	/*
-	 * There can be many restore points that share the same name; we stop at
-	 * the first one.
-	 */
-	if (recoveryTarget == RECOVERY_TARGET_NAME &&
-		rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
-	{
-		xl_restore_point *recordRestorePointData;
-
-		recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
-
-		if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
+		/*
+		 * If there was a tablespace_map file, it's done its job and the
+		 * symlinks have been created.  We must get rid of the map file so
+		 * that if we crash during recovery, we don't create symlinks again.
+		 * It seems prudent though to just rename the file out of the way
+		 * rather than delete it completely.
+		 */
+		if (haveTblspcMap)
 		{
-			recoveryStopAfter = true;
-			recoveryStopXid = InvalidTransactionId;
-			recoveryStopLSN = InvalidXLogRecPtr;
-			(void) getRecordTimestamp(record, &recoveryStopTime);
-			strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
-
-			ereport(LOG,
-					(errmsg("recovery stopping at restore point \"%s\", time %s",
-							recoveryStopName,
-							timestamptz_to_str(recoveryStopTime))));
-			return true;
+			unlink(TABLESPACE_MAP_OLD);
+			durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
 		}
-	}
-
-	/* Check if the target LSN has been reached */
-	if (recoveryTarget == RECOVERY_TARGET_LSN &&
-		recoveryTargetInclusive &&
-		record->ReadRecPtr >= recoveryTargetLSN)
-	{
-		recoveryStopAfter = true;
-		recoveryStopXid = InvalidTransactionId;
-		recoveryStopLSN = record->ReadRecPtr;
-		recoveryStopTime = 0;
-		recoveryStopName[0] = '\0';
-		ereport(LOG,
-				(errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
-						LSN_FORMAT_ARGS(recoveryStopLSN))));
-		return true;
-	}
 
-	if (rmid != RM_XACT_ID)
-		return false;
-
-	xact_info = info & XLOG_XACT_OPMASK;
-
-	if (xact_info == XLOG_XACT_COMMIT ||
-		xact_info == XLOG_XACT_COMMIT_PREPARED ||
-		xact_info == XLOG_XACT_ABORT ||
-		xact_info == XLOG_XACT_ABORT_PREPARED)
-	{
-		TransactionId recordXid;
-
-		/* Update the last applied transaction timestamp */
-		if (getRecordTimestamp(record, &recordXtime))
-			SetLatestXTime(recordXtime);
-
-		/* Extract the XID of the committed/aborted transaction */
-		if (xact_info == XLOG_XACT_COMMIT_PREPARED)
+		/*
+		 * Initialize our local copy of minRecoveryPoint.  When doing crash
+		 * recovery we want to replay up to the end of WAL.  Particularly, in
+		 * the case of a promoted standby minRecoveryPoint value in the
+		 * control file is only updated after the first checkpoint.  However,
+		 * if the instance crashes before the first post-recovery checkpoint
+		 * is completed then recovery will use a stale location causing the
+		 * startup process to think that there are still invalid page
+		 * references when checking for data consistency.
+		 */
+		if (InArchiveRecovery)
 		{
-			xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
-			xl_xact_parsed_commit parsed;
-
-			ParseCommitRecord(XLogRecGetInfo(record),
-							  xlrec,
-							  &parsed);
-			recordXid = parsed.twophase_xid;
+			LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
+			LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
 		}
-		else if (xact_info == XLOG_XACT_ABORT_PREPARED)
+		else
 		{
-			xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
-			xl_xact_parsed_abort parsed;
-
-			ParseAbortRecord(XLogRecGetInfo(record),
-							 xlrec,
-							 &parsed);
-			recordXid = parsed.twophase_xid;
+			LocalMinRecoveryPoint = InvalidXLogRecPtr;
+			LocalMinRecoveryPointTLI = 0;
 		}
-		else
-			recordXid = XLogRecGetXid(record);
 
 		/*
-		 * There can be only one transaction end record with this exact
-		 * transactionid
-		 *
-		 * when testing for an xid, we MUST test for equality only, since
-		 * transactions are numbered in the order they start, not the order
-		 * they complete. A higher numbered xid will complete before you about
-		 * 50% of the time...
+		 * Reset pgstat data, because it may be invalid after recovery.
 		 */
-		if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
-			recordXid == recoveryTargetXid)
-		{
-			recoveryStopAfter = true;
-			recoveryStopXid = recordXid;
-			recoveryStopTime = recordXtime;
-			recoveryStopLSN = InvalidXLogRecPtr;
-			recoveryStopName[0] = '\0';
-
-			if (xact_info == XLOG_XACT_COMMIT ||
-				xact_info == XLOG_XACT_COMMIT_PREPARED)
-			{
-				ereport(LOG,
-						(errmsg("recovery stopping after commit of transaction %u, time %s",
-								recoveryStopXid,
-								timestamptz_to_str(recoveryStopTime))));
-			}
-			else if (xact_info == XLOG_XACT_ABORT ||
-					 xact_info == XLOG_XACT_ABORT_PREPARED)
-			{
-				ereport(LOG,
-						(errmsg("recovery stopping after abort of transaction %u, time %s",
-								recoveryStopXid,
-								timestamptz_to_str(recoveryStopTime))));
-			}
-			return true;
-		}
-	}
-
-	/* Check if we should stop as soon as reaching consistency */
-	if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
-	{
-		ereport(LOG,
-				(errmsg("recovery stopping after reaching consistency")));
-
-		recoveryStopAfter = true;
-		recoveryStopXid = InvalidTransactionId;
-		recoveryStopTime = 0;
-		recoveryStopLSN = InvalidXLogRecPtr;
-		recoveryStopName[0] = '\0';
-		return true;
-	}
-
-	return false;
-}
-
-/*
- * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
- *
- * endOfRecovery is true if the recovery target is reached and
- * the paused state starts at the end of recovery because of
- * recovery_target_action=pause, and false otherwise.
- */
-static void
-recoveryPausesHere(bool endOfRecovery)
-{
-	/* Don't pause unless users can connect! */
-	if (!LocalHotStandbyActive)
-		return;
-
-	/* Don't pause after standby promotion has been triggered */
-	if (LocalPromoteIsTriggered)
-		return;
-
-	if (endOfRecovery)
-		ereport(LOG,
-				(errmsg("pausing at the end of recovery"),
-				 errhint("Execute pg_wal_replay_resume() to promote.")));
-	else
-		ereport(LOG,
-				(errmsg("recovery has paused"),
-				 errhint("Execute pg_wal_replay_resume() to continue.")));
+		pgstat_reset_all();
 
-	/* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
-	while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
-	{
-		HandleStartupProcInterrupts();
-		if (CheckForStandbyTrigger())
-			return;
+		/* Check that the GUCs used to generate the WAL allow recovery */
+		CheckRequiredParameterValues();
 
 		/*
-		 * If recovery pause is requested then set it paused.  While we are in
-		 * the loop, user might resume and pause again so set this every time.
+		 * We're in recovery, so unlogged relations may be trashed and must be
+		 * reset.  This should be done BEFORE allowing Hot Standby
+		 * connections, so that read-only backends don't try to read whatever
+		 * garbage is left over from before.
 		 */
-		ConfirmRecoveryPaused();
+		ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
 
 		/*
-		 * We wait on a condition variable that will wake us as soon as the
-		 * pause ends, but we use a timeout so we can check the above exit
-		 * condition periodically too.
+		 * Likewise, delete any saved transaction snapshot files that got left
+		 * behind by crashed backends.
 		 */
-		ConditionVariableTimedSleep(&XLogCtl->recoveryNotPausedCV, 1000,
-									WAIT_EVENT_RECOVERY_PAUSE);
-	}
-	ConditionVariableCancelSleep();
-}
-
-/*
- * Get the current state of the recovery pause request.
- */
-RecoveryPauseState
-GetRecoveryPauseState(void)
-{
-	RecoveryPauseState state;
+		DeleteAllExportedSnapshotFiles();
 
-	SpinLockAcquire(&XLogCtl->info_lck);
-	state = XLogCtl->recoveryPauseState;
-	SpinLockRelease(&XLogCtl->info_lck);
+		/*
+		 * Initialize for Hot Standby, if enabled. We won't let backends in
+		 * yet, not until we've reached the min recovery point specified in
+		 * control file and we've established a recovery snapshot from a
+		 * running-xacts WAL record.
+		 */
+		if (ArchiveRecoveryRequested && EnableHotStandby)
+		{
+			TransactionId *xids;
+			int			nxids;
 
-	return state;
-}
+			ereport(DEBUG1,
+					(errmsg_internal("initializing for hot standby")));
 
-/*
- * Set the recovery pause state.
- *
- * If recovery pause is requested then sets the recovery pause state to
- * 'pause requested' if it is not already 'paused'.  Otherwise, sets it
- * to 'not paused' to resume the recovery.  The recovery pause will be
- * confirmed by the ConfirmRecoveryPaused.
- */
-void
-SetRecoveryPause(bool recoveryPause)
-{
-	SpinLockAcquire(&XLogCtl->info_lck);
+			InitRecoveryTransactionEnvironment();
 
-	if (!recoveryPause)
-		XLogCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
-	else if (XLogCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
-		XLogCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
+			if (wasShutdown)
+				oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
+			else
+				oldestActiveXID = checkPoint.oldestActiveXid;
+			Assert(TransactionIdIsValid(oldestActiveXID));
 
-	SpinLockRelease(&XLogCtl->info_lck);
+			/* Tell procarray about the range of xids it has to deal with */
+			ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextXid));
 
-	if (!recoveryPause)
-		ConditionVariableBroadcast(&XLogCtl->recoveryNotPausedCV);
-}
+			/*
+			 * Startup subtrans only.  CLOG, MultiXact and commit timestamp
+			 * have already been started up and other SLRUs are not maintained
+			 * during recovery and need not be started yet.
+			 */
+			StartupSUBTRANS(oldestActiveXID);
 
-/*
- * Confirm the recovery pause by setting the recovery pause state to
- * RECOVERY_PAUSED.
- */
-static void
-ConfirmRecoveryPaused(void)
-{
-	/* If recovery pause is requested then set it paused */
-	SpinLockAcquire(&XLogCtl->info_lck);
-	if (XLogCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED)
-		XLogCtl->recoveryPauseState = RECOVERY_PAUSED;
-	SpinLockRelease(&XLogCtl->info_lck);
-}
+			/*
+			 * If we're beginning at a shutdown checkpoint, we know that
+			 * nothing was running on the primary at this point. So fake-up an
+			 * empty running-xacts record and use that here and now. Recover
+			 * additional standby state for prepared transactions.
+			 */
+			if (wasShutdown)
+			{
+				RunningTransactionsData running;
+				TransactionId latestCompletedXid;
 
-/*
- * When recovery_min_apply_delay is set, we wait long enough to make sure
- * certain record types are applied at least that interval behind the primary.
- *
- * Returns true if we waited.
- *
- * Note that the delay is calculated between the WAL record log time and
- * the current time on standby. We would prefer to keep track of when this
- * standby received each WAL record, which would allow a more consistent
- * approach and one not affected by time synchronisation issues, but that
- * is significantly more effort and complexity for little actual gain in
- * usability.
- */
-static bool
-recoveryApplyDelay(XLogReaderState *record)
-{
-	uint8		xact_info;
-	TimestampTz xtime;
-	TimestampTz delayUntil;
-	long		msecs;
+				/*
+				 * Construct a RunningTransactions snapshot representing a
+				 * shut down server, with only prepared transactions still
+				 * alive. We're never overflowed at this point because all
+				 * subxids are listed with their parent prepared transactions.
+				 */
+				running.xcnt = nxids;
+				running.subxcnt = 0;
+				running.subxid_overflow = false;
+				running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
+				running.oldestRunningXid = oldestActiveXID;
+				latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
+				TransactionIdRetreat(latestCompletedXid);
+				Assert(TransactionIdIsNormal(latestCompletedXid));
+				running.latestCompletedXid = latestCompletedXid;
+				running.xids = xids;
 
-	/* nothing to do if no delay configured */
-	if (recovery_min_apply_delay <= 0)
-		return false;
+				ProcArrayApplyRecoveryInfo(&running);
 
-	/* no delay is applied on a database not yet consistent */
-	if (!reachedConsistency)
-		return false;
+				StandbyRecoverPreparedTransactions();
+			}
+		}
 
-	/* nothing to do if crash recovery is requested */
-	if (!ArchiveRecoveryRequested)
-		return false;
+		PerformWalRecovery();
+	}
 
 	/*
-	 * Is it a COMMIT record?
-	 *
-	 * We deliberately choose not to delay aborts since they have no effect on
-	 * MVCC. We already allow replay of records that don't have a timestamp,
-	 * so there is already opportunity for issues caused by early conflicts on
-	 * standbys.
+	 * End WAL recovery.
 	 */
-	if (XLogRecGetRmid(record) != RM_XACT_ID)
-		return false;
-
-	xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
-
-	if (xact_info != XLOG_XACT_COMMIT &&
-		xact_info != XLOG_XACT_COMMIT_PREPARED)
-		return false;
-
-	if (!getRecordTimestamp(record, &xtime))
-		return false;
-
-	delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
+	EndWalRecovery(&LastRec, &EndOfLog, &EndOfLogTLI, &lastPageBeginPtr, &lastPage, &reason,
+				   &bgwriterLaunched,
+				   &standby_signal_file_found,
+				   &recovery_signal_file_found);
 
 	/*
-	 * Exit without arming the latch if it's already past time to apply this
-	 * record
+	 * Update min recovery point one last time.
 	 */
-	msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
-	if (msecs <= 0)
-		return false;
-
-	while (true)
-	{
-		ResetLatch(&XLogCtl->recoveryWakeupLatch);
-
-		/* might change the trigger file's location */
-		HandleStartupProcInterrupts();
-
-		if (CheckForStandbyTrigger())
-			break;
-
-		/*
-		 * Wait for difference between GetCurrentTimestamp() and delayUntil
-		 */
-		msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
-												delayUntil);
+	UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
 
-		if (msecs <= 0)
-			break;
+	/*
+	 * Complain if we did not roll forward far enough to render the backup
+	 * dump consistent.  Note: it is indeed okay to look at the local variable
+	 * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
+	 * be further ahead --- ControlFile->minRecoveryPoint cannot have been
+	 * advanced beyond the WAL we processed.
+	 */
+	if (InRecovery &&
+		(EndOfLog < LocalMinRecoveryPoint ||
+		 !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
+	{
+		/*
+		 * Ran off end of WAL before reaching end-of-backup WAL record, or
+		 * LocalMinRecoveryPoint. That's usually a bad sign, indicating that you
+		 * tried to recover from an online backup but never called
+		 * pg_stop_backup(), or you didn't archive all the WAL up to that
+		 * point. However, this also happens in crash recovery, if the system
+		 * crashes while an online backup is in progress. We must not treat
+		 * that as an error, or the database will refuse to start up.
+		 */
+		if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
+		{
+			if (ControlFile->backupEndRequired)
+				ereport(FATAL,
+						(errmsg("WAL ends before end of online backup"),
+						 errhint("All WAL generated while online backup was taken must be available at recovery.")));
+			else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
+				ereport(FATAL,
+						(errmsg("WAL ends before end of online backup"),
+						 errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
+			else
+				ereport(FATAL,
+						(errmsg("WAL ends before consistent recovery point")));
+		}
+	}
 
-		elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
+	/*
+	 * Reset unlogged relations to the contents of their INIT fork. This is
+	 * done AFTER recovery is complete so as to include any unlogged relations
+	 * created during recovery, but BEFORE recovery is marked as having
+	 * completed successfully. Otherwise we'd not retry if any of the post
+	 * end-of-recovery steps fail.
+	 */
+	if (InRecovery)
+		ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
 
-		(void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
-						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-						 msecs,
-						 WAIT_EVENT_RECOVERY_APPLY_DELAY);
-	}
-	return true;
-}
+	/*
+	 * Pre-scan prepared transactions to find out the range of XIDs present.
+	 * This information is not quite needed yet, but it is positioned here so
+	 * as potential problems are detected before any on-disk change is done.
+	 */
+	oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
 
-/*
- * Save timestamp of latest processed commit/abort record.
- *
- * We keep this in XLogCtl, not a simple static variable, so that it can be
- * seen by processes other than the startup process.  Note in particular
- * that CreateRestartPoint is executed in the checkpointer.
- */
-static void
-SetLatestXTime(TimestampTz xtime)
-{
-	SpinLockAcquire(&XLogCtl->info_lck);
-	XLogCtl->recoveryLastXTime = xtime;
-	SpinLockRelease(&XLogCtl->info_lck);
-}
+	/*
+	 * Consider whether we need to assign a new timeline ID.
+	 *
+	 * If we are doing an archive recovery, we always assign a new ID.  This
+	 * handles a couple of issues.  If we stopped short of the end of WAL
+	 * during recovery, then we are clearly generating a new timeline and must
+	 * assign it a unique new ID.  Even if we ran to the end, modifying the
+	 * current last segment is problematic because it may result in trying to
+	 * overwrite an already-archived copy of that segment, and we encourage
+	 * DBAs to make their archive_commands reject that.  We can dodge the
+	 * problem by making the new active segment have a new timeline ID.
+	 *
+	 * In a normal crash recovery, we can just extend the timeline we were in.
+	 */
+	PrevTimeLineID = ThisTimeLineID;
+	if (ArchiveRecoveryRequested)
+	{
+		char		xlogfname[MAXFNAMELEN];
+		XLogSegNo	endLogSegNo;
+		XLogSegNo	startLogSegNo;
 
-/*
- * Fetch timestamp of latest processed commit/abort record.
- */
-TimestampTz
-GetLatestXTime(void)
-{
-	TimestampTz xtime;
+		ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
+		ereport(LOG,
+				(errmsg("selected new timeline ID: %u", ThisTimeLineID)));
+		/* we always switch to a new timeline after archive recovery */
+		Assert(EndOfLogTLI != ThisTimeLineID);
 
-	SpinLockAcquire(&XLogCtl->info_lck);
-	xtime = XLogCtl->recoveryLastXTime;
-	SpinLockRelease(&XLogCtl->info_lck);
+		/*
+		 * Calculate the last segment on the old timeline, and the first segment
+		 * on the new timeline. If the switch happens in the middle of a segment,
+		 * they are the same, but if the switch happens exactly at a segment
+		 * boundary, startLogSegNo will be endLogSegNo + 1.
+		 */
+		XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
+		XLByteToSeg(EndOfLog, startLogSegNo, wal_segment_size);
 
-	return xtime;
-}
+		/*
+		 * Initialize the starting WAL segment for the new timeline. If the switch
+		 * happens in the middle of a segment, copy data from the last WAL segment
+		 * of the old timeline up to the switch point, to the starting WAL segment
+		 * on the new timeline.
+		 */
+		if (endLogSegNo == startLogSegNo)
+		{
+			/*
+			 * Make a copy of the file on the new timeline.
+			 *
+			 * Writing WAL isn't allowed yet, so there are no locking
+			 * considerations. But we should be just as tense as XLogFileInit to
+			 * avoid emplacing a bogus file.
+			 */
+			XLogFileCopy(endLogSegNo, EndOfLogTLI, endLogSegNo,
+						 XLogSegmentOffset(EndOfLog, wal_segment_size));
+		}
+		else
+		{
+			/*
+			 * The switch happened at a segment boundary, so just create the next
+			 * segment on the new timeline.
+			 */
+			bool		use_existent = true;
+			int			fd;
 
-/*
- * Save timestamp of the next chunk of WAL records to apply.
- *
- * We keep this in XLogCtl, not a simple static variable, so that it can be
- * seen by all backends.
- */
-static void
-SetCurrentChunkStartTime(TimestampTz xtime)
-{
-	SpinLockAcquire(&XLogCtl->info_lck);
-	XLogCtl->currentChunkStartTime = xtime;
-	SpinLockRelease(&XLogCtl->info_lck);
-}
+			fd = XLogFileInit(startLogSegNo, &use_existent, true);
 
-/*
- * Fetch timestamp of latest processed commit/abort record.
- * Startup process maintains an accurate local copy in XLogReceiptTime
- */
-TimestampTz
-GetCurrentChunkReplayStartTime(void)
-{
-	TimestampTz xtime;
+			if (close(fd) != 0)
+			{
+				char		xlogfname[MAXFNAMELEN];
+				int			save_errno = errno;
 
-	SpinLockAcquire(&XLogCtl->info_lck);
-	xtime = XLogCtl->currentChunkStartTime;
-	SpinLockRelease(&XLogCtl->info_lck);
+				XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo,
+							 wal_segment_size);
+				errno = save_errno;
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not close file \"%s\": %m", xlogfname)));
+			}
+		}
 
-	return xtime;
-}
+		/*
+		 * Let's just make real sure there are not .ready or .done flags posted
+		 * for the new segment.
+		 */
+		XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo, wal_segment_size);
+		XLogArchiveCleanup(xlogfname);
+
+		/*
+		 * Remove the signal files out of the way, so that we don't accidentally
+		 * re-enter archive recovery mode in a subsequent crash.
+		 */
+		if (standby_signal_file_found)
+			durable_unlink(STANDBY_SIGNAL_FILE, FATAL);
+
+		if (recovery_signal_file_found)
+			durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);
+
+		ereport(LOG,
+				(errmsg("archive recovery complete")));
+
+		/*
+		 * Write the timeline history file, and have it archived. After this
+		 * point (or rather, as soon as the file is archived), the timeline
+		 * will appear as "taken" in the WAL archive and to any standby
+		 * servers.  If we crash before actually switching to the new
+		 * timeline, standby servers will nevertheless think that we switched
+		 * to the new timeline, and will try to connect to the new timeline.
+		 * To minimize the window for that, try to do as little as possible
+		 * between here and writing the end-of-recovery record.
+		 */
+		writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
+							 EndOfLog, reason);
+	}
+
+	/* Save the selected TimeLineID in shared memory, too */
+	XLogCtl->ThisTimeLineID = ThisTimeLineID;
+	XLogCtl->PrevTimeLineID = PrevTimeLineID;
 
-/*
- * Returns time of receipt of current chunk of XLOG data, as well as
- * whether it was received from streaming replication or from archives.
- */
-void
-GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
-{
 	/*
-	 * This must be executed in the startup process, since we don't export the
-	 * relevant state to shared memory.
+	 * Prepare to write WAL starting at EndOfLog location, and init xlog
+	 * buffer cache using the block containing the last record from the
+	 * previous incarnation.
 	 */
-	Assert(InRecovery);
+	Insert = &XLogCtl->Insert;
+	Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
+	Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
 
-	*rtime = XLogReceiptTime;
-	*fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
-}
+	/*
+	 * Tricky point here: lastPage contains the *last* block that the LastRec
+	 * record spans, not the one it starts in.  The last block is indeed the
+	 * one we want to use.
+	 */
+	if (EndOfLog % XLOG_BLCKSZ != 0)
+	{
+		char	   *page;
+		int			len;
+		int			firstIdx;
 
-/*
- * Note that text field supplied is a parameter name and does not require
- * translation
- */
-static void
-RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
-{
-	if (currValue < minValue)
+		firstIdx = XLogRecPtrToBufIdx(EndOfLog);
+		len = EndOfLog - lastPageBeginPtr;
+		Assert(len < XLOG_BLCKSZ);
+
+		/* Copy the valid part of the last block, and zero the rest */
+		page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
+		memcpy(page, lastPage, XLOG_BLCKSZ);
+		memset(page + len, 0, XLOG_BLCKSZ - len);
+
+		XLogCtl->xlblocks[firstIdx] = lastPageBeginPtr + XLOG_BLCKSZ;
+		XLogCtl->InitializedUpTo = lastPageBeginPtr + XLOG_BLCKSZ;
+	}
+	else
 	{
-		if (LocalHotStandbyActive)
-		{
-			bool		warned_for_promote = false;
+		/*
+		 * There is no partial block to copy. Just set InitializedUpTo, and
+		 * let the first attempt to insert a log record to initialize the next
+		 * buffer.
+		 */
+		Assert(lastPageBeginPtr == EndOfLog);
+		XLogCtl->InitializedUpTo = EndOfLog;
+	}
 
-			ereport(WARNING,
-					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-					 errmsg("hot standby is not possible because of insufficient parameter settings"),
-					 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
-							   param_name,
-							   currValue,
-							   minValue)));
+	LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
+
+	XLogCtl->LogwrtResult = LogwrtResult;
 
-			SetRecoveryPause(true);
+	XLogCtl->LogwrtRqst.Write = EndOfLog;
+	XLogCtl->LogwrtRqst.Flush = EndOfLog;
 
-			ereport(LOG,
-					(errmsg("recovery has paused"),
-					 errdetail("If recovery is unpaused, the server will shut down."),
-					 errhint("You can then restart the server after making the necessary configuration changes.")));
+	/*
+	 * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
+	 * record before resource manager writes cleanup WAL records or checkpoint
+	 * record is written.
+	 */
+	Insert->fullPageWrites = lastFullPageWrites;
+	LocalSetXLogInsertAllowed();
+	UpdateFullPageWrites();
+	LocalXLogInsertAllowed = -1;
 
-			while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
+	if (InRecovery)
+	{
+		/*
+		 * Perform a checkpoint to update all our recovery activity to disk.
+		 *
+		 * Note that we write a shutdown checkpoint rather than an on-line
+		 * one. This is not particularly critical, but since we may be
+		 * assigning a new TLI, using a shutdown checkpoint allows us to have
+		 * the rule that TLI only changes in shutdown checkpoints, which
+		 * allows some extra error checking in xlog_redo.
+		 *
+		 * In promotion, only create a lightweight end-of-recovery record
+		 * instead of a full checkpoint. A checkpoint is requested later,
+		 * after we're fully out of recovery mode and already accepting
+		 * queries.
+		 */
+		if (bgwriterLaunched)
+		{
+			if (PromoteIsTriggered())
 			{
-				HandleStartupProcInterrupts();
+				XLogRecPtr	checkPointLoc;
+				XLogRecord *record;
 
-				if (CheckForStandbyTrigger())
-				{
-					if (!warned_for_promote)
-						ereport(WARNING,
-								(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-								 errmsg("promotion is not possible because of insufficient parameter settings"),
-
-						/*
-						 * Repeat the detail from above so it's easy to find
-						 * in the log.
-						 */
-								 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
-										   param_name,
-										   currValue,
-										   minValue),
-								 errhint("Restart the server after making the necessary configuration changes.")));
-					warned_for_promote = true;
-				}
+				checkPointLoc = ControlFile->checkPoint;
 
 				/*
-				 * If recovery pause is requested then set it paused.  While
-				 * we are in the loop, user might resume and pause again so
-				 * set this every time.
+				 * Confirm the last checkpoint is available for us to recover
+				 * from if we fail.
 				 */
-				ConfirmRecoveryPaused();
+				record = ReadCheckpointRecord(checkPointLoc, 1, false);
+				if (record != NULL)
+				{
+					promoted = true;
 
-				/*
-				 * We wait on a condition variable that will wake us as soon
-				 * as the pause ends, but we use a timeout so we can check the
-				 * above conditions periodically too.
-				 */
-				ConditionVariableTimedSleep(&XLogCtl->recoveryNotPausedCV, 1000,
-											WAIT_EVENT_RECOVERY_PAUSE);
+					/*
+					 * Insert a special WAL record to mark the end of
+					 * recovery, since we aren't doing a checkpoint. That
+					 * means that the checkpointer process may likely be in
+					 * the middle of a time-smoothed restartpoint and could
+					 * continue to be for minutes after this. That sounds
+					 * strange, but the effect is roughly the same and it
+					 * would be stranger to try to come out of the
+					 * restartpoint and then checkpoint. We request a
+					 * checkpoint later anyway, just for safety.
+					 */
+					CreateEndOfRecoveryRecord();
+				}
 			}
-			ConditionVariableCancelSleep();
-		}
 
-		ereport(FATAL,
-				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-				 errmsg("recovery aborted because of insufficient parameter settings"),
-		/* Repeat the detail from above so it's easy to find in the log. */
-				 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
-						   param_name,
-						   currValue,
-						   minValue),
-				 errhint("You can restart the server after making the necessary configuration changes.")));
-	}
-}
-
-/*
- * Check to see if required parameters are set high enough on this server
- * for various aspects of recovery operation.
- *
- * Note that all the parameters which this function tests need to be
- * listed in Administrator's Overview section in high-availability.sgml.
- * If you change them, don't forget to update the list.
- */
-static void
-CheckRequiredParameterValues(void)
-{
-	/*
-	 * For archive recovery, the WAL must be generated with at least 'replica'
-	 * wal_level.
-	 */
-	if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
-	{
-		ereport(FATAL,
-				(errmsg("WAL was generated with wal_level=minimal, cannot continue recovering"),
-				 errdetail("This happens if you temporarily set wal_level=minimal on the server."),
-				 errhint("Use a backup taken after setting wal_level to higher than minimal.")));
+			if (!promoted)
+				RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
+								  CHECKPOINT_IMMEDIATE |
+								  CHECKPOINT_WAIT);
+		}
+		else
+			CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
 	}
 
-	/*
-	 * For Hot Standby, the WAL must be generated with 'replica' mode, and we
-	 * must have at least as many backend slots as the primary.
-	 */
-	if (ArchiveRecoveryRequested && EnableHotStandby)
+	if (ArchiveRecoveryRequested)
 	{
-		/* We ignore autovacuum_max_workers when we make this test. */
-		RecoveryRequiresIntParameter("max_connections",
-									 MaxConnections,
-									 ControlFile->MaxConnections);
-		RecoveryRequiresIntParameter("max_worker_processes",
-									 max_worker_processes,
-									 ControlFile->max_worker_processes);
-		RecoveryRequiresIntParameter("max_wal_senders",
-									 max_wal_senders,
-									 ControlFile->max_wal_senders);
-		RecoveryRequiresIntParameter("max_prepared_transactions",
-									 max_prepared_xacts,
-									 ControlFile->max_prepared_xacts);
-		RecoveryRequiresIntParameter("max_locks_per_transaction",
-									 max_locks_per_xact,
-									 ControlFile->max_locks_per_xact);
-	}
-}
-
-/*
- * This must be called ONCE during postmaster or standalone-backend startup
- */
-void
-StartupXLOG(void)
-{
-	XLogCtlInsert *Insert;
-	CheckPoint	checkPoint;
-	bool		wasShutdown;
-	bool		reachedRecoveryTarget = false;
-	bool		haveBackupLabel = false;
-	bool		haveTblspcMap = false;
-	XLogRecPtr	RecPtr,
-				checkPointLoc,
-				EndOfLog;
-	TimeLineID	EndOfLogTLI;
-	TimeLineID	PrevTimeLineID;
-	XLogRecord *record;
-	TransactionId oldestActiveXID;
-	bool		backupEndRequired = false;
-	bool		backupFromStandby = false;
-	DBState		dbstate_at_startup;
-	XLogReaderState *xlogreader;
-	XLogPageReadPrivate private;
-	bool		promoted = false;
-	struct stat st;
+		/*
+		 * And finally, execute the recovery_end_command, if any.
+		 */
+		if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
+			ExecuteRecoveryCommand(recoveryEndCommand,
+								   "recovery_end_command",
+								   true);
 
-	/*
-	 * We should have an aux process resource owner to use, and we should not
-	 * be in a transaction that's installed some other resowner.
-	 */
-	Assert(AuxProcessResourceOwner != NULL);
-	Assert(CurrentResourceOwner == NULL ||
-		   CurrentResourceOwner == AuxProcessResourceOwner);
-	CurrentResourceOwner = AuxProcessResourceOwner;
+		/*
+		 * We switched to a new timeline. Clean up segments on the old
+		 * timeline.
+		 *
+		 * If there are any higher-numbered segments on the old timeline,
+		 * remove them. They might contain valid WAL, but they might also be
+		 * pre-allocated files containing garbage. In any case, they are not
+		 * part of the new timeline's history so we don't need them.
+		 */
+		RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID);
 
-	/*
-	 * Check that contents look valid.
-	 */
-	if (!XRecOffIsValid(ControlFile->checkPoint))
-		ereport(FATAL,
-				(errmsg("control file contains invalid checkpoint location")));
+		/*
+		 * If the switch happened in the middle of a segment, what to do with
+		 * the last, partial segment on the old timeline? If we don't archive
+		 * it, and the server that created the WAL never archives it either
+		 * (e.g. because it was hit by a meteor), it will never make it to the
+		 * archive. That's OK from our point of view, because the new segment
+		 * that we created with the new TLI contains all the WAL from the old
+		 * timeline up to the switch point. But if you later try to do PITR to
+		 * the "missing" WAL on the old timeline, recovery won't find it in
+		 * the archive. It's physically present in the new file with new TLI,
+		 * but recovery won't look there when it's recovering to the older
+		 * timeline. On the other hand, if we archive the partial segment, and
+		 * the original server on that timeline is still running and archives
+		 * the completed version of the same segment later, it will fail. (We
+		 * used to do that in 9.4 and below, and it caused such problems).
+		 *
+		 * As a compromise, we rename the last segment with the .partial
+		 * suffix, and archive it. Archive recovery will never try to read
+		 * .partial segments, so they will normally go unused. But in the odd
+		 * PITR case, the administrator can copy them manually to the pg_wal
+		 * directory (removing the suffix). They can be useful in debugging,
+		 * too.
+		 *
+		 * If a .done or .ready file already exists for the old timeline,
+		 * however, we had already determined that the segment is complete, so
+		 * we can let it be archived normally. (In particular, if it was
+		 * restored from the archive to begin with, it's expected to have a
+		 * .done file).
+		 */
+		if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
+			XLogArchivingActive())
+		{
+			char		origfname[MAXFNAMELEN];
+			XLogSegNo	endLogSegNo;
 
-	switch (ControlFile->state)
-	{
-		case DB_SHUTDOWNED:
+			XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
+			XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
 
-			/*
-			 * This is the expected case, so don't be chatty in standalone
-			 * mode
-			 */
-			ereport(IsPostmasterEnvironment ? LOG : NOTICE,
-					(errmsg("database system was shut down at %s",
-							str_time(ControlFile->time))));
-			break;
+			if (!XLogArchiveIsReadyOrDone(origfname))
+			{
+				char		origpath[MAXPGPATH];
+				char		partialfname[MAXFNAMELEN];
+				char		partialpath[MAXPGPATH];
 
-		case DB_SHUTDOWNED_IN_RECOVERY:
-			ereport(LOG,
-					(errmsg("database system was shut down in recovery at %s",
-							str_time(ControlFile->time))));
-			break;
+				XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
+				snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
+				snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
 
-		case DB_SHUTDOWNING:
-			ereport(LOG,
-					(errmsg("database system shutdown was interrupted; last known up at %s",
-							str_time(ControlFile->time))));
-			break;
+				/*
+				 * Make sure there's no .done or .ready file for the .partial
+				 * file.
+				 */
+				XLogArchiveCleanup(partialfname);
 
-		case DB_IN_CRASH_RECOVERY:
-			ereport(LOG,
-					(errmsg("database system was interrupted while in recovery at %s",
-							str_time(ControlFile->time)),
-					 errhint("This probably means that some data is corrupted and"
-							 " you will have to use the last backup for recovery.")));
-			break;
+				durable_rename(origpath, partialpath, ERROR);
+				XLogArchiveNotify(partialfname);
+			}
+		}
+	}
 
-		case DB_IN_ARCHIVE_RECOVERY:
-			ereport(LOG,
-					(errmsg("database system was interrupted while in recovery at log time %s",
-							str_time(ControlFile->checkPointCopy.time)),
-					 errhint("If this has occurred more than once some data might be corrupted"
-							 " and you might need to choose an earlier recovery target.")));
-			break;
+	/*
+	 * Preallocate additional log files, if wanted.
+	 */
+	PreallocXlogFiles(EndOfLog);
 
-		case DB_IN_PRODUCTION:
-			ereport(LOG,
-					(errmsg("database system was interrupted; last known up at %s",
-							str_time(ControlFile->time))));
-			break;
+	/*
+	 * Okay, we're officially UP.
+	 */
+	InRecovery = false;
 
-		default:
-			ereport(FATAL,
-					(errmsg("control file contains invalid database cluster state")));
-	}
+	/* start the archive_timeout timer and LSN running */
+	XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
+	XLogCtl->lastSegSwitchLSN = EndOfLog;
 
-	/* This is just to allow attaching to startup process with a debugger */
-#ifdef XLOG_REPLAY_DELAY
-	if (ControlFile->state != DB_SHUTDOWNED)
-		pg_usleep(60000000L);
-#endif
+	/* also initialize latestCompletedXid, to nextXid - 1 */
+	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+	ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
+	FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid);
+	LWLockRelease(ProcArrayLock);
 
 	/*
-	 * Verify that pg_wal and pg_wal/archive_status exist.  In cases where
-	 * someone has performed a copy for PITR, these directories may have been
-	 * excluded and need to be re-created.
+	 * Start up subtrans, if not already done for hot standby.  (commit
+	 * timestamps are started below, if necessary.)
 	 */
-	ValidateXLOGDirectoryStructure();
+	if (standbyState == STANDBY_DISABLED)
+		StartupSUBTRANS(oldestActiveXID);
 
-	/*----------
-	 * If we previously crashed, perform a couple of actions:
-	 *
-	 * - The pg_wal directory may still include some temporary WAL segments
-	 *   used when creating a new segment, so perform some clean up to not
-	 *   bloat this path.  This is done first as there is no point to sync
-	 *   this temporary data.
-	 *
-	 * - There might be data which we had written, intending to fsync it, but
-	 *   which we had not actually fsync'd yet.  Therefore, a power failure in
-	 *   the near future might cause earlier unflushed writes to be lost, even
-	 *   though more recent data written to disk from here on would be
-	 *   persisted.  To avoid that, fsync the entire data directory.
+	/*
+	 * Perform end of recovery actions for any SLRUs that need it.
 	 */
-	if (ControlFile->state != DB_SHUTDOWNED &&
-		ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
-	{
-		RemoveTempXlogFiles();
-		SyncDataDirectory();
-	}
+	TrimCLOG();
+	TrimMultiXact();
+
+	/* Reload shared-memory state for prepared transactions */
+	RecoverPreparedTransactions();
 
 	/*
-	 * Initialize on the assumption we want to recover to the latest timeline
-	 * that's active according to pg_control.
+	 * Shutdown the recovery environment. This must occur after
+	 * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
 	 */
-	if (ControlFile->minRecoveryPointTLI >
-		ControlFile->checkPointCopy.ThisTimeLineID)
-		recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
-	else
-		recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
+	if (standbyState != STANDBY_DISABLED)
+		ShutdownRecoveryTransactionEnvironment();
+
+	/* Shut down xlogreader */
+	FreeWalRecovery();
 
 	/*
-	 * Check for signal files, and if so set up state for offline recovery
+	 * If any of the critical GUCs have changed, log them before we allow
+	 * backends to write WAL.
 	 */
-	readRecoverySignalFile();
-	validateRecoveryParameters();
-
-	if (ArchiveRecoveryRequested)
-	{
-		if (StandbyModeRequested)
-			ereport(LOG,
-					(errmsg("entering standby mode")));
-		else if (recoveryTarget == RECOVERY_TARGET_XID)
-			ereport(LOG,
-					(errmsg("starting point-in-time recovery to XID %u",
-							recoveryTargetXid)));
-		else if (recoveryTarget == RECOVERY_TARGET_TIME)
-			ereport(LOG,
-					(errmsg("starting point-in-time recovery to %s",
-							timestamptz_to_str(recoveryTargetTime))));
-		else if (recoveryTarget == RECOVERY_TARGET_NAME)
-			ereport(LOG,
-					(errmsg("starting point-in-time recovery to \"%s\"",
-							recoveryTargetName)));
-		else if (recoveryTarget == RECOVERY_TARGET_LSN)
-			ereport(LOG,
-					(errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
-							LSN_FORMAT_ARGS(recoveryTargetLSN))));
-		else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
-			ereport(LOG,
-					(errmsg("starting point-in-time recovery to earliest consistent point")));
-		else
-			ereport(LOG,
-					(errmsg("starting archive recovery")));
-	}
+	LocalSetXLogInsertAllowed();
+	XLogReportParameters();
 
 	/*
-	 * Take ownership of the wakeup latch if we're going to sleep during
-	 * recovery.
+	 * Local WAL inserts enabled, so it's time to finish initialization of
+	 * commit timestamp.
 	 */
-	if (ArchiveRecoveryRequested)
-		OwnLatch(&XLogCtl->recoveryWakeupLatch);
-
-	/* Set up XLOG reader facility */
-	MemSet(&private, 0, sizeof(XLogPageReadPrivate));
-	xlogreader =
-		XLogReaderAllocate(wal_segment_size, NULL,
-						   XL_ROUTINE(.page_read = &XLogPageRead,
-									  .segment_open = NULL,
-									  .segment_close = wal_segment_close),
-						   &private);
-	if (!xlogreader)
-		ereport(ERROR,
-				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("out of memory"),
-				 errdetail("Failed while allocating a WAL reading processor.")));
-	xlogreader->system_identifier = ControlFile->system_identifier;
+	CompleteCommitTsInitialization();
 
 	/*
-	 * Allocate two page buffers dedicated to WAL consistency checks.  We do
-	 * it this way, rather than just making static arrays, for two reasons:
-	 * (1) no need to waste the storage in most instantiations of the backend;
-	 * (2) a static char array isn't guaranteed to have any particular
-	 * alignment, whereas palloc() will provide MAXALIGN'd storage.
+	 * All done with end-of-recovery actions.
+	 *
+	 * Now allow backends to write WAL and update the control file status in
+	 * consequence.  SharedRecoveryState, that controls if backends can write
+	 * WAL, is updated while holding ControlFileLock to prevent other backends
+	 * to look at an inconsistent state of the control file in shared memory.
+	 * There is still a small window during which backends can write WAL and
+	 * the control file is still referring to a system not in DB_IN_PRODUCTION
+	 * state while looking at the on-disk control file.
+	 *
+	 * Also, we use info_lck to update SharedRecoveryState to ensure that
+	 * there are no race conditions concerning visibility of other recent
+	 * updates to shared memory.
 	 */
-	replay_image_masked = (char *) palloc(BLCKSZ);
-	primary_image_masked = (char *) palloc(BLCKSZ);
-
-	if (read_backup_label(&checkPointLoc, &backupEndRequired,
-						  &backupFromStandby))
-	{
-		List	   *tablespaces = NIL;
-
-		/*
-		 * Archive recovery was requested, and thanks to the backup label
-		 * file, we know how far we need to replay to reach consistency. Enter
-		 * archive recovery directly.
-		 */
-		InArchiveRecovery = true;
-		if (StandbyModeRequested)
-			StandbyMode = true;
-
-		/*
-		 * When a backup_label file is present, we want to roll forward from
-		 * the checkpoint it identifies, rather than using pg_control.
-		 */
-		record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
-		if (record != NULL)
-		{
-			memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
-			wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
-			ereport(DEBUG1,
-					(errmsg_internal("checkpoint record is at %X/%X",
-									 LSN_FORMAT_ARGS(checkPointLoc))));
-			InRecovery = true;	/* force recovery even if SHUTDOWNED */
-
-			/*
-			 * Make sure that REDO location exists. This may not be the case
-			 * if there was a crash during an online backup, which left a
-			 * backup_label around that references a WAL segment that's
-			 * already been archived.
-			 */
-			if (checkPoint.redo < checkPointLoc)
-			{
-				XLogBeginRead(xlogreader, checkPoint.redo);
-				if (!ReadRecord(xlogreader, LOG, false))
-					ereport(FATAL,
-							(errmsg("could not find redo location referenced by checkpoint record"),
-							 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
-									 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
-									 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
-									 DataDir, DataDir, DataDir)));
-			}
-		}
-		else
-		{
-			ereport(FATAL,
-					(errmsg("could not locate required checkpoint record"),
-					 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
-							 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
-							 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
-							 DataDir, DataDir, DataDir)));
-			wasShutdown = false;	/* keep compiler quiet */
-		}
-
-		/* read the tablespace_map file if present and create symlinks. */
-		if (read_tablespace_map(&tablespaces))
-		{
-			ListCell   *lc;
-
-			foreach(lc, tablespaces)
-			{
-				tablespaceinfo *ti = lfirst(lc);
-				char	   *linkloc;
-
-				linkloc = psprintf("pg_tblspc/%s", ti->oid);
+	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+	ControlFile->state = DB_IN_PRODUCTION;
+	ControlFile->time = (pg_time_t) time(NULL);
 
-				/*
-				 * Remove the existing symlink if any and Create the symlink
-				 * under PGDATA.
-				 */
-				remove_tablespace_symlink(linkloc);
+	SpinLockAcquire(&XLogCtl->info_lck);
+	XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
+	SpinLockRelease(&XLogCtl->info_lck);
 
-				if (symlink(ti->path, linkloc) < 0)
-					ereport(ERROR,
-							(errcode_for_file_access(),
-							 errmsg("could not create symbolic link \"%s\": %m",
-									linkloc)));
+	UpdateControlFile();
+	LWLockRelease(ControlFileLock);
 
-				pfree(ti->oid);
-				pfree(ti->path);
-				pfree(ti);
-			}
+	/*
+	 * If there were cascading standby servers connected to us, nudge any wal
+	 * sender processes to notice that we've been promoted.
+	 */
+	WalSndWakeup();
 
-			/* set flag to delete it later */
-			haveTblspcMap = true;
-		}
+	/*
+	 * If this was a promotion, request an (online) checkpoint now. This isn't
+	 * required for consistency, but the last restartpoint might be far back,
+	 * and in case of a crash, recovering from it might take a longer than is
+	 * appropriate now that we're not in standby mode anymore.
+	 */
+	if (promoted)
+		RequestCheckpoint(CHECKPOINT_FORCE);
+}
 
-		/* set flag to delete it later */
-		haveBackupLabel = true;
-	}
+/*
+ * Is the system still in recovery?
+ *
+ * Unlike testing InRecovery, this works in any process that's connected to
+ * shared memory.
+ *
+ * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
+ * variables the first time we see that recovery is finished.
+ */
+bool
+RecoveryInProgress(void)
+{
+	/*
+	 * We check shared state each time only until we leave recovery mode. We
+	 * can't re-enter recovery, so there's no need to keep checking after the
+	 * shared variable has once been seen false.
+	 */
+	if (!LocalRecoveryInProgress)
+		return false;
 	else
 	{
 		/*
-		 * If tablespace_map file is present without backup_label file, there
-		 * is no use of such file.  There is no harm in retaining it, but it
-		 * is better to get rid of the map file so that we don't have any
-		 * redundant file in data directory and it will avoid any sort of
-		 * confusion.  It seems prudent though to just rename the file out of
-		 * the way rather than delete it completely, also we ignore any error
-		 * that occurs in rename operation as even if map file is present
-		 * without backup_label file, it is harmless.
+		 * use volatile pointer to make sure we make a fresh read of the
+		 * shared variable.
 		 */
-		if (stat(TABLESPACE_MAP, &st) == 0)
-		{
-			unlink(TABLESPACE_MAP_OLD);
-			if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
-				ereport(LOG,
-						(errmsg("ignoring file \"%s\" because no file \"%s\" exists",
-								TABLESPACE_MAP, BACKUP_LABEL_FILE),
-						 errdetail("File \"%s\" was renamed to \"%s\".",
-								   TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
-			else
-				ereport(LOG,
-						(errmsg("ignoring file \"%s\" because no file \"%s\" exists",
-								TABLESPACE_MAP, BACKUP_LABEL_FILE),
-						 errdetail("Could not rename file \"%s\" to \"%s\": %m.",
-								   TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
-		}
+		volatile XLogCtlData *xlogctl = XLogCtl;
+
+		LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);
 
 		/*
-		 * It's possible that archive recovery was requested, but we don't
-		 * know how far we need to replay the WAL before we reach consistency.
-		 * This can happen for example if a base backup is taken from a
-		 * running server using an atomic filesystem snapshot, without calling
-		 * pg_start/stop_backup. Or if you just kill a running primary server
-		 * and put it into archive recovery by creating a recovery signal
-		 * file.
-		 *
-		 * Our strategy in that case is to perform crash recovery first,
-		 * replaying all the WAL present in pg_wal, and only enter archive
-		 * recovery after that.
-		 *
-		 * But usually we already know how far we need to replay the WAL (up
-		 * to minRecoveryPoint, up to backupEndPoint, or until we see an
-		 * end-of-backup record), and we can enter archive recovery directly.
+		 * Initialize TimeLineID and RedoRecPtr when we discover that recovery
+		 * is finished. InitPostgres() relies upon this behaviour to ensure
+		 * that InitXLOGAccess() is called at backend startup.  (If you change
+		 * this, see also LocalSetXLogInsertAllowed.)
 		 */
-		if (ArchiveRecoveryRequested &&
-			(ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
-			 ControlFile->backupEndRequired ||
-			 ControlFile->backupEndPoint != InvalidXLogRecPtr ||
-			 ControlFile->state == DB_SHUTDOWNED))
-		{
-			InArchiveRecovery = true;
-			if (StandbyModeRequested)
-				StandbyMode = true;
-		}
-
-		/* Get the last valid checkpoint record. */
-		checkPointLoc = ControlFile->checkPoint;
-		RedoStartLSN = ControlFile->checkPointCopy.redo;
-		record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
-		if (record != NULL)
-		{
-			ereport(DEBUG1,
-					(errmsg_internal("checkpoint record is at %X/%X",
-									 LSN_FORMAT_ARGS(checkPointLoc))));
-		}
-		else
+		if (!LocalRecoveryInProgress)
 		{
 			/*
-			 * We used to attempt to go back to a secondary checkpoint record
-			 * here, but only when not in standby mode. We now just fail if we
-			 * can't read the last checkpoint because this allows us to
-			 * simplify processing around checkpoints.
+			 * If we just exited recovery, make sure we read TimeLineID and
+			 * RedoRecPtr after SharedRecoveryState (for machines with weak
+			 * memory ordering).
 			 */
-			ereport(PANIC,
-					(errmsg("could not locate a valid checkpoint record")));
+			pg_memory_barrier();
+			InitXLOGAccess();
 		}
-		memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
-		wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
-	}
-
-	/*
-	 * Clear out any old relcache cache files.  This is *necessary* if we do
-	 * any WAL replay, since that would probably result in the cache files
-	 * being out of sync with database reality.  In theory we could leave them
-	 * in place if the database had been cleanly shut down, but it seems
-	 * safest to just remove them always and let them be rebuilt during the
-	 * first backend startup.  These files needs to be removed from all
-	 * directories including pg_tblspc, however the symlinks are created only
-	 * after reading tablespace_map file in case of archive recovery from
-	 * backup, so needs to clear old relcache files here after creating
-	 * symlinks.
-	 */
-	RelationCacheInitFileRemove();
-
-	/*
-	 * If the location of the checkpoint record is not on the expected
-	 * timeline in the history of the requested timeline, we cannot proceed:
-	 * the backup is not part of the history of the requested timeline.
-	 */
-	Assert(expectedTLEs);		/* was initialized by reading checkpoint
-								 * record */
-	if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
-		checkPoint.ThisTimeLineID)
-	{
-		XLogRecPtr	switchpoint;
 
 		/*
-		 * tliSwitchPoint will throw an error if the checkpoint's timeline is
-		 * not in expectedTLEs at all.
+		 * Note: We don't need a memory barrier when we're still in recovery.
+		 * We might exit recovery immediately after return, so the caller
+		 * can't rely on 'true' meaning that we're still in recovery anyway.
 		 */
-		switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
-		ereport(FATAL,
-				(errmsg("requested timeline %u is not a child of this server's history",
-						recoveryTargetTLI),
-				 errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
-						   LSN_FORMAT_ARGS(ControlFile->checkPoint),
-						   ControlFile->checkPointCopy.ThisTimeLineID,
-						   LSN_FORMAT_ARGS(switchpoint))));
+
+		return LocalRecoveryInProgress;
 	}
+}
 
-	/*
-	 * The min recovery point should be part of the requested timeline's
-	 * history, too.
-	 */
-	if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
-		tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
-		ControlFile->minRecoveryPointTLI)
-		ereport(FATAL,
-				(errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
-						recoveryTargetTLI,
-						LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
-						ControlFile->minRecoveryPointTLI)));
-
-	LastRec = RecPtr = checkPointLoc;
-
-	ereport(DEBUG1,
-			(errmsg_internal("redo record is at %X/%X; shutdown %s",
-							 LSN_FORMAT_ARGS(checkPoint.redo),
-							 wasShutdown ? "true" : "false")));
-	ereport(DEBUG1,
-			(errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
-							 U64FromFullTransactionId(checkPoint.nextXid),
-							 checkPoint.nextOid)));
-	ereport(DEBUG1,
-			(errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
-							 checkPoint.nextMulti, checkPoint.nextMultiOffset)));
-	ereport(DEBUG1,
-			(errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
-							 checkPoint.oldestXid, checkPoint.oldestXidDB)));
-	ereport(DEBUG1,
-			(errmsg_internal("oldest MultiXactId: %u, in database %u",
-							 checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
-	ereport(DEBUG1,
-			(errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
-							 checkPoint.oldestCommitTsXid,
-							 checkPoint.newestCommitTsXid)));
-	if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
-		ereport(PANIC,
-				(errmsg("invalid next transaction ID")));
+/*
+ * Returns current recovery state from shared memory.
+ *
+ * This returned state is kept consistent with the contents of the control
+ * file.  See details about the possible values of RecoveryState in xlog.h.
+ */
+RecoveryState
+GetRecoveryState(void)
+{
+	RecoveryState retval;
 
-	/* initialize shared memory variables from the checkpoint record */
-	ShmemVariableCache->nextXid = checkPoint.nextXid;
-	ShmemVariableCache->nextOid = checkPoint.nextOid;
-	ShmemVariableCache->oidCount = 0;
-	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
-	AdvanceOldestClogXid(checkPoint.oldestXid);
-	SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
-	SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
-	SetCommitTsLimit(checkPoint.oldestCommitTsXid,
-					 checkPoint.newestCommitTsXid);
-	XLogCtl->ckptFullXid = checkPoint.nextXid;
+	SpinLockAcquire(&XLogCtl->info_lck);
+	retval = XLogCtl->SharedRecoveryState;
+	SpinLockRelease(&XLogCtl->info_lck);
 
-	/*
-	 * Initialize replication slots, before there's a chance to remove
-	 * required resources.
-	 */
-	StartupReplicationSlots();
+	return retval;
+}
 
+/*
+ * Is this process allowed to insert new WAL records?
+ *
+ * Ordinarily this is essentially equivalent to !RecoveryInProgress().
+ * But we also have provisions for forcing the result "true" or "false"
+ * within specific processes regardless of the global state.
+ */
+bool
+XLogInsertAllowed(void)
+{
 	/*
-	 * Startup logical state, needs to be setup now so we have proper data
-	 * during crash recovery.
+	 * If value is "unconditionally true" or "unconditionally false", just
+	 * return it.  This provides the normal fast path once recovery is known
+	 * done.
 	 */
-	StartupReorderBuffer();
+	if (LocalXLogInsertAllowed >= 0)
+		return (bool) LocalXLogInsertAllowed;
 
 	/*
-	 * Startup CLOG. This must be done after ShmemVariableCache->nextXid has
-	 * been initialized and before we accept connections or begin WAL replay.
+	 * Else, must check to see if we're still in recovery.
 	 */
-	StartupCLOG();
+	if (RecoveryInProgress())
+		return false;
 
 	/*
-	 * Startup MultiXact. We need to do this early to be able to replay
-	 * truncations.
+	 * On exit from recovery, reset to "unconditionally true", since there is
+	 * no need to keep checking.
 	 */
-	StartupMultiXact();
+	LocalXLogInsertAllowed = 1;
+	return true;
+}
 
-	/*
-	 * Ditto for commit timestamps.  Activate the facility if the setting is
-	 * enabled in the control file, as there should be no tracking of commit
-	 * timestamps done when the setting was disabled.  This facility can be
-	 * started or stopped when replaying a XLOG_PARAMETER_CHANGE record.
-	 */
-	if (ControlFile->track_commit_timestamp)
-		StartupCommitTs();
+/*
+ * Make XLogInsertAllowed() return true in the current process only.
+ *
+ * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
+ * and even call LocalSetXLogInsertAllowed() again after that.
+ */
+static void
+LocalSetXLogInsertAllowed(void)
+{
+	Assert(LocalXLogInsertAllowed == -1);
+	LocalXLogInsertAllowed = 1;
 
-	/*
-	 * Recover knowledge about replay progress of known replication partners.
-	 */
-	StartupReplicationOrigin();
+	/* Initialize as RecoveryInProgress() would do when switching state */
+	InitXLOGAccess();
+}
 
-	/*
-	 * Initialize unlogged LSN. On a clean shutdown, it's restored from the
-	 * control file. On recovery, all unlogged relations are blown away, so
-	 * the unlogged LSN counter can be reset too.
-	 */
-	if (ControlFile->state == DB_SHUTDOWNED)
-		XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
-	else
-		XLogCtl->unloggedLSN = FirstNormalUnloggedLSN;
-
-	/*
-	 * We must replay WAL entries using the same TimeLineID they were created
-	 * under, so temporarily adopt the TLI indicated by the checkpoint (see
-	 * also xlog_redo()).
-	 */
-	ThisTimeLineID = checkPoint.ThisTimeLineID;
+/*
+ * This must be called in a backend process before creating WAL records
+ * (except in a standalone backend, which does StartupXLOG instead).  We need
+ * to initialize the local copies of ThisTimeLineID and RedoRecPtr.
+ *
+ * Note: before Postgres 8.0, we went to some effort to keep the postmaster
+ * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
+ * unnecessary however, since the postmaster itself never touches XLOG anyway.
+ */
+void
+InitXLOGAccess(void)
+{
+	XLogCtlInsert *Insert = &XLogCtl->Insert;
 
-	/*
-	 * Copy any missing timeline history files between 'now' and the recovery
-	 * target timeline from archive to pg_wal. While we don't need those files
-	 * ourselves - the history file of the recovery target timeline covers all
-	 * the previous timelines in the history too - a cascading standby server
-	 * might be interested in them. Or, if you archive the WAL from this
-	 * server to a different archive than the primary, it'd be good for all
-	 * the history files to get archived there after failover, so that you can
-	 * use one of the old timelines as a PITR target. Timeline history files
-	 * are small, so it's better to copy them unnecessarily than not copy them
-	 * and regret later.
-	 */
-	restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
+	/* ThisTimeLineID doesn't change so we need no lock to copy it */
+	ThisTimeLineID = XLogCtl->ThisTimeLineID;
+	Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
 
-	/*
-	 * Before running in recovery, scan pg_twophase and fill in its status to
-	 * be able to work on entries generated by redo.  Doing a scan before
-	 * taking any recovery action has the merit to discard any 2PC files that
-	 * are newer than the first record to replay, saving from any conflicts at
-	 * replay.  This avoids as well any subsequent scans when doing recovery
-	 * of the on-disk two-phase data.
-	 */
-	restoreTwoPhaseData();
+	/* set wal_segment_size */
+	wal_segment_size = ControlFile->xlog_seg_size;
 
-	lastFullPageWrites = checkPoint.fullPageWrites;
+	/* Use GetRedoRecPtr to copy the RedoRecPtr safely */
+	(void) GetRedoRecPtr();
+	/* Also update our copy of doPageWrites. */
+	doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
 
-	RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
-	doPageWrites = lastFullPageWrites;
+	/* Also initialize the working areas for constructing WAL records */
+	InitXLogInsert();
+}
 
-	if (RecPtr < checkPoint.redo)
-		ereport(PANIC,
-				(errmsg("invalid redo in checkpoint record")));
+/*
+ * Return the current Redo pointer from shared memory.
+ *
+ * As a side-effect, the local RedoRecPtr copy is updated.
+ */
+XLogRecPtr
+GetRedoRecPtr(void)
+{
+	XLogRecPtr	ptr;
 
 	/*
-	 * Check whether we need to force recovery from WAL.  If it appears to
-	 * have been a clean shutdown and we did not have a recovery signal file,
-	 * then assume no recovery needed.
+	 * The possibly not up-to-date copy in XlogCtl is enough. Even if we
+	 * grabbed a WAL insertion lock to read the authoritative value in
+	 * Insert->RedoRecPtr, someone might update it just after we've released
+	 * the lock.
 	 */
-	if (checkPoint.redo < RecPtr)
-	{
-		if (wasShutdown)
-			ereport(PANIC,
-					(errmsg("invalid redo record in shutdown checkpoint")));
-		InRecovery = true;
-	}
-	else if (ControlFile->state != DB_SHUTDOWNED)
-		InRecovery = true;
-	else if (ArchiveRecoveryRequested)
-	{
-		/* force recovery due to presence of recovery signal file */
-		InRecovery = true;
-	}
-
-	/* REDO */
-	if (InRecovery)
-	{
-		int			rmid;
+	SpinLockAcquire(&XLogCtl->info_lck);
+	ptr = XLogCtl->RedoRecPtr;
+	SpinLockRelease(&XLogCtl->info_lck);
 
-		/*
-		 * Update pg_control to show that we are recovering and to show the
-		 * selected checkpoint as the place we are starting from. We also mark
-		 * pg_control with any minimum recovery stop point obtained from a
-		 * backup history file.
-		 */
-		dbstate_at_startup = ControlFile->state;
-		if (InArchiveRecovery)
-		{
-			ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
+	if (RedoRecPtr < ptr)
+		RedoRecPtr = ptr;
 
-			SpinLockAcquire(&XLogCtl->info_lck);
-			XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
-			SpinLockRelease(&XLogCtl->info_lck);
-		}
-		else
-		{
-			ereport(LOG,
-					(errmsg("database system was not properly shut down; "
-							"automatic recovery in progress")));
-			if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
-				ereport(LOG,
-						(errmsg("crash recovery starts in timeline %u "
-								"and has target timeline %u",
-								ControlFile->checkPointCopy.ThisTimeLineID,
-								recoveryTargetTLI)));
-			ControlFile->state = DB_IN_CRASH_RECOVERY;
+	return RedoRecPtr;
+}
 
-			SpinLockAcquire(&XLogCtl->info_lck);
-			XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
-			SpinLockRelease(&XLogCtl->info_lck);
-		}
-		ControlFile->checkPoint = checkPointLoc;
-		ControlFile->checkPointCopy = checkPoint;
-		if (InArchiveRecovery)
-		{
-			/* initialize minRecoveryPoint if not set yet */
-			if (ControlFile->minRecoveryPoint < checkPoint.redo)
-			{
-				ControlFile->minRecoveryPoint = checkPoint.redo;
-				ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
-			}
-		}
+/*
+ * Return information needed to decide whether a modified block needs a
+ * full-page image to be included in the WAL record.
+ *
+ * The returned values are cached copies from backend-private memory, and
+ * possibly out-of-date.  XLogInsertRecord will re-check them against
+ * up-to-date values, while holding the WAL insert lock.
+ */
+void
+GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
+{
+	*RedoRecPtr_p = RedoRecPtr;
+	*doPageWrites_p = doPageWrites;
+}
 
-		/*
-		 * Set backupStartPoint if we're starting recovery from a base backup.
-		 *
-		 * Also set backupEndPoint and use minRecoveryPoint as the backup end
-		 * location if we're starting recovery from a base backup which was
-		 * taken from a standby. In this case, the database system status in
-		 * pg_control must indicate that the database was already in recovery.
-		 * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
-		 * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
-		 * before reaching this point; e.g. because restore_command or
-		 * primary_conninfo were faulty.
-		 *
-		 * Any other state indicates that the backup somehow became corrupted
-		 * and we can't sensibly continue with recovery.
-		 */
-		if (haveBackupLabel)
-		{
-			ControlFile->backupStartPoint = checkPoint.redo;
-			ControlFile->backupEndRequired = backupEndRequired;
+/*
+ * GetInsertRecPtr -- Returns the current insert position.
+ *
+ * NOTE: The value *actually* returned is the position of the last full
+ * xlog page. It lags behind the real insert position by at most 1 page.
+ * For that, we don't need to scan through WAL insertion locks, and an
+ * approximation is enough for the current usage of this function.
+ */
+XLogRecPtr
+GetInsertRecPtr(void)
+{
+	XLogRecPtr	recptr;
 
-			if (backupFromStandby)
-			{
-				if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
-					dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
-					ereport(FATAL,
-							(errmsg("backup_label contains data inconsistent with control file"),
-							 errhint("This means that the backup is corrupted and you will "
-									 "have to use another backup for recovery.")));
-				ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
-			}
-		}
-		ControlFile->time = (pg_time_t) time(NULL);
-		/* No need to hold ControlFileLock yet, we aren't up far enough */
-		UpdateControlFile();
+	SpinLockAcquire(&XLogCtl->info_lck);
+	recptr = XLogCtl->LogwrtRqst.Write;
+	SpinLockRelease(&XLogCtl->info_lck);
 
-		/*
-		 * Initialize our local copy of minRecoveryPoint.  When doing crash
-		 * recovery we want to replay up to the end of WAL.  Particularly, in
-		 * the case of a promoted standby minRecoveryPoint value in the
-		 * control file is only updated after the first checkpoint.  However,
-		 * if the instance crashes before the first post-recovery checkpoint
-		 * is completed then recovery will use a stale location causing the
-		 * startup process to think that there are still invalid page
-		 * references when checking for data consistency.
-		 */
-		if (InArchiveRecovery)
-		{
-			minRecoveryPoint = ControlFile->minRecoveryPoint;
-			minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
-		}
-		else
-		{
-			minRecoveryPoint = InvalidXLogRecPtr;
-			minRecoveryPointTLI = 0;
-		}
+	return recptr;
+}
 
-		/*
-		 * Reset pgstat data, because it may be invalid after recovery.
-		 */
-		pgstat_reset_all();
+/*
+ * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
+ * position known to be fsync'd to disk.
+ */
+XLogRecPtr
+GetFlushRecPtr(void)
+{
+	SpinLockAcquire(&XLogCtl->info_lck);
+	LogwrtResult = XLogCtl->LogwrtResult;
+	SpinLockRelease(&XLogCtl->info_lck);
 
-		/*
-		 * If there was a backup label file, it's done its job and the info
-		 * has now been propagated into pg_control.  We must get rid of the
-		 * label file so that if we crash during recovery, we'll pick up at
-		 * the latest recovery restartpoint instead of going all the way back
-		 * to the backup start point.  It seems prudent though to just rename
-		 * the file out of the way rather than delete it completely.
-		 */
-		if (haveBackupLabel)
-		{
-			unlink(BACKUP_LABEL_OLD);
-			durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
-		}
+	return LogwrtResult.Flush;
+}
 
-		/*
-		 * If there was a tablespace_map file, it's done its job and the
-		 * symlinks have been created.  We must get rid of the map file so
-		 * that if we crash during recovery, we don't create symlinks again.
-		 * It seems prudent though to just rename the file out of the way
-		 * rather than delete it completely.
-		 */
-		if (haveTblspcMap)
-		{
-			unlink(TABLESPACE_MAP_OLD);
-			durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
-		}
+/*
+ * GetLastImportantRecPtr -- Returns the LSN of the last important record
+ * inserted. All records not explicitly marked as unimportant are considered
+ * important.
+ *
+ * The LSN is determined by computing the maximum of
+ * WALInsertLocks[i].lastImportantAt.
+ */
+XLogRecPtr
+GetLastImportantRecPtr(void)
+{
+	XLogRecPtr	res = InvalidXLogRecPtr;
+	int			i;
 
-		/* Check that the GUCs used to generate the WAL allow recovery */
-		CheckRequiredParameterValues();
+	for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
+	{
+		XLogRecPtr	last_important;
 
 		/*
-		 * We're in recovery, so unlogged relations may be trashed and must be
-		 * reset.  This should be done BEFORE allowing Hot Standby
-		 * connections, so that read-only backends don't try to read whatever
-		 * garbage is left over from before.
+		 * Need to take a lock to prevent torn reads of the LSN, which are
+		 * possible on some of the supported platforms. WAL insert locks only
+		 * support exclusive mode, so we have to use that.
 		 */
-		ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
+		LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
+		last_important = WALInsertLocks[i].l.lastImportantAt;
+		LWLockRelease(&WALInsertLocks[i].l.lock);
 
-		/*
-		 * Likewise, delete any saved transaction snapshot files that got left
-		 * behind by crashed backends.
-		 */
-		DeleteAllExportedSnapshotFiles();
+		if (res < last_important)
+			res = last_important;
+	}
 
-		/*
-		 * Initialize for Hot Standby, if enabled. We won't let backends in
-		 * yet, not until we've reached the min recovery point specified in
-		 * control file and we've established a recovery snapshot from a
-		 * running-xacts WAL record.
-		 */
-		if (ArchiveRecoveryRequested && EnableHotStandby)
-		{
-			TransactionId *xids;
-			int			nxids;
-
-			ereport(DEBUG1,
-					(errmsg_internal("initializing for hot standby")));
-
-			InitRecoveryTransactionEnvironment();
-
-			if (wasShutdown)
-				oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
-			else
-				oldestActiveXID = checkPoint.oldestActiveXid;
-			Assert(TransactionIdIsValid(oldestActiveXID));
+	return res;
+}
 
-			/* Tell procarray about the range of xids it has to deal with */
-			ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextXid));
+/*
+ * Get the time and LSN of the last xlog segment switch
+ */
+pg_time_t
+GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
+{
+	pg_time_t	result;
 
-			/*
-			 * Startup subtrans only.  CLOG, MultiXact and commit timestamp
-			 * have already been started up and other SLRUs are not maintained
-			 * during recovery and need not be started yet.
-			 */
-			StartupSUBTRANS(oldestActiveXID);
+	/* Need WALWriteLock, but shared lock is sufficient */
+	LWLockAcquire(WALWriteLock, LW_SHARED);
+	result = XLogCtl->lastSegSwitchTime;
+	*lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
+	LWLockRelease(WALWriteLock);
 
-			/*
-			 * If we're beginning at a shutdown checkpoint, we know that
-			 * nothing was running on the primary at this point. So fake-up an
-			 * empty running-xacts record and use that here and now. Recover
-			 * additional standby state for prepared transactions.
-			 */
-			if (wasShutdown)
-			{
-				RunningTransactionsData running;
-				TransactionId latestCompletedXid;
+	return result;
+}
 
-				/*
-				 * Construct a RunningTransactions snapshot representing a
-				 * shut down server, with only prepared transactions still
-				 * alive. We're never overflowed at this point because all
-				 * subxids are listed with their parent prepared transactions.
-				 */
-				running.xcnt = nxids;
-				running.subxcnt = 0;
-				running.subxid_overflow = false;
-				running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
-				running.oldestRunningXid = oldestActiveXID;
-				latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
-				TransactionIdRetreat(latestCompletedXid);
-				Assert(TransactionIdIsNormal(latestCompletedXid));
-				running.latestCompletedXid = latestCompletedXid;
-				running.xids = xids;
+/*
+ * This must be called ONCE during postmaster or standalone-backend shutdown
+ */
+void
+ShutdownXLOG(int code, Datum arg)
+{
+	/*
+	 * We should have an aux process resource owner to use, and we should not
+	 * be in a transaction that's installed some other resowner.
+	 */
+	Assert(AuxProcessResourceOwner != NULL);
+	Assert(CurrentResourceOwner == NULL ||
+		   CurrentResourceOwner == AuxProcessResourceOwner);
+	CurrentResourceOwner = AuxProcessResourceOwner;
 
-				ProcArrayApplyRecoveryInfo(&running);
+	/* Don't be chatty in standalone mode */
+	ereport(IsPostmasterEnvironment ? LOG : NOTICE,
+			(errmsg("shutting down")));
 
-				StandbyRecoverPreparedTransactions();
-			}
-		}
+	/*
+	 * Signal walsenders to move to stopping state.
+	 */
+	WalSndInitStopping();
 
-		/* Initialize resource managers */
-		for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
-		{
-			if (RmgrTable[rmid].rm_startup != NULL)
-				RmgrTable[rmid].rm_startup();
-		}
+	/*
+	 * Wait for WAL senders to be in stopping state.  This prevents commands
+	 * from writing new WAL.
+	 */
+	WalSndWaitStopping();
 
+	if (RecoveryInProgress())
+		CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
+	else
+	{
 		/*
-		 * Initialize shared variables for tracking progress of WAL replay, as
-		 * if we had just replayed the record before the REDO location (or the
-		 * checkpoint record itself, if it's a shutdown checkpoint).
+		 * If archiving is enabled, rotate the last XLOG file so that all the
+		 * remaining records are archived (postmaster wakes up the archiver
+		 * process one more time at the end of shutdown). The checkpoint
+		 * record will go to the next XLOG file and won't be archived (yet).
 		 */
-		SpinLockAcquire(&XLogCtl->info_lck);
-		if (checkPoint.redo < RecPtr)
-			XLogCtl->replayEndRecPtr = checkPoint.redo;
-		else
-			XLogCtl->replayEndRecPtr = EndRecPtr;
-		XLogCtl->replayEndTLI = ThisTimeLineID;
-		XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr;
-		XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI;
-		XLogCtl->recoveryLastXTime = 0;
-		XLogCtl->currentChunkStartTime = 0;
-		XLogCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
-		SpinLockRelease(&XLogCtl->info_lck);
-
-		/* Also ensure XLogReceiptTime has a sane value */
-		XLogReceiptTime = GetCurrentTimestamp();
+		if (XLogArchivingActive() && XLogArchiveCommandSet())
+			RequestXLogSwitch(false);
 
-		/*
-		 * Let postmaster know we've started redo now, so that it can launch
-		 * checkpointer to perform restartpoints.  We don't bother during
-		 * crash recovery as restartpoints can only be performed during
-		 * archive recovery.  And we'd like to keep crash recovery simple, to
-		 * avoid introducing bugs that could affect you when recovering after
-		 * crash.
-		 *
-		 * After this point, we can no longer assume that we're the only
-		 * process in addition to postmaster!  Also, fsync requests are
-		 * subsequently to be handled by the checkpointer, not locally.
-		 */
-		if (ArchiveRecoveryRequested && IsUnderPostmaster)
-		{
-			PublishStartupProcessInformation();
-			EnableSyncRequestForwarding();
-			SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
-			bgwriterLaunched = true;
-		}
+		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
+	}
+}
 
-		/*
-		 * Allow read-only connections immediately if we're consistent
-		 * already.
-		 */
-		CheckRecoveryConsistency();
+/*
+ * Log start of a checkpoint.
+ */
+static void
+LogCheckpointStart(int flags, bool restartpoint)
+{
+	if (restartpoint)
+		ereport(LOG,
+		/* translator: the placeholders show checkpoint options */
+				(errmsg("restartpoint starting:%s%s%s%s%s%s%s%s",
+						(flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
+						(flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
+						(flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
+						(flags & CHECKPOINT_FORCE) ? " force" : "",
+						(flags & CHECKPOINT_WAIT) ? " wait" : "",
+						(flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
+						(flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
+						(flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
+	else
+		ereport(LOG,
+		/* translator: the placeholders show checkpoint options */
+				(errmsg("checkpoint starting:%s%s%s%s%s%s%s%s",
+						(flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
+						(flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
+						(flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
+						(flags & CHECKPOINT_FORCE) ? " force" : "",
+						(flags & CHECKPOINT_WAIT) ? " wait" : "",
+						(flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
+						(flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
+						(flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
+}
 
-		/*
-		 * Find the first record that logically follows the checkpoint --- it
-		 * might physically precede it, though.
-		 */
-		if (checkPoint.redo < RecPtr)
-		{
-			/* back up to find the record */
-			XLogBeginRead(xlogreader, checkPoint.redo);
-			record = ReadRecord(xlogreader, PANIC, false);
-		}
-		else
-		{
-			/* just have to read next record after CheckPoint */
-			record = ReadRecord(xlogreader, LOG, false);
-		}
+/*
+ * Log end of a checkpoint.
+ */
+static void
+LogCheckpointEnd(bool restartpoint)
+{
+	long		write_msecs,
+				sync_msecs,
+				total_msecs,
+				longest_msecs,
+				average_msecs;
+	uint64		average_sync_time;
 
-		if (record != NULL)
-		{
-			ErrorContextCallback errcallback;
-			TimestampTz xtime;
-			PGRUsage	ru0;
+	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
 
-			pg_rusage_init(&ru0);
+	write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t,
+												  CheckpointStats.ckpt_sync_t);
 
-			InRedo = true;
+	sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t,
+												 CheckpointStats.ckpt_sync_end_t);
 
-			ereport(LOG,
-					(errmsg("redo starts at %X/%X",
-							LSN_FORMAT_ARGS(ReadRecPtr))));
+	/* Accumulate checkpoint timing summary data, in milliseconds. */
+	BgWriterStats.m_checkpoint_write_time += write_msecs;
+	BgWriterStats.m_checkpoint_sync_time += sync_msecs;
 
-			/*
-			 * main redo apply loop
-			 */
-			do
-			{
-				bool		switchedTLI = false;
+	/*
+	 * All of the published timing statistics are accounted for.  Only
+	 * continue if a log message is to be written.
+	 */
+	if (!log_checkpoints)
+		return;
 
-#ifdef WAL_DEBUG
-				if (XLOG_DEBUG ||
-					(rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
-					(rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
-				{
-					StringInfoData buf;
-
-					initStringInfo(&buf);
-					appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
-									 LSN_FORMAT_ARGS(ReadRecPtr),
-									 LSN_FORMAT_ARGS(EndRecPtr));
-					xlog_outrec(&buf, xlogreader);
-					appendStringInfoString(&buf, " - ");
-					xlog_outdesc(&buf, xlogreader);
-					elog(LOG, "%s", buf.data);
-					pfree(buf.data);
-				}
-#endif
+	total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t,
+												  CheckpointStats.ckpt_end_t);
 
-				/* Handle interrupt signals of startup process */
-				HandleStartupProcInterrupts();
+	/*
+	 * Timing values returned from CheckpointStats are in microseconds.
+	 * Convert to milliseconds for consistent printing.
+	 */
+	longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000);
 
-				/*
-				 * Pause WAL replay, if requested by a hot-standby session via
-				 * SetRecoveryPause().
-				 *
-				 * Note that we intentionally don't take the info_lck spinlock
-				 * here.  We might therefore read a slightly stale value of
-				 * the recoveryPause flag, but it can't be very stale (no
-				 * worse than the last spinlock we did acquire).  Since a
-				 * pause request is a pretty asynchronous thing anyway,
-				 * possibly responding to it one WAL record later than we
-				 * otherwise would is a minor issue, so it doesn't seem worth
-				 * adding another spinlock cycle to prevent that.
-				 */
-				if (((volatile XLogCtlData *) XLogCtl)->recoveryPauseState !=
-					RECOVERY_NOT_PAUSED)
-					recoveryPausesHere(false);
+	average_sync_time = 0;
+	if (CheckpointStats.ckpt_sync_rels > 0)
+		average_sync_time = CheckpointStats.ckpt_agg_sync_time /
+			CheckpointStats.ckpt_sync_rels;
+	average_msecs = (long) ((average_sync_time + 999) / 1000);
 
-				/*
-				 * Have we reached our recovery target?
-				 */
-				if (recoveryStopsBefore(xlogreader))
-				{
-					reachedRecoveryTarget = true;
-					break;
-				}
+	if (restartpoint)
+		ereport(LOG,
+				(errmsg("restartpoint complete: wrote %d buffers (%.1f%%); "
+						"%d WAL file(s) added, %d removed, %d recycled; "
+						"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
+						"sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
+						"distance=%d kB, estimate=%d kB",
+						CheckpointStats.ckpt_bufs_written,
+						(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
+						CheckpointStats.ckpt_segs_added,
+						CheckpointStats.ckpt_segs_removed,
+						CheckpointStats.ckpt_segs_recycled,
+						write_msecs / 1000, (int) (write_msecs % 1000),
+						sync_msecs / 1000, (int) (sync_msecs % 1000),
+						total_msecs / 1000, (int) (total_msecs % 1000),
+						CheckpointStats.ckpt_sync_rels,
+						longest_msecs / 1000, (int) (longest_msecs % 1000),
+						average_msecs / 1000, (int) (average_msecs % 1000),
+						(int) (PrevCheckPointDistance / 1024.0),
+						(int) (CheckPointDistanceEstimate / 1024.0))));
+	else
+		ereport(LOG,
+				(errmsg("checkpoint complete: wrote %d buffers (%.1f%%); "
+						"%d WAL file(s) added, %d removed, %d recycled; "
+						"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
+						"sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
+						"distance=%d kB, estimate=%d kB",
+						CheckpointStats.ckpt_bufs_written,
+						(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
+						CheckpointStats.ckpt_segs_added,
+						CheckpointStats.ckpt_segs_removed,
+						CheckpointStats.ckpt_segs_recycled,
+						write_msecs / 1000, (int) (write_msecs % 1000),
+						sync_msecs / 1000, (int) (sync_msecs % 1000),
+						total_msecs / 1000, (int) (total_msecs % 1000),
+						CheckpointStats.ckpt_sync_rels,
+						longest_msecs / 1000, (int) (longest_msecs % 1000),
+						average_msecs / 1000, (int) (average_msecs % 1000),
+						(int) (PrevCheckPointDistance / 1024.0),
+						(int) (CheckPointDistanceEstimate / 1024.0))));
+}
 
-				/*
-				 * If we've been asked to lag the primary, wait on latch until
-				 * enough time has passed.
-				 */
-				if (recoveryApplyDelay(xlogreader))
-				{
-					/*
-					 * We test for paused recovery again here. If user sets
-					 * delayed apply, it may be because they expect to pause
-					 * recovery in case of problems, so we must test again
-					 * here otherwise pausing during the delay-wait wouldn't
-					 * work.
-					 */
-					if (((volatile XLogCtlData *) XLogCtl)->recoveryPauseState !=
-						RECOVERY_NOT_PAUSED)
-						recoveryPausesHere(false);
-				}
-
-				/* Setup error traceback support for ereport() */
-				errcallback.callback = rm_redo_error_callback;
-				errcallback.arg = (void *) xlogreader;
-				errcallback.previous = error_context_stack;
-				error_context_stack = &errcallback;
-
-				/*
-				 * ShmemVariableCache->nextXid must be beyond record's xid.
-				 */
-				AdvanceNextFullTransactionIdPastXid(record->xl_xid);
-
-				/*
-				 * Before replaying this record, check if this record causes
-				 * the current timeline to change. The record is already
-				 * considered to be part of the new timeline, so we update
-				 * ThisTimeLineID before replaying it. That's important so
-				 * that replayEndTLI, which is recorded as the minimum
-				 * recovery point's TLI if recovery stops after this record,
-				 * is set correctly.
-				 */
-				if (record->xl_rmid == RM_XLOG_ID)
-				{
-					TimeLineID	newTLI = ThisTimeLineID;
-					TimeLineID	prevTLI = ThisTimeLineID;
-					uint8		info = record->xl_info & ~XLR_INFO_MASK;
-
-					if (info == XLOG_CHECKPOINT_SHUTDOWN)
-					{
-						CheckPoint	checkPoint;
-
-						memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
-						newTLI = checkPoint.ThisTimeLineID;
-						prevTLI = checkPoint.PrevTimeLineID;
-					}
-					else if (info == XLOG_END_OF_RECOVERY)
-					{
-						xl_end_of_recovery xlrec;
-
-						memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
-						newTLI = xlrec.ThisTimeLineID;
-						prevTLI = xlrec.PrevTimeLineID;
-					}
-
-					if (newTLI != ThisTimeLineID)
-					{
-						/* Check that it's OK to switch to this TLI */
-						checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
-
-						/* Following WAL records should be run with new TLI */
-						ThisTimeLineID = newTLI;
-						switchedTLI = true;
-					}
-				}
-
-				/*
-				 * Update shared replayEndRecPtr before replaying this record,
-				 * so that XLogFlush will update minRecoveryPoint correctly.
-				 */
-				SpinLockAcquire(&XLogCtl->info_lck);
-				XLogCtl->replayEndRecPtr = EndRecPtr;
-				XLogCtl->replayEndTLI = ThisTimeLineID;
-				SpinLockRelease(&XLogCtl->info_lck);
-
-				/*
-				 * If we are attempting to enter Hot Standby mode, process
-				 * XIDs we see
-				 */
-				if (standbyState >= STANDBY_INITIALIZED &&
-					TransactionIdIsValid(record->xl_xid))
-					RecordKnownAssignedTransactionIds(record->xl_xid);
-
-				/* Now apply the WAL record itself */
-				RmgrTable[record->xl_rmid].rm_redo(xlogreader);
-
-				/*
-				 * After redo, check whether the backup pages associated with
-				 * the WAL record are consistent with the existing pages. This
-				 * check is done only if consistency check is enabled for this
-				 * record.
-				 */
-				if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
-					checkXLogConsistency(xlogreader);
-
-				/* Pop the error context stack */
-				error_context_stack = errcallback.previous;
-
-				/*
-				 * Update lastReplayedEndRecPtr after this record has been
-				 * successfully replayed.
-				 */
-				SpinLockAcquire(&XLogCtl->info_lck);
-				XLogCtl->lastReplayedEndRecPtr = EndRecPtr;
-				XLogCtl->lastReplayedTLI = ThisTimeLineID;
-				SpinLockRelease(&XLogCtl->info_lck);
-
-				/*
-				 * If rm_redo called XLogRequestWalReceiverReply, then we wake
-				 * up the receiver so that it notices the updated
-				 * lastReplayedEndRecPtr and sends a reply to the primary.
-				 */
-				if (doRequestWalReceiverReply)
-				{
-					doRequestWalReceiverReply = false;
-					WalRcvForceReply();
-				}
-
-				/* Remember this record as the last-applied one */
-				LastRec = ReadRecPtr;
-
-				/* Allow read-only connections if we're consistent now */
-				CheckRecoveryConsistency();
-
-				/* Is this a timeline switch? */
-				if (switchedTLI)
-				{
-					/*
-					 * Before we continue on the new timeline, clean up any
-					 * (possibly bogus) future WAL segments on the old
-					 * timeline.
-					 */
-					RemoveNonParentXlogFiles(EndRecPtr, ThisTimeLineID);
-
-					/*
-					 * Wake up any walsenders to notice that we are on a new
-					 * timeline.
-					 */
-					if (AllowCascadeReplication())
-						WalSndWakeup();
-				}
-
-				/* Exit loop if we reached inclusive recovery target */
-				if (recoveryStopsAfter(xlogreader))
-				{
-					reachedRecoveryTarget = true;
-					break;
-				}
-
-				/* Else, try to fetch the next WAL record */
-				record = ReadRecord(xlogreader, LOG, false);
-			} while (record != NULL);
-
-			/*
-			 * end of main redo apply loop
-			 */
-
-			if (reachedRecoveryTarget)
-			{
-				if (!reachedConsistency)
-					ereport(FATAL,
-							(errmsg("requested recovery stop point is before consistent recovery point")));
-
-				/*
-				 * This is the last point where we can restart recovery with a
-				 * new recovery target, if we shutdown and begin again. After
-				 * this, Resource Managers may choose to do permanent
-				 * corrective actions at end of recovery.
-				 */
-				switch (recoveryTargetAction)
-				{
-					case RECOVERY_TARGET_ACTION_SHUTDOWN:
-
-						/*
-						 * exit with special return code to request shutdown
-						 * of postmaster.  Log messages issued from
-						 * postmaster.
-						 */
-						proc_exit(3);
+/*
+ * Update the estimate of distance between checkpoints.
+ *
+ * The estimate is used to calculate the number of WAL segments to keep
+ * preallocated, see XLOGfileslop().
+ */
+static void
+UpdateCheckPointDistanceEstimate(uint64 nbytes)
+{
+	/*
+	 * To estimate the number of segments consumed between checkpoints, keep a
+	 * moving average of the amount of WAL generated in previous checkpoint
+	 * cycles. However, if the load is bursty, with quiet periods and busy
+	 * periods, we want to cater for the peak load. So instead of a plain
+	 * moving average, let the average decline slowly if the previous cycle
+	 * used less WAL than estimated, but bump it up immediately if it used
+	 * more.
+	 *
+	 * When checkpoints are triggered by max_wal_size, this should converge to
+	 * CheckpointSegments * wal_segment_size,
+	 *
+	 * Note: This doesn't pay any attention to what caused the checkpoint.
+	 * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
+	 * starting a base backup, are counted the same as those created
+	 * automatically. The slow-decline will largely mask them out, if they are
+	 * not frequent. If they are frequent, it seems reasonable to count them
+	 * in as any others; if you issue a manual checkpoint every 5 minutes and
+	 * never let a timed checkpoint happen, it makes sense to base the
+	 * preallocation on that 5 minute interval rather than whatever
+	 * checkpoint_timeout is set to.
+	 */
+	PrevCheckPointDistance = nbytes;
+	if (CheckPointDistanceEstimate < nbytes)
+		CheckPointDistanceEstimate = nbytes;
+	else
+		CheckPointDistanceEstimate =
+			(0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
+}
 
-					case RECOVERY_TARGET_ACTION_PAUSE:
-						SetRecoveryPause(true);
-						recoveryPausesHere(true);
+/*
+ * Update the ps display for a process running a checkpoint.  Note that
+ * this routine should not do any allocations so as it can be called
+ * from a critical section.
+ */
+static void
+update_checkpoint_display(int flags, bool restartpoint, bool reset)
+{
+	/*
+	 * The status is reported only for end-of-recovery and shutdown
+	 * checkpoints or shutdown restartpoints.  Updating the ps display is
+	 * useful in those situations as it may not be possible to rely on
+	 * pg_stat_activity to see the status of the checkpointer or the startup
+	 * process.
+	 */
+	if ((flags & (CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IS_SHUTDOWN)) == 0)
+		return;
 
-						/* drop into promote */
+	if (reset)
+		set_ps_display("");
+	else
+	{
+		char		activitymsg[128];
 
-					case RECOVERY_TARGET_ACTION_PROMOTE:
-						break;
-				}
-			}
+		snprintf(activitymsg, sizeof(activitymsg), "performing %s%s%s",
+				 (flags & CHECKPOINT_END_OF_RECOVERY) ? "end-of-recovery " : "",
+				 (flags & CHECKPOINT_IS_SHUTDOWN) ? "shutdown " : "",
+				 restartpoint ? "restartpoint" : "checkpoint");
+		set_ps_display(activitymsg);
+	}
+}
 
-			/* Allow resource managers to do any required cleanup. */
-			for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
-			{
-				if (RmgrTable[rmid].rm_cleanup != NULL)
-					RmgrTable[rmid].rm_cleanup();
-			}
 
-			ereport(LOG,
-					(errmsg("redo done at %X/%X system usage: %s",
-							LSN_FORMAT_ARGS(ReadRecPtr),
-							pg_rusage_show(&ru0))));
-			xtime = GetLatestXTime();
-			if (xtime)
-				ereport(LOG,
-						(errmsg("last completed transaction was at log time %s",
-								timestamptz_to_str(xtime))));
-
-			InRedo = false;
-		}
-		else
-		{
-			/* there are no WAL records following the checkpoint */
-			ereport(LOG,
-					(errmsg("redo is not required")));
+/*
+ * Perform a checkpoint --- either during shutdown, or on-the-fly
+ *
+ * flags is a bitwise OR of the following:
+ *	CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
+ *	CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
+ *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
+ *		ignoring checkpoint_completion_target parameter.
+ *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
+ *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
+ *		CHECKPOINT_END_OF_RECOVERY).
+ *	CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
+ *
+ * Note: flags contains other bits, of interest here only for logging purposes.
+ * In particular note that this routine is synchronous and does not pay
+ * attention to CHECKPOINT_WAIT.
+ *
+ * If !shutdown then we are writing an online checkpoint. This is a very special
+ * kind of operation and WAL record because the checkpoint action occurs over
+ * a period of time yet logically occurs at just a single LSN. The logical
+ * position of the WAL record (redo ptr) is the same or earlier than the
+ * physical position. When we replay WAL we locate the checkpoint via its
+ * physical position then read the redo ptr and actually start replay at the
+ * earlier logical position. Note that we don't write *anything* to WAL at
+ * the logical position, so that location could be any other kind of WAL record.
+ * All of this mechanism allows us to continue working while we checkpoint.
+ * As a result, timing of actions is critical here and be careful to note that
+ * this function will likely take minutes to execute on a busy system.
+ */
+void
+CreateCheckPoint(int flags)
+{
+	bool		shutdown;
+	CheckPoint	checkPoint;
+	XLogRecPtr	recptr;
+	XLogSegNo	_logSegNo;
+	XLogCtlInsert *Insert = &XLogCtl->Insert;
+	uint32		freespace;
+	XLogRecPtr	PriorRedoPtr;
+	XLogRecPtr	curInsert;
+	XLogRecPtr	last_important_lsn;
+	VirtualTransactionId *vxids;
+	int			nvxids;
 
-		}
+	/*
+	 * An end-of-recovery checkpoint is really a shutdown checkpoint, just
+	 * issued at a different time.
+	 */
+	if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
+		shutdown = true;
+	else
+		shutdown = false;
 
-		/*
-		 * This check is intentionally after the above log messages that
-		 * indicate how far recovery went.
-		 */
-		if (ArchiveRecoveryRequested &&
-			recoveryTarget != RECOVERY_TARGET_UNSET &&
-			!reachedRecoveryTarget)
-			ereport(FATAL,
-					(errmsg("recovery ended before configured recovery target was reached")));
-	}
+	/* sanity check */
+	if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
+		elog(ERROR, "can't create a checkpoint during recovery");
 
 	/*
-	 * Kill WAL receiver, if it's still running, before we continue to write
-	 * the startup checkpoint record. It will trump over the checkpoint and
-	 * subsequent records if it's still alive when we start writing WAL.
+	 * Initialize InitXLogInsert working areas before entering the critical
+	 * section.  Normally, this is done by the first call to
+	 * RecoveryInProgress() or LocalSetXLogInsertAllowed(), but when creating
+	 * an end-of-recovery checkpoint, the LocalSetXLogInsertAllowed call is
+	 * done below in a critical section, and InitXLogInsert cannot be called
+	 * in a critical section.
 	 */
-	ShutdownWalRcv();
+	InitXLogInsert();
 
 	/*
-	 * Reset unlogged relations to the contents of their INIT fork. This is
-	 * done AFTER recovery is complete so as to include any unlogged relations
-	 * created during recovery, but BEFORE recovery is marked as having
-	 * completed successfully. Otherwise we'd not retry if any of the post
-	 * end-of-recovery steps fail.
+	 * Prepare to accumulate statistics.
+	 *
+	 * Note: because it is possible for log_checkpoints to change while a
+	 * checkpoint proceeds, we always accumulate stats, even if
+	 * log_checkpoints is currently off.
 	 */
-	if (InRecovery)
-		ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
+	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
+	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
 
 	/*
-	 * We don't need the latch anymore. It's not strictly necessary to disown
-	 * it, but let's do it for the sake of tidiness.
+	 * Use a critical section to force system panic if we have trouble.
 	 */
-	if (ArchiveRecoveryRequested)
-		DisownLatch(&XLogCtl->recoveryWakeupLatch);
+	START_CRIT_SECTION();
+
+	if (shutdown)
+	{
+		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+		ControlFile->state = DB_SHUTDOWNING;
+		ControlFile->time = (pg_time_t) time(NULL);
+		UpdateControlFile();
+		LWLockRelease(ControlFileLock);
+	}
 
 	/*
-	 * We are now done reading the xlog from stream. Turn off streaming
-	 * recovery to force fetching the files (which would be required at end of
-	 * recovery, e.g., timeline history file) from archive or pg_wal.
-	 *
-	 * Note that standby mode must be turned off after killing WAL receiver,
-	 * i.e., calling ShutdownWalRcv().
+	 * Let smgr prepare for checkpoint; this has to happen before we determine
+	 * the REDO pointer.  Note that smgr must not do anything that'd have to
+	 * be undone if we decide no checkpoint is needed.
 	 */
-	Assert(!WalRcvStreaming());
-	StandbyMode = false;
+	SyncPreCheckpoint();
+
+	/* Begin filling in the checkpoint WAL record */
+	MemSet(&checkPoint, 0, sizeof(checkPoint));
+	checkPoint.time = (pg_time_t) time(NULL);
 
 	/*
-	 * Re-fetch the last valid or last applied record, so we can identify the
-	 * exact endpoint of what we consider the valid portion of WAL.
+	 * For Hot Standby, derive the oldestActiveXid before we fix the redo
+	 * pointer. This allows us to begin accumulating changes to assemble our
+	 * starting snapshot of locks and transactions.
 	 */
-	XLogBeginRead(xlogreader, LastRec);
-	record = ReadRecord(xlogreader, PANIC, false);
-	EndOfLog = EndRecPtr;
+	if (!shutdown && XLogStandbyInfoActive())
+		checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
+	else
+		checkPoint.oldestActiveXid = InvalidTransactionId;
 
 	/*
-	 * EndOfLogTLI is the TLI in the filename of the XLOG segment containing
-	 * the end-of-log. It could be different from the timeline that EndOfLog
-	 * nominally belongs to, if there was a timeline switch in that segment,
-	 * and we were reading the old WAL from a segment belonging to a higher
-	 * timeline.
+	 * Get location of last important record before acquiring insert locks (as
+	 * GetLastImportantRecPtr() also locks WAL locks).
 	 */
-	EndOfLogTLI = xlogreader->seg.ws_tli;
+	last_important_lsn = GetLastImportantRecPtr();
 
 	/*
-	 * Complain if we did not roll forward far enough to render the backup
-	 * dump consistent.  Note: it is indeed okay to look at the local variable
-	 * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
-	 * be further ahead --- ControlFile->minRecoveryPoint cannot have been
-	 * advanced beyond the WAL we processed.
+	 * We must block concurrent insertions while examining insert state to
+	 * determine the checkpoint REDO pointer.
 	 */
-	if (InRecovery &&
-		(EndOfLog < minRecoveryPoint ||
-		 !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
+	WALInsertLockAcquireExclusive();
+	curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
+
+	/*
+	 * If this isn't a shutdown or forced checkpoint, and if there has been no
+	 * WAL activity requiring a checkpoint, skip it.  The idea here is to
+	 * avoid inserting duplicate checkpoints when the system is idle.
+	 */
+	if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
+				  CHECKPOINT_FORCE)) == 0)
 	{
-		/*
-		 * Ran off end of WAL before reaching end-of-backup WAL record, or
-		 * minRecoveryPoint. That's usually a bad sign, indicating that you
-		 * tried to recover from an online backup but never called
-		 * pg_stop_backup(), or you didn't archive all the WAL up to that
-		 * point. However, this also happens in crash recovery, if the system
-		 * crashes while an online backup is in progress. We must not treat
-		 * that as an error, or the database will refuse to start up.
-		 */
-		if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
+		if (last_important_lsn == ControlFile->checkPoint)
 		{
-			if (ControlFile->backupEndRequired)
-				ereport(FATAL,
-						(errmsg("WAL ends before end of online backup"),
-						 errhint("All WAL generated while online backup was taken must be available at recovery.")));
-			else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
-				ereport(FATAL,
-						(errmsg("WAL ends before end of online backup"),
-						 errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
-			else
-				ereport(FATAL,
-						(errmsg("WAL ends before consistent recovery point")));
+			WALInsertLockRelease();
+			END_CRIT_SECTION();
+			ereport(DEBUG1,
+					(errmsg_internal("checkpoint skipped because system is idle")));
+			return;
 		}
 	}
 
 	/*
-	 * Pre-scan prepared transactions to find out the range of XIDs present.
-	 * This information is not quite needed yet, but it is positioned here so
-	 * as potential problems are detected before any on-disk change is done.
+	 * An end-of-recovery checkpoint is created before anyone is allowed to
+	 * write WAL. To allow us to write the checkpoint record, temporarily
+	 * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
+	 * initialized, which we need here and in AdvanceXLInsertBuffer.)
 	 */
-	oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
+	if (flags & CHECKPOINT_END_OF_RECOVERY)
+		LocalSetXLogInsertAllowed();
+
+	checkPoint.ThisTimeLineID = ThisTimeLineID;
+	if (flags & CHECKPOINT_END_OF_RECOVERY)
+		checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
+	else
+		checkPoint.PrevTimeLineID = ThisTimeLineID;
+
+	checkPoint.fullPageWrites = Insert->fullPageWrites;
 
 	/*
-	 * Consider whether we need to assign a new timeline ID.
-	 *
-	 * If we are doing an archive recovery, we always assign a new ID.  This
-	 * handles a couple of issues.  If we stopped short of the end of WAL
-	 * during recovery, then we are clearly generating a new timeline and must
-	 * assign it a unique new ID.  Even if we ran to the end, modifying the
-	 * current last segment is problematic because it may result in trying to
-	 * overwrite an already-archived copy of that segment, and we encourage
-	 * DBAs to make their archive_commands reject that.  We can dodge the
-	 * problem by making the new active segment have a new timeline ID.
+	 * Compute new REDO record ptr = location of next XLOG record.
 	 *
-	 * In a normal crash recovery, we can just extend the timeline we were in.
+	 * NB: this is NOT necessarily where the checkpoint record itself will be,
+	 * since other backends may insert more XLOG records while we're off doing
+	 * the buffer flush work.  Those XLOG records are logically after the
+	 * checkpoint, even though physically before it.  Got that?
 	 */
-	PrevTimeLineID = ThisTimeLineID;
-	if (ArchiveRecoveryRequested)
+	freespace = INSERT_FREESPACE(curInsert);
+	if (freespace == 0)
 	{
-		char		reason[200];
-		char		recoveryPath[MAXPGPATH];
-
-		Assert(InArchiveRecovery);
-
-		ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
-		ereport(LOG,
-				(errmsg("selected new timeline ID: %u", ThisTimeLineID)));
-
-		/*
-		 * Create a comment for the history file to explain why and where
-		 * timeline changed.
-		 */
-		if (recoveryTarget == RECOVERY_TARGET_XID)
-			snprintf(reason, sizeof(reason),
-					 "%s transaction %u",
-					 recoveryStopAfter ? "after" : "before",
-					 recoveryStopXid);
-		else if (recoveryTarget == RECOVERY_TARGET_TIME)
-			snprintf(reason, sizeof(reason),
-					 "%s %s\n",
-					 recoveryStopAfter ? "after" : "before",
-					 timestamptz_to_str(recoveryStopTime));
-		else if (recoveryTarget == RECOVERY_TARGET_LSN)
-			snprintf(reason, sizeof(reason),
-					 "%s LSN %X/%X\n",
-					 recoveryStopAfter ? "after" : "before",
-					 LSN_FORMAT_ARGS(recoveryStopLSN));
-		else if (recoveryTarget == RECOVERY_TARGET_NAME)
-			snprintf(reason, sizeof(reason),
-					 "at restore point \"%s\"",
-					 recoveryStopName);
-		else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
-			snprintf(reason, sizeof(reason), "reached consistency");
+		if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
+			curInsert += SizeOfXLogLongPHD;
 		else
-			snprintf(reason, sizeof(reason), "no recovery target specified");
-
-		/*
-		 * We are now done reading the old WAL.  Turn off archive fetching if
-		 * it was active, and make a writable copy of the last WAL segment.
-		 * (Note that we also have a copy of the last block of the old WAL in
-		 * readBuf; we will use that below.)
-		 */
-		exitArchiveRecovery(EndOfLogTLI, EndOfLog);
-
-		/*
-		 * Write the timeline history file, and have it archived. After this
-		 * point (or rather, as soon as the file is archived), the timeline
-		 * will appear as "taken" in the WAL archive and to any standby
-		 * servers.  If we crash before actually switching to the new
-		 * timeline, standby servers will nevertheless think that we switched
-		 * to the new timeline, and will try to connect to the new timeline.
-		 * To minimize the window for that, try to do as little as possible
-		 * between here and writing the end-of-recovery record.
-		 */
-		writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
-							 EndRecPtr, reason);
+			curInsert += SizeOfXLogShortPHD;
+	}
+	checkPoint.redo = curInsert;
 
-		/*
-		 * Since there might be a partial WAL segment named RECOVERYXLOG, get
-		 * rid of it.
-		 */
-		snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
-		unlink(recoveryPath);	/* ignore any error */
+	/*
+	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
+	 * must be done while holding all the insertion locks.
+	 *
+	 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
+	 * pointing past where it really needs to point.  This is okay; the only
+	 * consequence is that XLogInsert might back up whole buffers that it
+	 * didn't really need to.  We can't postpone advancing RedoRecPtr because
+	 * XLogInserts that happen while we are dumping buffers must assume that
+	 * their buffer changes are not included in the checkpoint.
+	 */
+	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
 
-		/* Get rid of any remaining recovered timeline-history file, too */
-		snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
-		unlink(recoveryPath);	/* ignore any error */
-	}
+	/*
+	 * Now we can release the WAL insertion locks, allowing other xacts to
+	 * proceed while we are flushing disk buffers.
+	 */
+	WALInsertLockRelease();
 
-	/* Save the selected TimeLineID in shared memory, too */
-	XLogCtl->ThisTimeLineID = ThisTimeLineID;
-	XLogCtl->PrevTimeLineID = PrevTimeLineID;
+	/* Update the info_lck-protected copy of RedoRecPtr as well */
+	SpinLockAcquire(&XLogCtl->info_lck);
+	XLogCtl->RedoRecPtr = checkPoint.redo;
+	SpinLockRelease(&XLogCtl->info_lck);
 
 	/*
-	 * Prepare to write WAL starting at EndOfLog location, and init xlog
-	 * buffer cache using the block containing the last record from the
-	 * previous incarnation.
+	 * If enabled, log checkpoint start.  We postpone this until now so as not
+	 * to log anything if we decided to skip the checkpoint.
 	 */
-	Insert = &XLogCtl->Insert;
-	Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
-	Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
+	if (log_checkpoints)
+		LogCheckpointStart(flags, false);
+
+	/* Update the process title */
+	update_checkpoint_display(flags, false, false);
+
+	TRACE_POSTGRESQL_CHECKPOINT_START(flags);
 
 	/*
-	 * Tricky point here: readBuf contains the *last* block that the LastRec
-	 * record spans, not the one it starts in.  The last block is indeed the
-	 * one we want to use.
+	 * Get the other info we need for the checkpoint record.
+	 *
+	 * We don't need to save oldestClogXid in the checkpoint, it only matters
+	 * for the short period in which clog is being truncated, and if we crash
+	 * during that we'll redo the clog truncation and fix up oldestClogXid
+	 * there.
 	 */
-	if (EndOfLog % XLOG_BLCKSZ != 0)
-	{
-		char	   *page;
-		int			len;
-		int			firstIdx;
-		XLogRecPtr	pageBeginPtr;
+	LWLockAcquire(XidGenLock, LW_SHARED);
+	checkPoint.nextXid = ShmemVariableCache->nextXid;
+	checkPoint.oldestXid = ShmemVariableCache->oldestXid;
+	checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
+	LWLockRelease(XidGenLock);
 
-		pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
-		Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
+	LWLockAcquire(CommitTsLock, LW_SHARED);
+	checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
+	checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
+	LWLockRelease(CommitTsLock);
 
-		firstIdx = XLogRecPtrToBufIdx(EndOfLog);
+	LWLockAcquire(OidGenLock, LW_SHARED);
+	checkPoint.nextOid = ShmemVariableCache->nextOid;
+	if (!shutdown)
+		checkPoint.nextOid += ShmemVariableCache->oidCount;
+	LWLockRelease(OidGenLock);
 
-		/* Copy the valid part of the last block, and zero the rest */
-		page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
-		len = EndOfLog % XLOG_BLCKSZ;
-		memcpy(page, xlogreader->readBuf, len);
-		memset(page + len, 0, XLOG_BLCKSZ - len);
+	MultiXactGetCheckptMulti(shutdown,
+							 &checkPoint.nextMulti,
+							 &checkPoint.nextMultiOffset,
+							 &checkPoint.oldestMulti,
+							 &checkPoint.oldestMultiDB);
 
-		XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
-		XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
-	}
-	else
+	/*
+	 * Having constructed the checkpoint record, ensure all shmem disk buffers
+	 * and commit-log buffers are flushed to disk.
+	 *
+	 * This I/O could fail for various reasons.  If so, we will fail to
+	 * complete the checkpoint, but there is no reason to force a system
+	 * panic. Accordingly, exit critical section while doing it.
+	 */
+	END_CRIT_SECTION();
+
+	/*
+	 * In some cases there are groups of actions that must all occur on one
+	 * side or the other of a checkpoint record. Before flushing the
+	 * checkpoint record we must explicitly wait for any backend currently
+	 * performing those groups of actions.
+	 *
+	 * One example is end of transaction, so we must wait for any transactions
+	 * that are currently in commit critical sections.  If an xact inserted
+	 * its commit record into XLOG just before the REDO point, then a crash
+	 * restart from the REDO point would not replay that record, which means
+	 * that our flushing had better include the xact's update of pg_xact.  So
+	 * we wait till he's out of his commit critical section before proceeding.
+	 * See notes in RecordTransactionCommit().
+	 *
+	 * Because we've already released the insertion locks, this test is a bit
+	 * fuzzy: it is possible that we will wait for xacts we didn't really need
+	 * to wait for.  But the delay should be short and it seems better to make
+	 * checkpoint take a bit longer than to hold off insertions longer than
+	 * necessary. (In fact, the whole reason we have this issue is that xact.c
+	 * does commit record XLOG insertion and clog update as two separate steps
+	 * protected by different locks, but again that seems best on grounds of
+	 * minimizing lock contention.)
+	 *
+	 * A transaction that has not yet set delayChkpt when we look cannot be at
+	 * risk, since he's not inserted his commit record yet; and one that's
+	 * already cleared it is not at risk either, since he's done fixing clog
+	 * and we will correctly flush the update below.  So we cannot miss any
+	 * xacts we need to wait for.
+	 */
+	vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
+	if (nvxids > 0)
 	{
-		/*
-		 * There is no partial block to copy. Just set InitializedUpTo, and
-		 * let the first attempt to insert a log record to initialize the next
-		 * buffer.
-		 */
-		XLogCtl->InitializedUpTo = EndOfLog;
+		do
+		{
+			pg_usleep(10000L);	/* wait for 10 msec */
+		} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
 	}
+	pfree(vxids);
 
-	LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
+	CheckPointGuts(checkPoint.redo, flags);
 
-	XLogCtl->LogwrtResult = LogwrtResult;
+	/*
+	 * Take a snapshot of running transactions and write this to WAL. This
+	 * allows us to reconstruct the state of running transactions during
+	 * archive recovery, if required. Skip, if this info disabled.
+	 *
+	 * If we are shutting down, or Startup process is completing crash
+	 * recovery we don't need to write running xact data.
+	 */
+	if (!shutdown && XLogStandbyInfoActive())
+		LogStandbySnapshot();
 
-	XLogCtl->LogwrtRqst.Write = EndOfLog;
-	XLogCtl->LogwrtRqst.Flush = EndOfLog;
+	START_CRIT_SECTION();
 
 	/*
-	 * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
-	 * record before resource manager writes cleanup WAL records or checkpoint
-	 * record is written.
+	 * Now insert the checkpoint record into XLOG.
 	 */
-	Insert->fullPageWrites = lastFullPageWrites;
-	LocalSetXLogInsertAllowed();
-	UpdateFullPageWrites();
-	LocalXLogInsertAllowed = -1;
-
-	if (InRecovery)
-	{
-		/*
-		 * Perform a checkpoint to update all our recovery activity to disk.
-		 *
-		 * Note that we write a shutdown checkpoint rather than an on-line
-		 * one. This is not particularly critical, but since we may be
-		 * assigning a new TLI, using a shutdown checkpoint allows us to have
-		 * the rule that TLI only changes in shutdown checkpoints, which
-		 * allows some extra error checking in xlog_redo.
-		 *
-		 * In promotion, only create a lightweight end-of-recovery record
-		 * instead of a full checkpoint. A checkpoint is requested later,
-		 * after we're fully out of recovery mode and already accepting
-		 * queries.
-		 */
-		if (bgwriterLaunched)
-		{
-			if (LocalPromoteIsTriggered)
-			{
-				checkPointLoc = ControlFile->checkPoint;
-
-				/*
-				 * Confirm the last checkpoint is available for us to recover
-				 * from if we fail.
-				 */
-				record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
-				if (record != NULL)
-				{
-					promoted = true;
+	XLogBeginInsert();
+	XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
+	recptr = XLogInsert(RM_XLOG_ID,
+						shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
+						XLOG_CHECKPOINT_ONLINE);
 
-					/*
-					 * Insert a special WAL record to mark the end of
-					 * recovery, since we aren't doing a checkpoint. That
-					 * means that the checkpointer process may likely be in
-					 * the middle of a time-smoothed restartpoint and could
-					 * continue to be for minutes after this. That sounds
-					 * strange, but the effect is roughly the same and it
-					 * would be stranger to try to come out of the
-					 * restartpoint and then checkpoint. We request a
-					 * checkpoint later anyway, just for safety.
-					 */
-					CreateEndOfRecoveryRecord();
-				}
-			}
+	XLogFlush(recptr);
 
-			if (!promoted)
-				RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
-								  CHECKPOINT_IMMEDIATE |
-								  CHECKPOINT_WAIT);
-		}
+	/*
+	 * We mustn't write any new WAL after a shutdown checkpoint, or it will be
+	 * overwritten at next startup.  No-one should even try, this just allows
+	 * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
+	 * to just temporarily disable writing until the system has exited
+	 * recovery.
+	 */
+	if (shutdown)
+	{
+		if (flags & CHECKPOINT_END_OF_RECOVERY)
+			LocalXLogInsertAllowed = -1;	/* return to "check" state */
 		else
-			CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
+			LocalXLogInsertAllowed = 0; /* never again write WAL */
 	}
 
-	if (ArchiveRecoveryRequested)
-	{
-		/*
-		 * And finally, execute the recovery_end_command, if any.
-		 */
-		if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
-			ExecuteRecoveryCommand(recoveryEndCommand,
-								   "recovery_end_command",
-								   true);
-
-		/*
-		 * We switched to a new timeline. Clean up segments on the old
-		 * timeline.
-		 *
-		 * If there are any higher-numbered segments on the old timeline,
-		 * remove them. They might contain valid WAL, but they might also be
-		 * pre-allocated files containing garbage. In any case, they are not
-		 * part of the new timeline's history so we don't need them.
-		 */
-		RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID);
-
-		/*
-		 * If the switch happened in the middle of a segment, what to do with
-		 * the last, partial segment on the old timeline? If we don't archive
-		 * it, and the server that created the WAL never archives it either
-		 * (e.g. because it was hit by a meteor), it will never make it to the
-		 * archive. That's OK from our point of view, because the new segment
-		 * that we created with the new TLI contains all the WAL from the old
-		 * timeline up to the switch point. But if you later try to do PITR to
-		 * the "missing" WAL on the old timeline, recovery won't find it in
-		 * the archive. It's physically present in the new file with new TLI,
-		 * but recovery won't look there when it's recovering to the older
-		 * timeline. On the other hand, if we archive the partial segment, and
-		 * the original server on that timeline is still running and archives
-		 * the completed version of the same segment later, it will fail. (We
-		 * used to do that in 9.4 and below, and it caused such problems).
-		 *
-		 * As a compromise, we rename the last segment with the .partial
-		 * suffix, and archive it. Archive recovery will never try to read
-		 * .partial segments, so they will normally go unused. But in the odd
-		 * PITR case, the administrator can copy them manually to the pg_wal
-		 * directory (removing the suffix). They can be useful in debugging,
-		 * too.
-		 *
-		 * If a .done or .ready file already exists for the old timeline,
-		 * however, we had already determined that the segment is complete, so
-		 * we can let it be archived normally. (In particular, if it was
-		 * restored from the archive to begin with, it's expected to have a
-		 * .done file).
-		 */
-		if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
-			XLogArchivingActive())
-		{
-			char		origfname[MAXFNAMELEN];
-			XLogSegNo	endLogSegNo;
-
-			XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
-			XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
-
-			if (!XLogArchiveIsReadyOrDone(origfname))
-			{
-				char		origpath[MAXPGPATH];
-				char		partialfname[MAXFNAMELEN];
-				char		partialpath[MAXPGPATH];
-
-				XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
-				snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
-				snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
-
-				/*
-				 * Make sure there's no .done or .ready file for the .partial
-				 * file.
-				 */
-				XLogArchiveCleanup(partialfname);
+	/*
+	 * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
+	 * = end of actual checkpoint record.
+	 */
+	if (shutdown && checkPoint.redo != ProcLastRecPtr)
+		ereport(PANIC,
+				(errmsg("concurrent write-ahead log activity while database system is shutting down")));
 
-				durable_rename(origpath, partialpath, ERROR);
-				XLogArchiveNotify(partialfname);
-			}
-		}
-	}
+	/*
+	 * Remember the prior checkpoint's redo ptr for
+	 * UpdateCheckPointDistanceEstimate()
+	 */
+	PriorRedoPtr = ControlFile->checkPointCopy.redo;
 
 	/*
-	 * Preallocate additional log files, if wanted.
+	 * Update the control file.
 	 */
-	PreallocXlogFiles(EndOfLog);
+	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+	if (shutdown)
+		ControlFile->state = DB_SHUTDOWNED;
+	ControlFile->checkPoint = ProcLastRecPtr;
+	ControlFile->checkPointCopy = checkPoint;
+	ControlFile->time = (pg_time_t) time(NULL);
+	/* crash recovery should always recover to the end of WAL */
+	ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
+	ControlFile->minRecoveryPointTLI = 0;
 
 	/*
-	 * Okay, we're officially UP.
+	 * Persist unloggedLSN value. It's reset on crash recovery, so this goes
+	 * unused on non-shutdown checkpoints, but seems useful to store it always
+	 * for debugging purposes.
 	 */
-	InRecovery = false;
+	SpinLockAcquire(&XLogCtl->ulsn_lck);
+	ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
+	SpinLockRelease(&XLogCtl->ulsn_lck);
 
-	/* start the archive_timeout timer and LSN running */
-	XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
-	XLogCtl->lastSegSwitchLSN = EndOfLog;
+	UpdateControlFile();
+	LWLockRelease(ControlFileLock);
 
-	/* also initialize latestCompletedXid, to nextXid - 1 */
-	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-	ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
-	FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid);
-	LWLockRelease(ProcArrayLock);
+	/* Update shared-memory copy of checkpoint XID/epoch */
+	SpinLockAcquire(&XLogCtl->info_lck);
+	XLogCtl->ckptFullXid = checkPoint.nextXid;
+	SpinLockRelease(&XLogCtl->info_lck);
 
 	/*
-	 * Start up subtrans, if not already done for hot standby.  (commit
-	 * timestamps are started below, if necessary.)
+	 * We are now done with critical updates; no need for system panic if we
+	 * have trouble while fooling with old log segments.
 	 */
-	if (standbyState == STANDBY_DISABLED)
-		StartupSUBTRANS(oldestActiveXID);
+	END_CRIT_SECTION();
 
 	/*
-	 * Perform end of recovery actions for any SLRUs that need it.
+	 * Let smgr do post-checkpoint cleanup (eg, deleting old files).
 	 */
-	TrimCLOG();
-	TrimMultiXact();
-
-	/* Reload shared-memory state for prepared transactions */
-	RecoverPreparedTransactions();
+	SyncPostCheckpoint();
 
 	/*
-	 * Shutdown the recovery environment. This must occur after
-	 * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
+	 * Update the average distance between checkpoints if the prior checkpoint
+	 * exists.
 	 */
-	if (standbyState != STANDBY_DISABLED)
-		ShutdownRecoveryTransactionEnvironment();
-
-	/* Shut down xlogreader */
-	if (readFile >= 0)
-	{
-		close(readFile);
-		readFile = -1;
-	}
-	XLogReaderFree(xlogreader);
+	if (PriorRedoPtr != InvalidXLogRecPtr)
+		UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
 
 	/*
-	 * If any of the critical GUCs have changed, log them before we allow
-	 * backends to write WAL.
+	 * Delete old log files, those no longer needed for last checkpoint to
+	 * prevent the disk holding the xlog from growing full.
 	 */
-	LocalSetXLogInsertAllowed();
-	XLogReportParameters();
+	XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
+	KeepLogSeg(recptr, &_logSegNo);
+	InvalidateObsoleteReplicationSlots(_logSegNo);
+	_logSegNo--;
+	RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr);
 
 	/*
-	 * Local WAL inserts enabled, so it's time to finish initialization of
-	 * commit timestamp.
+	 * Make more log segments if needed.  (Do this after recycling old log
+	 * segments, since that may supply some of the needed files.)
 	 */
-	CompleteCommitTsInitialization();
+	if (!shutdown)
+		PreallocXlogFiles(recptr);
 
 	/*
-	 * All done with end-of-recovery actions.
-	 *
-	 * Now allow backends to write WAL and update the control file status in
-	 * consequence.  SharedRecoveryState, that controls if backends can write
-	 * WAL, is updated while holding ControlFileLock to prevent other backends
-	 * to look at an inconsistent state of the control file in shared memory.
-	 * There is still a small window during which backends can write WAL and
-	 * the control file is still referring to a system not in DB_IN_PRODUCTION
-	 * state while looking at the on-disk control file.
-	 *
-	 * Also, we use info_lck to update SharedRecoveryState to ensure that
-	 * there are no race conditions concerning visibility of other recent
-	 * updates to shared memory.
-	 */
-	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-	ControlFile->state = DB_IN_PRODUCTION;
-	ControlFile->time = (pg_time_t) time(NULL);
-
-	SpinLockAcquire(&XLogCtl->info_lck);
-	XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
-	SpinLockRelease(&XLogCtl->info_lck);
+	 * Truncate pg_subtrans if possible.  We can throw away all data before
+	 * the oldest XMIN of any running transaction.  No future transaction will
+	 * attempt to reference any pg_subtrans entry older than that (see Asserts
+	 * in subtrans.c).  During recovery, though, we mustn't do this because
+	 * StartupSUBTRANS hasn't been called yet.
+	 */
+	if (!RecoveryInProgress())
+		TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
 
-	UpdateControlFile();
-	LWLockRelease(ControlFileLock);
+	/* Real work is done, but log and update stats before releasing lock. */
+	LogCheckpointEnd(false);
 
-	/*
-	 * If there were cascading standby servers connected to us, nudge any wal
-	 * sender processes to notice that we've been promoted.
-	 */
-	WalSndWakeup();
+	/* Reset the process title */
+	update_checkpoint_display(flags, false, true);
 
-	/*
-	 * If this was a promotion, request an (online) checkpoint now. This isn't
-	 * required for consistency, but the last restartpoint might be far back,
-	 * and in case of a crash, recovering from it might take a longer than is
-	 * appropriate now that we're not in standby mode anymore.
-	 */
-	if (promoted)
-		RequestCheckpoint(CHECKPOINT_FORCE);
+	TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
+									 NBuffers,
+									 CheckpointStats.ckpt_segs_added,
+									 CheckpointStats.ckpt_segs_removed,
+									 CheckpointStats.ckpt_segs_recycled);
 }
 
 /*
- * Checks if recovery has reached a consistent state. When consistency is
- * reached and we have a valid starting standby snapshot, tell postmaster
- * that it can start accepting read-only connections.
+ * Mark the end of recovery in WAL though without running a full checkpoint.
+ * We can expect that a restartpoint is likely to be in progress as we
+ * do this, though we are unwilling to wait for it to complete.
+ *
+ * CreateRestartPoint() allows for the case where recovery may end before
+ * the restartpoint completes so there is no concern of concurrent behaviour.
  */
 static void
-CheckRecoveryConsistency(void)
+CreateEndOfRecoveryRecord(void)
 {
-	XLogRecPtr	lastReplayedEndRecPtr;
-
-	/*
-	 * During crash recovery, we don't reach a consistent state until we've
-	 * replayed all the WAL.
-	 */
-	if (XLogRecPtrIsInvalid(minRecoveryPoint))
-		return;
-
-	Assert(InArchiveRecovery);
-
-	/*
-	 * assume that we are called in the startup process, and hence don't need
-	 * a lock to read lastReplayedEndRecPtr
-	 */
-	lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;
+	xl_end_of_recovery xlrec;
+	XLogRecPtr	recptr;
 
-	/*
-	 * Have we reached the point where our base backup was completed?
-	 */
-	if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
-		ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
-	{
-		/*
-		 * We have reached the end of base backup, as indicated by pg_control.
-		 * The data on disk is now consistent. Reset backupStartPoint and
-		 * backupEndPoint, and update minRecoveryPoint to make sure we don't
-		 * allow starting up at an earlier point even if recovery is stopped
-		 * and restarted soon after this.
-		 */
-		elog(DEBUG1, "end of backup reached");
+	/* sanity check */
+	if (!RecoveryInProgress())
+		elog(ERROR, "can only be used to end recovery");
 
-		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+	xlrec.end_time = GetCurrentTimestamp();
 
-		if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
-			ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
+	WALInsertLockAcquireExclusive();
+	xlrec.ThisTimeLineID = ThisTimeLineID;
+	xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
+	WALInsertLockRelease();
 
-		ControlFile->backupStartPoint = InvalidXLogRecPtr;
-		ControlFile->backupEndPoint = InvalidXLogRecPtr;
-		ControlFile->backupEndRequired = false;
-		UpdateControlFile();
+	LocalSetXLogInsertAllowed();
 
-		LWLockRelease(ControlFileLock);
-	}
+	START_CRIT_SECTION();
 
-	/*
-	 * Have we passed our safe starting point? Note that minRecoveryPoint is
-	 * known to be incorrectly set if ControlFile->backupEndRequired, until
-	 * the XLOG_BACKUP_END arrives to advise us of the correct
-	 * minRecoveryPoint. All we know prior to that is that we're not
-	 * consistent yet.
-	 */
-	if (!reachedConsistency && !ControlFile->backupEndRequired &&
-		minRecoveryPoint <= lastReplayedEndRecPtr &&
-		XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
-	{
-		/*
-		 * Check to see if the XLOG sequence contained any unresolved
-		 * references to uninitialized pages.
-		 */
-		XLogCheckInvalidPages();
+	XLogBeginInsert();
+	XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
+	recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
 
-		reachedConsistency = true;
-		ereport(LOG,
-				(errmsg("consistent recovery state reached at %X/%X",
-						LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
-	}
+	XLogFlush(recptr);
 
 	/*
-	 * Have we got a valid starting snapshot that will allow queries to be
-	 * run? If so, we can tell postmaster that the database is consistent now,
-	 * enabling connections.
+	 * Update the control file so that crash recovery can follow the timeline
+	 * changes to this point.
 	 */
-	if (standbyState == STANDBY_SNAPSHOT_READY &&
-		!LocalHotStandbyActive &&
-		reachedConsistency &&
-		IsUnderPostmaster)
-	{
-		SpinLockAcquire(&XLogCtl->info_lck);
-		XLogCtl->SharedHotStandbyActive = true;
-		SpinLockRelease(&XLogCtl->info_lck);
+	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+	ControlFile->time = (pg_time_t) time(NULL);
+	ControlFile->minRecoveryPoint = recptr;
+	ControlFile->minRecoveryPointTLI = ThisTimeLineID;
+	UpdateControlFile();
+	LWLockRelease(ControlFileLock);
 
-		LocalHotStandbyActive = true;
+	END_CRIT_SECTION();
 
-		SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
-	}
+	LocalXLogInsertAllowed = -1;	/* return to "check" state */
 }
 
 /*
- * Is the system still in recovery?
- *
- * Unlike testing InRecovery, this works in any process that's connected to
- * shared memory.
+ * Flush all data in shared memory to disk, and fsync
  *
- * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
- * variables the first time we see that recovery is finished.
+ * This is the common code shared between regular checkpoints and
+ * recovery restartpoints.
  */
-bool
-RecoveryInProgress(void)
+static void
+CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 {
-	/*
-	 * We check shared state each time only until we leave recovery mode. We
-	 * can't re-enter recovery, so there's no need to keep checking after the
-	 * shared variable has once been seen false.
-	 */
-	if (!LocalRecoveryInProgress)
-		return false;
-	else
-	{
-		/*
-		 * use volatile pointer to make sure we make a fresh read of the
-		 * shared variable.
-		 */
-		volatile XLogCtlData *xlogctl = XLogCtl;
-
-		LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);
+	CheckPointRelationMap();
+	CheckPointReplicationSlots();
+	CheckPointSnapBuild();
+	CheckPointLogicalRewriteHeap();
+	CheckPointReplicationOrigin();
 
-		/*
-		 * Initialize TimeLineID and RedoRecPtr when we discover that recovery
-		 * is finished. InitPostgres() relies upon this behaviour to ensure
-		 * that InitXLOGAccess() is called at backend startup.  (If you change
-		 * this, see also LocalSetXLogInsertAllowed.)
-		 */
-		if (!LocalRecoveryInProgress)
-		{
-			/*
-			 * If we just exited recovery, make sure we read TimeLineID and
-			 * RedoRecPtr after SharedRecoveryState (for machines with weak
-			 * memory ordering).
-			 */
-			pg_memory_barrier();
-			InitXLOGAccess();
-		}
+	/* Write out all dirty data in SLRUs and the main buffer pool */
+	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
+	CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
+	CheckPointCLOG();
+	CheckPointCommitTs();
+	CheckPointSUBTRANS();
+	CheckPointMultiXact();
+	CheckPointPredicate();
+	CheckPointBuffers(flags);
 
-		/*
-		 * Note: We don't need a memory barrier when we're still in recovery.
-		 * We might exit recovery immediately after return, so the caller
-		 * can't rely on 'true' meaning that we're still in recovery anyway.
-		 */
+	/* Perform all queued up fsyncs */
+	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
+	CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
+	ProcessSyncRequests();
+	CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
+	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
 
-		return LocalRecoveryInProgress;
-	}
+	/* We deliberately delay 2PC checkpointing as long as possible */
+	CheckPointTwoPhase(checkPointRedo);
 }
 
 /*
- * Returns current recovery state from shared memory.
+ * Save a checkpoint for recovery restart if appropriate
  *
- * This returned state is kept consistent with the contents of the control
- * file.  See details about the possible values of RecoveryState in xlog.h.
+ * This function is called each time a checkpoint record is read from XLOG.
+ * It must determine whether the checkpoint represents a safe restartpoint or
+ * not.  If so, the checkpoint record is stashed in shared memory so that
+ * CreateRestartPoint can consult it.  (Note that the latter function is
+ * executed by the checkpointer, while this one will be executed by the
+ * startup process.)
  */
-RecoveryState
-GetRecoveryState(void)
+static void
+RecoveryRestartPoint(const XLogReaderState *record, const CheckPoint *checkPoint)
 {
-	RecoveryState retval;
+	/*
+	 * Also refrain from creating a restartpoint if we have seen any
+	 * references to non-existent pages. Restarting recovery from the
+	 * restartpoint would not see the references, so we would lose the
+	 * cross-check that the pages belonged to a relation that was dropped
+	 * later.
+	 */
+	if (XLogHaveInvalidPages())
+	{
+		elog(trace_recovery(DEBUG2),
+			 "could not record restart point at %X/%X because there "
+			 "are unresolved references to invalid pages",
+			 LSN_FORMAT_ARGS(checkPoint->redo));
+		return;
+	}
 
+	/*
+	 * Copy the checkpoint record to shared memory, so that checkpointer can
+	 * work out the next time it wants to perform a restartpoint.
+	 */
 	SpinLockAcquire(&XLogCtl->info_lck);
-	retval = XLogCtl->SharedRecoveryState;
+	XLogCtl->lastCheckPointRecPtr = record->ReadRecPtr;
+	XLogCtl->lastCheckPointEndPtr = record->EndRecPtr;
+	XLogCtl->lastCheckPoint = *checkPoint;
 	SpinLockRelease(&XLogCtl->info_lck);
-
-	return retval;
 }
 
 /*
- * Is HotStandby active yet? This is only important in special backends
- * since normal backends won't ever be able to connect until this returns
- * true. Postmaster knows this by way of signal, not via shared memory.
+ * Establish a restartpoint if possible.
+ *
+ * This is similar to CreateCheckPoint, but is used during WAL recovery
+ * to establish a point from which recovery can roll forward without
+ * replaying the entire recovery log.
  *
- * Unlike testing standbyState, this works in any process that's connected to
- * shared memory.  (And note that standbyState alone doesn't tell the truth
- * anyway.)
+ * Returns true if a new restartpoint was established. We can only establish
+ * a restartpoint if we have replayed a safe checkpoint record since last
+ * restartpoint.
  */
 bool
-HotStandbyActive(void)
+CreateRestartPoint(int flags)
 {
+	XLogRecPtr	lastCheckPointRecPtr;
+	XLogRecPtr	lastCheckPointEndPtr;
+	CheckPoint	lastCheckPoint;
+	XLogRecPtr	PriorRedoPtr;
+	XLogRecPtr	receivePtr;
+	XLogRecPtr	replayPtr;
+	TimeLineID	replayTLI;
+	XLogRecPtr	endptr;
+	XLogSegNo	_logSegNo;
+	TimestampTz xtime;
+
+	/* Get a local copy of the last safe checkpoint record. */
+	SpinLockAcquire(&XLogCtl->info_lck);
+	lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
+	lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
+	lastCheckPoint = XLogCtl->lastCheckPoint;
+	SpinLockRelease(&XLogCtl->info_lck);
+
 	/*
-	 * We check shared state each time only until Hot Standby is active. We
-	 * can't de-activate Hot Standby, so there's no need to keep checking
-	 * after the shared variable has once been seen true.
+	 * Check that we're still in recovery mode. It's ok if we exit recovery
+	 * mode after this check, the restart point is valid anyway.
 	 */
-	if (LocalHotStandbyActive)
-		return true;
-	else
+	if (!RecoveryInProgress())
 	{
-		/* spinlock is essential on machines with weak memory ordering! */
-		SpinLockAcquire(&XLogCtl->info_lck);
-		LocalHotStandbyActive = XLogCtl->SharedHotStandbyActive;
-		SpinLockRelease(&XLogCtl->info_lck);
-
-		return LocalHotStandbyActive;
+		ereport(DEBUG2,
+				(errmsg_internal("skipping restartpoint, recovery has already ended")));
+		return false;
 	}
-}
-
-/*
- * Like HotStandbyActive(), but to be used only in WAL replay code,
- * where we don't need to ask any other process what the state is.
- */
-bool
-HotStandbyActiveInReplay(void)
-{
-	Assert(AmStartupProcess() || !IsPostmasterEnvironment);
-	return LocalHotStandbyActive;
-}
 
-/*
- * Is this process allowed to insert new WAL records?
- *
- * Ordinarily this is essentially equivalent to !RecoveryInProgress().
- * But we also have provisions for forcing the result "true" or "false"
- * within specific processes regardless of the global state.
- */
-bool
-XLogInsertAllowed(void)
-{
 	/*
-	 * If value is "unconditionally true" or "unconditionally false", just
-	 * return it.  This provides the normal fast path once recovery is known
-	 * done.
+	 * If the last checkpoint record we've replayed is already our last
+	 * restartpoint, we can't perform a new restart point. We still update
+	 * minRecoveryPoint in that case, so that if this is a shutdown restart
+	 * point, we won't start up earlier than before. That's not strictly
+	 * necessary, but when hot standby is enabled, it would be rather weird if
+	 * the database opened up for read-only connections at a point-in-time
+	 * before the last shutdown. Such time travel is still possible in case of
+	 * immediate shutdown, though.
+	 *
+	 * We don't explicitly advance minRecoveryPoint when we do create a
+	 * restartpoint. It's assumed that flushing the buffers will do that as a
+	 * side-effect.
 	 */
-	if (LocalXLogInsertAllowed >= 0)
-		return (bool) LocalXLogInsertAllowed;
+	if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
+		lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
+	{
+		ereport(DEBUG2,
+				(errmsg_internal("skipping restartpoint, already performed at %X/%X",
+								 LSN_FORMAT_ARGS(lastCheckPoint.redo))));
 
-	/*
-	 * Else, must check to see if we're still in recovery.
-	 */
-	if (RecoveryInProgress())
+		UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
+		if (flags & CHECKPOINT_IS_SHUTDOWN)
+		{
+			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
+			ControlFile->time = (pg_time_t) time(NULL);
+			UpdateControlFile();
+			LWLockRelease(ControlFileLock);
+		}
 		return false;
+	}
 
 	/*
-	 * On exit from recovery, reset to "unconditionally true", since there is
-	 * no need to keep checking.
+	 * Update the shared RedoRecPtr so that the startup process can calculate
+	 * the number of segments replayed since last restartpoint, and request a
+	 * restartpoint if it exceeds CheckPointSegments.
+	 *
+	 * Like in CreateCheckPoint(), hold off insertions to update it, although
+	 * during recovery this is just pro forma, because no WAL insertions are
+	 * happening.
 	 */
-	LocalXLogInsertAllowed = 1;
-	return true;
-}
+	WALInsertLockAcquireExclusive();
+	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
+	WALInsertLockRelease();
 
-/*
- * Make XLogInsertAllowed() return true in the current process only.
- *
- * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
- * and even call LocalSetXLogInsertAllowed() again after that.
- */
-static void
-LocalSetXLogInsertAllowed(void)
-{
-	Assert(LocalXLogInsertAllowed == -1);
-	LocalXLogInsertAllowed = 1;
+	/* Also update the info_lck-protected copy */
+	SpinLockAcquire(&XLogCtl->info_lck);
+	XLogCtl->RedoRecPtr = lastCheckPoint.redo;
+	SpinLockRelease(&XLogCtl->info_lck);
 
-	/* Initialize as RecoveryInProgress() would do when switching state */
-	InitXLOGAccess();
-}
+	/*
+	 * Prepare to accumulate statistics.
+	 *
+	 * Note: because it is possible for log_checkpoints to change while a
+	 * checkpoint proceeds, we always accumulate stats, even if
+	 * log_checkpoints is currently off.
+	 */
+	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
+	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
 
-/*
- * Subroutine to try to fetch and validate a prior checkpoint record.
- *
- * whichChkpt identifies the checkpoint (merely for reporting purposes).
- * 1 for "primary", 0 for "other" (backup_label)
- */
-static XLogRecord *
-ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
-					 int whichChkpt, bool report)
-{
-	XLogRecord *record;
-	uint8		info;
+	if (log_checkpoints)
+		LogCheckpointStart(flags, true);
 
-	if (!XRecOffIsValid(RecPtr))
-	{
-		if (!report)
-			return NULL;
+	/* Update the process title */
+	update_checkpoint_display(flags, true, false);
 
-		switch (whichChkpt)
-		{
-			case 1:
-				ereport(LOG,
-						(errmsg("invalid primary checkpoint link in control file")));
-				break;
-			default:
-				ereport(LOG,
-						(errmsg("invalid checkpoint link in backup_label file")));
-				break;
-		}
-		return NULL;
-	}
+	CheckPointGuts(lastCheckPoint.redo, flags);
 
-	XLogBeginRead(xlogreader, RecPtr);
-	record = ReadRecord(xlogreader, LOG, true);
+	/*
+	 * Remember the prior checkpoint's redo ptr for
+	 * UpdateCheckPointDistanceEstimate()
+	 */
+	PriorRedoPtr = ControlFile->checkPointCopy.redo;
 
-	if (record == NULL)
+	/*
+	 * Update pg_control, using current time.  Check that it still shows
+	 * DB_IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
+	 * this is a quick hack to make sure nothing really bad happens if somehow
+	 * we get here after the end-of-recovery checkpoint.
+	 */
+	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
+		ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
 	{
-		if (!report)
-			return NULL;
+		ControlFile->checkPoint = lastCheckPointRecPtr;
+		ControlFile->checkPointCopy = lastCheckPoint;
+		ControlFile->time = (pg_time_t) time(NULL);
 
-		switch (whichChkpt)
-		{
-			case 1:
-				ereport(LOG,
-						(errmsg("invalid primary checkpoint record")));
-				break;
-			default:
-				ereport(LOG,
-						(errmsg("invalid checkpoint record")));
-				break;
-		}
-		return NULL;
-	}
-	if (record->xl_rmid != RM_XLOG_ID)
-	{
-		switch (whichChkpt)
-		{
-			case 1:
-				ereport(LOG,
-						(errmsg("invalid resource manager ID in primary checkpoint record")));
-				break;
-			default:
-				ereport(LOG,
-						(errmsg("invalid resource manager ID in checkpoint record")));
-				break;
-		}
-		return NULL;
-	}
-	info = record->xl_info & ~XLR_INFO_MASK;
-	if (info != XLOG_CHECKPOINT_SHUTDOWN &&
-		info != XLOG_CHECKPOINT_ONLINE)
-	{
-		switch (whichChkpt)
-		{
-			case 1:
-				ereport(LOG,
-						(errmsg("invalid xl_info in primary checkpoint record")));
-				break;
-			default:
-				ereport(LOG,
-						(errmsg("invalid xl_info in checkpoint record")));
-				break;
-		}
-		return NULL;
-	}
-	if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
-	{
-		switch (whichChkpt)
+		/*
+		 * Ensure minRecoveryPoint is past the checkpoint record.  Normally,
+		 * this will have happened already while writing out dirty buffers,
+		 * but not necessarily - e.g. because no buffers were dirtied.  We do
+		 * this because a non-exclusive base backup uses minRecoveryPoint to
+		 * determine which WAL files must be included in the backup, and the
+		 * file (or files) containing the checkpoint record must be included,
+		 * at a minimum. Note that for an ordinary restart of recovery there's
+		 * no value in having the minimum recovery point any earlier than this
+		 * anyway, because redo will begin just after the checkpoint record.
+		 */
+		if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
 		{
-			case 1:
-				ereport(LOG,
-						(errmsg("invalid length of primary checkpoint record")));
-				break;
-			default:
-				ereport(LOG,
-						(errmsg("invalid length of checkpoint record")));
-				break;
+			ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
+			ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
+
+			/* update local copy */
+			LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
+			LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
 		}
-		return NULL;
+		if (flags & CHECKPOINT_IS_SHUTDOWN)
+			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
+		UpdateControlFile();
 	}
-	return record;
-}
-
-/*
- * This must be called in a backend process before creating WAL records
- * (except in a standalone backend, which does StartupXLOG instead).  We need
- * to initialize the local copies of ThisTimeLineID and RedoRecPtr.
- *
- * Note: before Postgres 8.0, we went to some effort to keep the postmaster
- * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
- * unnecessary however, since the postmaster itself never touches XLOG anyway.
- */
-void
-InitXLOGAccess(void)
-{
-	XLogCtlInsert *Insert = &XLogCtl->Insert;
+	LWLockRelease(ControlFileLock);
 
-	/* ThisTimeLineID doesn't change so we need no lock to copy it */
-	ThisTimeLineID = XLogCtl->ThisTimeLineID;
-	Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
+	/*
+	 * Update the average distance between checkpoints/restartpoints if the
+	 * prior checkpoint exists.
+	 */
+	if (PriorRedoPtr != InvalidXLogRecPtr)
+		UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
 
-	/* set wal_segment_size */
-	wal_segment_size = ControlFile->xlog_seg_size;
+	/*
+	 * Delete old log files, those no longer needed for last restartpoint to
+	 * prevent the disk holding the xlog from growing full.
+	 */
+	XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
 
-	/* Use GetRedoRecPtr to copy the RedoRecPtr safely */
-	(void) GetRedoRecPtr();
-	/* Also update our copy of doPageWrites. */
-	doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
+	/*
+	 * Retreat _logSegNo using the current end of xlog replayed or received,
+	 * whichever is later.
+	 */
+	receivePtr = GetWalRcvFlushRecPtr(NULL, NULL);
+	replayPtr = GetXLogReplayRecPtr(&replayTLI);
+	endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
+	KeepLogSeg(endptr, &_logSegNo);
+	InvalidateObsoleteReplicationSlots(_logSegNo);
+	_logSegNo--;
 
-	/* Also initialize the working areas for constructing WAL records */
-	InitXLogInsert();
-}
+	/*
+	 * Try to recycle segments on a useful timeline. If we've been promoted
+	 * since the beginning of this restartpoint, use the new timeline chosen
+	 * at end of recovery (RecoveryInProgress() sets ThisTimeLineID in that
+	 * case). If we're still in recovery, use the timeline we're currently
+	 * replaying.
+	 *
+	 * There is no guarantee that the WAL segments will be useful on the
+	 * current timeline; if recovery proceeds to a new timeline right after
+	 * this, the pre-allocated WAL segments on this timeline will not be used,
+	 * and will go wasted until recycled on the next restartpoint. We'll live
+	 * with that.
+	 */
+	if (RecoveryInProgress())
+		ThisTimeLineID = replayTLI;
 
-/*
- * Return the current Redo pointer from shared memory.
- *
- * As a side-effect, the local RedoRecPtr copy is updated.
- */
-XLogRecPtr
-GetRedoRecPtr(void)
-{
-	XLogRecPtr	ptr;
+	RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr);
 
 	/*
-	 * The possibly not up-to-date copy in XlogCtl is enough. Even if we
-	 * grabbed a WAL insertion lock to read the authoritative value in
-	 * Insert->RedoRecPtr, someone might update it just after we've released
-	 * the lock.
+	 * Make more log segments if needed.  (Do this after recycling old log
+	 * segments, since that may supply some of the needed files.)
 	 */
-	SpinLockAcquire(&XLogCtl->info_lck);
-	ptr = XLogCtl->RedoRecPtr;
-	SpinLockRelease(&XLogCtl->info_lck);
+	PreallocXlogFiles(endptr);
 
-	if (RedoRecPtr < ptr)
-		RedoRecPtr = ptr;
+	/*
+	 * ThisTimeLineID is normally not set when we're still in recovery.
+	 * However, recycling/preallocating segments above needed ThisTimeLineID
+	 * to determine which timeline to install the segments on. Reset it now,
+	 * to restore the normal state of affairs for debugging purposes.
+	 */
+	if (RecoveryInProgress())
+		ThisTimeLineID = 0;
 
-	return RedoRecPtr;
-}
+	/*
+	 * Truncate pg_subtrans if possible.  We can throw away all data before
+	 * the oldest XMIN of any running transaction.  No future transaction will
+	 * attempt to reference any pg_subtrans entry older than that (see Asserts
+	 * in subtrans.c).  When hot standby is disabled, though, we mustn't do
+	 * this because StartupSUBTRANS hasn't been called yet.
+	 */
+	if (EnableHotStandby)
+		TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
 
-/*
- * Return information needed to decide whether a modified block needs a
- * full-page image to be included in the WAL record.
- *
- * The returned values are cached copies from backend-private memory, and
- * possibly out-of-date.  XLogInsertRecord will re-check them against
- * up-to-date values, while holding the WAL insert lock.
- */
-void
-GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
-{
-	*RedoRecPtr_p = RedoRecPtr;
-	*doPageWrites_p = doPageWrites;
+	/* Real work is done, but log and update before releasing lock. */
+	LogCheckpointEnd(true);
+
+	/* Reset the process title */
+	update_checkpoint_display(flags, true, true);
+
+	xtime = GetLatestXTime();
+	ereport((log_checkpoints ? LOG : DEBUG2),
+			(errmsg("recovery restart point at %X/%X",
+					LSN_FORMAT_ARGS(lastCheckPoint.redo)),
+			 xtime ? errdetail("Last completed transaction was at log time %s.",
+							   timestamptz_to_str(xtime)) : 0));
+
+	/*
+	 * Finally, execute archive_cleanup_command, if any.
+	 */
+	if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0)
+		ExecuteRecoveryCommand(archiveCleanupCommand,
+							   "archive_cleanup_command",
+							   false);
+
+	return true;
 }
 
 /*
- * GetInsertRecPtr -- Returns the current insert position.
+ * Report availability of WAL for the given target LSN
+ *		(typically a slot's restart_lsn)
  *
- * NOTE: The value *actually* returned is the position of the last full
- * xlog page. It lags behind the real insert position by at most 1 page.
- * For that, we don't need to scan through WAL insertion locks, and an
- * approximation is enough for the current usage of this function.
+ * Returns one of the following enum values:
+ *
+ * * WALAVAIL_RESERVED means targetLSN is available and it is in the range of
+ *   max_wal_size.
+ *
+ * * WALAVAIL_EXTENDED means it is still available by preserving extra
+ *   segments beyond max_wal_size. If max_slot_wal_keep_size is smaller
+ *   than max_wal_size, this state is not returned.
+ *
+ * * WALAVAIL_UNRESERVED means it is being lost and the next checkpoint will
+ *   remove reserved segments. The walsender using this slot may return to the
+ *   above.
+ *
+ * * WALAVAIL_REMOVED means it has been removed. A replication stream on
+ *   a slot with this LSN cannot continue after a restart.
+ *
+ * * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL.
  */
-XLogRecPtr
-GetInsertRecPtr(void)
+WALAvailability
+GetWALAvailability(XLogRecPtr targetLSN)
 {
-	XLogRecPtr	recptr;
+	XLogRecPtr	currpos;		/* current write LSN */
+	XLogSegNo	currSeg;		/* segid of currpos */
+	XLogSegNo	targetSeg;		/* segid of targetLSN */
+	XLogSegNo	oldestSeg;		/* actual oldest segid */
+	XLogSegNo	oldestSegMaxWalSize;	/* oldest segid kept by max_wal_size */
+	XLogSegNo	oldestSlotSeg;	/* oldest segid kept by slot */
+	uint64		keepSegs;
 
-	SpinLockAcquire(&XLogCtl->info_lck);
-	recptr = XLogCtl->LogwrtRqst.Write;
-	SpinLockRelease(&XLogCtl->info_lck);
+	/*
+	 * slot does not reserve WAL. Either deactivated, or has never been active
+	 */
+	if (XLogRecPtrIsInvalid(targetLSN))
+		return WALAVAIL_INVALID_LSN;
 
-	return recptr;
-}
+	/*
+	 * Calculate the oldest segment currently reserved by all slots,
+	 * considering wal_keep_size and max_slot_wal_keep_size.  Initialize
+	 * oldestSlotSeg to the current segment.
+	 */
+	currpos = GetXLogWriteRecPtr();
+	XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size);
+	KeepLogSeg(currpos, &oldestSlotSeg);
 
-/*
- * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
- * position known to be fsync'd to disk.
- */
-XLogRecPtr
-GetFlushRecPtr(void)
-{
-	SpinLockAcquire(&XLogCtl->info_lck);
-	LogwrtResult = XLogCtl->LogwrtResult;
-	SpinLockRelease(&XLogCtl->info_lck);
+	/*
+	 * Find the oldest extant segment file. We get 1 until checkpoint removes
+	 * the first WAL segment file since startup, which causes the status being
+	 * wrong under certain abnormal conditions but that doesn't actually harm.
+	 */
+	oldestSeg = XLogGetLastRemovedSegno() + 1;
 
-	return LogwrtResult.Flush;
+	/* calculate oldest segment by max_wal_size */
+	XLByteToSeg(currpos, currSeg, wal_segment_size);
+	keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1;
+
+	if (currSeg > keepSegs)
+		oldestSegMaxWalSize = currSeg - keepSegs;
+	else
+		oldestSegMaxWalSize = 1;
+
+	/* the segment we care about */
+	XLByteToSeg(targetLSN, targetSeg, wal_segment_size);
+
+	/*
+	 * No point in returning reserved or extended status values if the
+	 * targetSeg is known to be lost.
+	 */
+	if (targetSeg >= oldestSlotSeg)
+	{
+		/* show "reserved" when targetSeg is within max_wal_size */
+		if (targetSeg >= oldestSegMaxWalSize)
+			return WALAVAIL_RESERVED;
+
+		/* being retained by slots exceeding max_wal_size */
+		return WALAVAIL_EXTENDED;
+	}
+
+	/* WAL segments are no longer retained but haven't been removed yet */
+	if (targetSeg >= oldestSeg)
+		return WALAVAIL_UNRESERVED;
+
+	/* Definitely lost */
+	return WALAVAIL_REMOVED;
 }
 
+
 /*
- * GetLastImportantRecPtr -- Returns the LSN of the last important record
- * inserted. All records not explicitly marked as unimportant are considered
- * important.
+ * Retreat *logSegNo to the last segment that we need to retain because of
+ * either wal_keep_size or replication slots.
  *
- * The LSN is determined by computing the maximum of
- * WALInsertLocks[i].lastImportantAt.
+ * This is calculated by subtracting wal_keep_size from the given xlog
+ * location, recptr and by making sure that that result is below the
+ * requirement of replication slots.  For the latter criterion we do consider
+ * the effects of max_slot_wal_keep_size: reserve at most that much space back
+ * from recptr.
  */
-XLogRecPtr
-GetLastImportantRecPtr(void)
+static void
+KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
 {
-	XLogRecPtr	res = InvalidXLogRecPtr;
-	int			i;
+	XLogSegNo	currSegNo;
+	XLogSegNo	segno;
+	XLogRecPtr	keep;
 
-	for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
+	XLByteToSeg(recptr, currSegNo, wal_segment_size);
+	segno = currSegNo;
+
+	/*
+	 * Calculate how many segments are kept by slots first, adjusting for
+	 * max_slot_wal_keep_size.
+	 */
+	keep = XLogGetReplicationSlotMinimumLSN();
+	if (keep != InvalidXLogRecPtr)
 	{
-		XLogRecPtr	last_important;
+		XLByteToSeg(keep, segno, wal_segment_size);
 
-		/*
-		 * Need to take a lock to prevent torn reads of the LSN, which are
-		 * possible on some of the supported platforms. WAL insert locks only
-		 * support exclusive mode, so we have to use that.
-		 */
-		LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
-		last_important = WALInsertLocks[i].l.lastImportantAt;
-		LWLockRelease(&WALInsertLocks[i].l.lock);
+		/* Cap by max_slot_wal_keep_size ... */
+		if (max_slot_wal_keep_size_mb >= 0)
+		{
+			uint64		slot_keep_segs;
 
-		if (res < last_important)
-			res = last_important;
-	}
+			slot_keep_segs =
+				ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size);
 
-	return res;
-}
+			if (currSegNo - segno > slot_keep_segs)
+				segno = currSegNo - slot_keep_segs;
+		}
+	}
 
-/*
- * Get the time and LSN of the last xlog segment switch
- */
-pg_time_t
-GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
-{
-	pg_time_t	result;
+	/* but, keep at least wal_keep_size if that's set */
+	if (wal_keep_size_mb > 0)
+	{
+		uint64		keep_segs;
 
-	/* Need WALWriteLock, but shared lock is sufficient */
-	LWLockAcquire(WALWriteLock, LW_SHARED);
-	result = XLogCtl->lastSegSwitchTime;
-	*lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
-	LWLockRelease(WALWriteLock);
+		keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size);
+		if (currSegNo - segno < keep_segs)
+		{
+			/* avoid underflow, don't go below 1 */
+			if (currSegNo <= keep_segs)
+				segno = 1;
+			else
+				segno = currSegNo - keep_segs;
+		}
+	}
 
-	return result;
+	/* don't delete WAL segments newer than the calculated segment */
+	if (segno < *logSegNo)
+		*logSegNo = segno;
 }
 
 /*
- * This must be called ONCE during postmaster or standalone-backend shutdown
+ * Write a NEXTOID log record
  */
 void
-ShutdownXLOG(int code, Datum arg)
+XLogPutNextOid(Oid nextOid)
 {
-	/*
-	 * We should have an aux process resource owner to use, and we should not
-	 * be in a transaction that's installed some other resowner.
-	 */
-	Assert(AuxProcessResourceOwner != NULL);
-	Assert(CurrentResourceOwner == NULL ||
-		   CurrentResourceOwner == AuxProcessResourceOwner);
-	CurrentResourceOwner = AuxProcessResourceOwner;
-
-	/* Don't be chatty in standalone mode */
-	ereport(IsPostmasterEnvironment ? LOG : NOTICE,
-			(errmsg("shutting down")));
+	XLogBeginInsert();
+	XLogRegisterData((char *) (&nextOid), sizeof(Oid));
+	(void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
 
 	/*
-	 * Signal walsenders to move to stopping state.
-	 */
-	WalSndInitStopping();
-
-	/*
-	 * Wait for WAL senders to be in stopping state.  This prevents commands
-	 * from writing new WAL.
+	 * We need not flush the NEXTOID record immediately, because any of the
+	 * just-allocated OIDs could only reach disk as part of a tuple insert or
+	 * update that would have its own XLOG record that must follow the NEXTOID
+	 * record.  Therefore, the standard buffer LSN interlock applied to those
+	 * records will ensure no such OID reaches disk before the NEXTOID record
+	 * does.
+	 *
+	 * Note, however, that the above statement only covers state "within" the
+	 * database.  When we use a generated OID as a file or directory name, we
+	 * are in a sense violating the basic WAL rule, because that filesystem
+	 * change may reach disk before the NEXTOID WAL record does.  The impact
+	 * of this is that if a database crash occurs immediately afterward, we
+	 * might after restart re-generate the same OID and find that it conflicts
+	 * with the leftover file or directory.  But since for safety's sake we
+	 * always loop until finding a nonconflicting filename, this poses no real
+	 * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
 	 */
-	WalSndWaitStopping();
-
-	if (RecoveryInProgress())
-		CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
-	else
-	{
-		/*
-		 * If archiving is enabled, rotate the last XLOG file so that all the
-		 * remaining records are archived (postmaster wakes up the archiver
-		 * process one more time at the end of shutdown). The checkpoint
-		 * record will go to the next XLOG file and won't be archived (yet).
-		 */
-		if (XLogArchivingActive() && XLogArchiveCommandSet())
-			RequestXLogSwitch(false);
-
-		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
-	}
 }
 
 /*
- * Log start of a checkpoint.
+ * Write an XLOG SWITCH record.
+ *
+ * Here we just blindly issue an XLogInsert request for the record.
+ * All the magic happens inside XLogInsert.
+ *
+ * The return value is either the end+1 address of the switch record,
+ * or the end+1 address of the prior segment if we did not need to
+ * write a switch record because we are already at segment start.
  */
-static void
-LogCheckpointStart(int flags, bool restartpoint)
+XLogRecPtr
+RequestXLogSwitch(bool mark_unimportant)
 {
-	if (restartpoint)
-		ereport(LOG,
-		/* translator: the placeholders show checkpoint options */
-				(errmsg("restartpoint starting:%s%s%s%s%s%s%s%s",
-						(flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
-						(flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
-						(flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
-						(flags & CHECKPOINT_FORCE) ? " force" : "",
-						(flags & CHECKPOINT_WAIT) ? " wait" : "",
-						(flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
-						(flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
-						(flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
-	else
-		ereport(LOG,
-		/* translator: the placeholders show checkpoint options */
-				(errmsg("checkpoint starting:%s%s%s%s%s%s%s%s",
-						(flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
-						(flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
-						(flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
-						(flags & CHECKPOINT_FORCE) ? " force" : "",
-						(flags & CHECKPOINT_WAIT) ? " wait" : "",
-						(flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
-						(flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
-						(flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
+	XLogRecPtr	RecPtr;
+
+	/* XLOG SWITCH has no data */
+	XLogBeginInsert();
+
+	if (mark_unimportant)
+		XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
+	RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
+
+	return RecPtr;
 }
 
 /*
- * Log end of a checkpoint.
+ * Write a RESTORE POINT record
  */
-static void
-LogCheckpointEnd(bool restartpoint)
+XLogRecPtr
+XLogRestorePoint(const char *rpName)
 {
-	long		write_msecs,
-				sync_msecs,
-				total_msecs,
-				longest_msecs,
-				average_msecs;
-	uint64		average_sync_time;
+	XLogRecPtr	RecPtr;
+	xl_restore_point xlrec;
 
-	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
+	xlrec.rp_time = GetCurrentTimestamp();
+	strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
 
-	write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t,
-												  CheckpointStats.ckpt_sync_t);
+	XLogBeginInsert();
+	XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
 
-	sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t,
-												 CheckpointStats.ckpt_sync_end_t);
+	RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
 
-	/* Accumulate checkpoint timing summary data, in milliseconds. */
-	BgWriterStats.m_checkpoint_write_time += write_msecs;
-	BgWriterStats.m_checkpoint_sync_time += sync_msecs;
+	ereport(LOG,
+			(errmsg("restore point \"%s\" created at %X/%X",
+					rpName, LSN_FORMAT_ARGS(RecPtr))));
 
-	/*
-	 * All of the published timing statistics are accounted for.  Only
-	 * continue if a log message is to be written.
-	 */
-	if (!log_checkpoints)
-		return;
+	return RecPtr;
+}
 
-	total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t,
-												  CheckpointStats.ckpt_end_t);
+/*
+ * Check if any of the GUC parameters that are critical for hot standby
+ * have changed, and update the value in pg_control file if necessary.
+ */
+static void
+XLogReportParameters(void)
+{
+	if (wal_level != ControlFile->wal_level ||
+		wal_log_hints != ControlFile->wal_log_hints ||
+		MaxConnections != ControlFile->MaxConnections ||
+		max_worker_processes != ControlFile->max_worker_processes ||
+		max_wal_senders != ControlFile->max_wal_senders ||
+		max_prepared_xacts != ControlFile->max_prepared_xacts ||
+		max_locks_per_xact != ControlFile->max_locks_per_xact ||
+		track_commit_timestamp != ControlFile->track_commit_timestamp)
+	{
+		/*
+		 * The change in number of backend slots doesn't need to be WAL-logged
+		 * if archiving is not enabled, as you can't start archive recovery
+		 * with wal_level=minimal anyway. We don't really care about the
+		 * values in pg_control either if wal_level=minimal, but seems better
+		 * to keep them up-to-date to avoid confusion.
+		 */
+		if (wal_level != ControlFile->wal_level || XLogIsNeeded())
+		{
+			xl_parameter_change xlrec;
+			XLogRecPtr	recptr;
 
-	/*
-	 * Timing values returned from CheckpointStats are in microseconds.
-	 * Convert to milliseconds for consistent printing.
-	 */
-	longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000);
+			xlrec.MaxConnections = MaxConnections;
+			xlrec.max_worker_processes = max_worker_processes;
+			xlrec.max_wal_senders = max_wal_senders;
+			xlrec.max_prepared_xacts = max_prepared_xacts;
+			xlrec.max_locks_per_xact = max_locks_per_xact;
+			xlrec.wal_level = wal_level;
+			xlrec.wal_log_hints = wal_log_hints;
+			xlrec.track_commit_timestamp = track_commit_timestamp;
 
-	average_sync_time = 0;
-	if (CheckpointStats.ckpt_sync_rels > 0)
-		average_sync_time = CheckpointStats.ckpt_agg_sync_time /
-			CheckpointStats.ckpt_sync_rels;
-	average_msecs = (long) ((average_sync_time + 999) / 1000);
+			XLogBeginInsert();
+			XLogRegisterData((char *) &xlrec, sizeof(xlrec));
 
-	if (restartpoint)
-		ereport(LOG,
-				(errmsg("restartpoint complete: wrote %d buffers (%.1f%%); "
-						"%d WAL file(s) added, %d removed, %d recycled; "
-						"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
-						"sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
-						"distance=%d kB, estimate=%d kB",
-						CheckpointStats.ckpt_bufs_written,
-						(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
-						CheckpointStats.ckpt_segs_added,
-						CheckpointStats.ckpt_segs_removed,
-						CheckpointStats.ckpt_segs_recycled,
-						write_msecs / 1000, (int) (write_msecs % 1000),
-						sync_msecs / 1000, (int) (sync_msecs % 1000),
-						total_msecs / 1000, (int) (total_msecs % 1000),
-						CheckpointStats.ckpt_sync_rels,
-						longest_msecs / 1000, (int) (longest_msecs % 1000),
-						average_msecs / 1000, (int) (average_msecs % 1000),
-						(int) (PrevCheckPointDistance / 1024.0),
-						(int) (CheckPointDistanceEstimate / 1024.0))));
-	else
-		ereport(LOG,
-				(errmsg("checkpoint complete: wrote %d buffers (%.1f%%); "
-						"%d WAL file(s) added, %d removed, %d recycled; "
-						"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
-						"sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
-						"distance=%d kB, estimate=%d kB",
-						CheckpointStats.ckpt_bufs_written,
-						(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
-						CheckpointStats.ckpt_segs_added,
-						CheckpointStats.ckpt_segs_removed,
-						CheckpointStats.ckpt_segs_recycled,
-						write_msecs / 1000, (int) (write_msecs % 1000),
-						sync_msecs / 1000, (int) (sync_msecs % 1000),
-						total_msecs / 1000, (int) (total_msecs % 1000),
-						CheckpointStats.ckpt_sync_rels,
-						longest_msecs / 1000, (int) (longest_msecs % 1000),
-						average_msecs / 1000, (int) (average_msecs % 1000),
-						(int) (PrevCheckPointDistance / 1024.0),
-						(int) (CheckPointDistanceEstimate / 1024.0))));
+			recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
+			XLogFlush(recptr);
+		}
+
+		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+
+		ControlFile->MaxConnections = MaxConnections;
+		ControlFile->max_worker_processes = max_worker_processes;
+		ControlFile->max_wal_senders = max_wal_senders;
+		ControlFile->max_prepared_xacts = max_prepared_xacts;
+		ControlFile->max_locks_per_xact = max_locks_per_xact;
+		ControlFile->wal_level = wal_level;
+		ControlFile->wal_log_hints = wal_log_hints;
+		ControlFile->track_commit_timestamp = track_commit_timestamp;
+		UpdateControlFile();
+
+		LWLockRelease(ControlFileLock);
+	}
 }
 
 /*
- * Update the estimate of distance between checkpoints.
+ * Update full_page_writes in shared memory, and write an
+ * XLOG_FPW_CHANGE record if necessary.
  *
- * The estimate is used to calculate the number of WAL segments to keep
- * preallocated, see XLOGfileslop().
+ * Note: this function assumes there is no other process running
+ * concurrently that could update it.
  */
-static void
-UpdateCheckPointDistanceEstimate(uint64 nbytes)
+void
+UpdateFullPageWrites(void)
 {
+	XLogCtlInsert *Insert = &XLogCtl->Insert;
+	bool		recoveryInProgress;
+
 	/*
-	 * To estimate the number of segments consumed between checkpoints, keep a
-	 * moving average of the amount of WAL generated in previous checkpoint
-	 * cycles. However, if the load is bursty, with quiet periods and busy
-	 * periods, we want to cater for the peak load. So instead of a plain
-	 * moving average, let the average decline slowly if the previous cycle
-	 * used less WAL than estimated, but bump it up immediately if it used
-	 * more.
-	 *
-	 * When checkpoints are triggered by max_wal_size, this should converge to
-	 * CheckpointSegments * wal_segment_size,
+	 * Do nothing if full_page_writes has not been changed.
 	 *
-	 * Note: This doesn't pay any attention to what caused the checkpoint.
-	 * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
-	 * starting a base backup, are counted the same as those created
-	 * automatically. The slow-decline will largely mask them out, if they are
-	 * not frequent. If they are frequent, it seems reasonable to count them
-	 * in as any others; if you issue a manual checkpoint every 5 minutes and
-	 * never let a timed checkpoint happen, it makes sense to base the
-	 * preallocation on that 5 minute interval rather than whatever
-	 * checkpoint_timeout is set to.
+	 * It's safe to check the shared full_page_writes without the lock,
+	 * because we assume that there is no concurrently running process which
+	 * can update it.
 	 */
-	PrevCheckPointDistance = nbytes;
-	if (CheckPointDistanceEstimate < nbytes)
-		CheckPointDistanceEstimate = nbytes;
-	else
-		CheckPointDistanceEstimate =
-			(0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
-}
+	if (fullPageWrites == Insert->fullPageWrites)
+		return;
 
-/*
- * Update the ps display for a process running a checkpoint.  Note that
- * this routine should not do any allocations so as it can be called
- * from a critical section.
- */
-static void
-update_checkpoint_display(int flags, bool restartpoint, bool reset)
-{
 	/*
-	 * The status is reported only for end-of-recovery and shutdown
-	 * checkpoints or shutdown restartpoints.  Updating the ps display is
-	 * useful in those situations as it may not be possible to rely on
-	 * pg_stat_activity to see the status of the checkpointer or the startup
-	 * process.
+	 * Perform this outside critical section so that the WAL insert
+	 * initialization done by RecoveryInProgress() doesn't trigger an
+	 * assertion failure.
 	 */
-	if ((flags & (CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IS_SHUTDOWN)) == 0)
-		return;
+	recoveryInProgress = RecoveryInProgress();
 
-	if (reset)
-		set_ps_display("");
-	else
+	START_CRIT_SECTION();
+
+	/*
+	 * It's always safe to take full page images, even when not strictly
+	 * required, but not the other round. So if we're setting full_page_writes
+	 * to true, first set it true and then write the WAL record. If we're
+	 * setting it to false, first write the WAL record and then set the global
+	 * flag.
+	 */
+	if (fullPageWrites)
 	{
-		char		activitymsg[128];
+		WALInsertLockAcquireExclusive();
+		Insert->fullPageWrites = true;
+		WALInsertLockRelease();
+	}
 
-		snprintf(activitymsg, sizeof(activitymsg), "performing %s%s%s",
-				 (flags & CHECKPOINT_END_OF_RECOVERY) ? "end-of-recovery " : "",
-				 (flags & CHECKPOINT_IS_SHUTDOWN) ? "shutdown " : "",
-				 restartpoint ? "restartpoint" : "checkpoint");
-		set_ps_display(activitymsg);
+	/*
+	 * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
+	 * full_page_writes during archive recovery, if required.
+	 */
+	if (XLogStandbyInfoActive() && !recoveryInProgress)
+	{
+		XLogBeginInsert();
+		XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));
+
+		XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
 	}
-}
 
+	if (!fullPageWrites)
+	{
+		WALInsertLockAcquireExclusive();
+		Insert->fullPageWrites = false;
+		WALInsertLockRelease();
+	}
+	END_CRIT_SECTION();
+}
 
 /*
- * Perform a checkpoint --- either during shutdown, or on-the-fly
- *
- * flags is a bitwise OR of the following:
- *	CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
- *	CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
- *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
- *		ignoring checkpoint_completion_target parameter.
- *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
- *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
- *		CHECKPOINT_END_OF_RECOVERY).
- *	CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
- *
- * Note: flags contains other bits, of interest here only for logging purposes.
- * In particular note that this routine is synchronous and does not pay
- * attention to CHECKPOINT_WAIT.
+ * XLOG resource manager's routines
  *
- * If !shutdown then we are writing an online checkpoint. This is a very special
- * kind of operation and WAL record because the checkpoint action occurs over
- * a period of time yet logically occurs at just a single LSN. The logical
- * position of the WAL record (redo ptr) is the same or earlier than the
- * physical position. When we replay WAL we locate the checkpoint via its
- * physical position then read the redo ptr and actually start replay at the
- * earlier logical position. Note that we don't write *anything* to WAL at
- * the logical position, so that location could be any other kind of WAL record.
- * All of this mechanism allows us to continue working while we checkpoint.
- * As a result, timing of actions is critical here and be careful to note that
- * this function will likely take minutes to execute on a busy system.
+ * Definitions of info values are in include/catalog/pg_control.h, though
+ * not all record types are related to control file updates.
  */
 void
-CreateCheckPoint(int flags)
+xlog_redo(XLogReaderState *record)
 {
-	bool		shutdown;
-	CheckPoint	checkPoint;
-	XLogRecPtr	recptr;
-	XLogSegNo	_logSegNo;
-	XLogCtlInsert *Insert = &XLogCtl->Insert;
-	uint32		freespace;
-	XLogRecPtr	PriorRedoPtr;
-	XLogRecPtr	curInsert;
-	XLogRecPtr	last_important_lsn;
-	VirtualTransactionId *vxids;
-	int			nvxids;
-
-	/*
-	 * An end-of-recovery checkpoint is really a shutdown checkpoint, just
-	 * issued at a different time.
-	 */
-	if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
-		shutdown = true;
-	else
-		shutdown = false;
+	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+	XLogRecPtr	lsn = record->EndRecPtr;
 
-	/* sanity check */
-	if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
-		elog(ERROR, "can't create a checkpoint during recovery");
+	/* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */
+	Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
+		   !XLogRecHasAnyBlockRefs(record));
 
-	/*
-	 * Initialize InitXLogInsert working areas before entering the critical
-	 * section.  Normally, this is done by the first call to
-	 * RecoveryInProgress() or LocalSetXLogInsertAllowed(), but when creating
-	 * an end-of-recovery checkpoint, the LocalSetXLogInsertAllowed call is
-	 * done below in a critical section, and InitXLogInsert cannot be called
-	 * in a critical section.
-	 */
-	InitXLogInsert();
+	if (info == XLOG_NEXTOID)
+	{
+		Oid			nextOid;
 
-	/*
-	 * Prepare to accumulate statistics.
-	 *
-	 * Note: because it is possible for log_checkpoints to change while a
-	 * checkpoint proceeds, we always accumulate stats, even if
-	 * log_checkpoints is currently off.
-	 */
-	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
-	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
+		/*
+		 * We used to try to take the maximum of ShmemVariableCache->nextOid
+		 * and the recorded nextOid, but that fails if the OID counter wraps
+		 * around.  Since no OID allocation should be happening during replay
+		 * anyway, better to just believe the record exactly.  We still take
+		 * OidGenLock while setting the variable, just in case.
+		 */
+		memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
+		LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
+		ShmemVariableCache->nextOid = nextOid;
+		ShmemVariableCache->oidCount = 0;
+		LWLockRelease(OidGenLock);
+	}
+	else if (info == XLOG_CHECKPOINT_SHUTDOWN)
+	{
+		CheckPoint	checkPoint;
 
-	/*
-	 * Use a critical section to force system panic if we have trouble.
-	 */
-	START_CRIT_SECTION();
+		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
+		/* In a SHUTDOWN checkpoint, believe the counters exactly */
+		LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
+		ShmemVariableCache->nextXid = checkPoint.nextXid;
+		LWLockRelease(XidGenLock);
+		LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
+		ShmemVariableCache->nextOid = checkPoint.nextOid;
+		ShmemVariableCache->oidCount = 0;
+		LWLockRelease(OidGenLock);
+		MultiXactSetNextMXact(checkPoint.nextMulti,
+							  checkPoint.nextMultiOffset);
 
-	if (shutdown)
-	{
-		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-		ControlFile->state = DB_SHUTDOWNING;
-		ControlFile->time = (pg_time_t) time(NULL);
-		UpdateControlFile();
-		LWLockRelease(ControlFileLock);
-	}
+		MultiXactAdvanceOldest(checkPoint.oldestMulti,
+							   checkPoint.oldestMultiDB);
 
-	/*
-	 * Let smgr prepare for checkpoint; this has to happen before we determine
-	 * the REDO pointer.  Note that smgr must not do anything that'd have to
-	 * be undone if we decide no checkpoint is needed.
-	 */
-	SyncPreCheckpoint();
+		/*
+		 * No need to set oldestClogXid here as well; it'll be set when we
+		 * redo an xl_clog_truncate if it changed since initialization.
+		 */
+		SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
 
-	/* Begin filling in the checkpoint WAL record */
-	MemSet(&checkPoint, 0, sizeof(checkPoint));
-	checkPoint.time = (pg_time_t) time(NULL);
+		/*
+		 * If we see a shutdown checkpoint while waiting for an end-of-backup
+		 * record, the backup was canceled and the end-of-backup record will
+		 * never arrive.
+		 */
+		if (ArchiveRecoveryRequested &&
+			!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
+			XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
+			ereport(PANIC,
+					(errmsg("online backup was canceled, recovery cannot continue")));
 
-	/*
-	 * For Hot Standby, derive the oldestActiveXid before we fix the redo
-	 * pointer. This allows us to begin accumulating changes to assemble our
-	 * starting snapshot of locks and transactions.
-	 */
-	if (!shutdown && XLogStandbyInfoActive())
-		checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
-	else
-		checkPoint.oldestActiveXid = InvalidTransactionId;
+		/*
+		 * If we see a shutdown checkpoint, we know that nothing was running
+		 * on the primary at this point. So fake-up an empty running-xacts
+		 * record and use that here and now. Recover additional standby state
+		 * for prepared transactions.
+		 */
+		if (standbyState >= STANDBY_INITIALIZED)
+		{
+			TransactionId *xids;
+			int			nxids;
+			TransactionId oldestActiveXID;
+			TransactionId latestCompletedXid;
+			RunningTransactionsData running;
 
-	/*
-	 * Get location of last important record before acquiring insert locks (as
-	 * GetLastImportantRecPtr() also locks WAL locks).
-	 */
-	last_important_lsn = GetLastImportantRecPtr();
-
-	/*
-	 * We must block concurrent insertions while examining insert state to
-	 * determine the checkpoint REDO pointer.
-	 */
-	WALInsertLockAcquireExclusive();
-	curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
-
-	/*
-	 * If this isn't a shutdown or forced checkpoint, and if there has been no
-	 * WAL activity requiring a checkpoint, skip it.  The idea here is to
-	 * avoid inserting duplicate checkpoints when the system is idle.
-	 */
-	if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
-				  CHECKPOINT_FORCE)) == 0)
-	{
-		if (last_important_lsn == ControlFile->checkPoint)
-		{
-			WALInsertLockRelease();
-			END_CRIT_SECTION();
-			ereport(DEBUG1,
-					(errmsg_internal("checkpoint skipped because system is idle")));
-			return;
-		}
-	}
-
-	/*
-	 * An end-of-recovery checkpoint is created before anyone is allowed to
-	 * write WAL. To allow us to write the checkpoint record, temporarily
-	 * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
-	 * initialized, which we need here and in AdvanceXLInsertBuffer.)
-	 */
-	if (flags & CHECKPOINT_END_OF_RECOVERY)
-		LocalSetXLogInsertAllowed();
-
-	checkPoint.ThisTimeLineID = ThisTimeLineID;
-	if (flags & CHECKPOINT_END_OF_RECOVERY)
-		checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
-	else
-		checkPoint.PrevTimeLineID = ThisTimeLineID;
-
-	checkPoint.fullPageWrites = Insert->fullPageWrites;
-
-	/*
-	 * Compute new REDO record ptr = location of next XLOG record.
-	 *
-	 * NB: this is NOT necessarily where the checkpoint record itself will be,
-	 * since other backends may insert more XLOG records while we're off doing
-	 * the buffer flush work.  Those XLOG records are logically after the
-	 * checkpoint, even though physically before it.  Got that?
-	 */
-	freespace = INSERT_FREESPACE(curInsert);
-	if (freespace == 0)
-	{
-		if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
-			curInsert += SizeOfXLogLongPHD;
-		else
-			curInsert += SizeOfXLogShortPHD;
-	}
-	checkPoint.redo = curInsert;
-
-	/*
-	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
-	 * must be done while holding all the insertion locks.
-	 *
-	 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
-	 * pointing past where it really needs to point.  This is okay; the only
-	 * consequence is that XLogInsert might back up whole buffers that it
-	 * didn't really need to.  We can't postpone advancing RedoRecPtr because
-	 * XLogInserts that happen while we are dumping buffers must assume that
-	 * their buffer changes are not included in the checkpoint.
-	 */
-	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
-
-	/*
-	 * Now we can release the WAL insertion locks, allowing other xacts to
-	 * proceed while we are flushing disk buffers.
-	 */
-	WALInsertLockRelease();
-
-	/* Update the info_lck-protected copy of RedoRecPtr as well */
-	SpinLockAcquire(&XLogCtl->info_lck);
-	XLogCtl->RedoRecPtr = checkPoint.redo;
-	SpinLockRelease(&XLogCtl->info_lck);
-
-	/*
-	 * If enabled, log checkpoint start.  We postpone this until now so as not
-	 * to log anything if we decided to skip the checkpoint.
-	 */
-	if (log_checkpoints)
-		LogCheckpointStart(flags, false);
-
-	/* Update the process title */
-	update_checkpoint_display(flags, false, false);
-
-	TRACE_POSTGRESQL_CHECKPOINT_START(flags);
-
-	/*
-	 * Get the other info we need for the checkpoint record.
-	 *
-	 * We don't need to save oldestClogXid in the checkpoint, it only matters
-	 * for the short period in which clog is being truncated, and if we crash
-	 * during that we'll redo the clog truncation and fix up oldestClogXid
-	 * there.
-	 */
-	LWLockAcquire(XidGenLock, LW_SHARED);
-	checkPoint.nextXid = ShmemVariableCache->nextXid;
-	checkPoint.oldestXid = ShmemVariableCache->oldestXid;
-	checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
-	LWLockRelease(XidGenLock);
-
-	LWLockAcquire(CommitTsLock, LW_SHARED);
-	checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
-	checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
-	LWLockRelease(CommitTsLock);
-
-	LWLockAcquire(OidGenLock, LW_SHARED);
-	checkPoint.nextOid = ShmemVariableCache->nextOid;
-	if (!shutdown)
-		checkPoint.nextOid += ShmemVariableCache->oidCount;
-	LWLockRelease(OidGenLock);
-
-	MultiXactGetCheckptMulti(shutdown,
-							 &checkPoint.nextMulti,
-							 &checkPoint.nextMultiOffset,
-							 &checkPoint.oldestMulti,
-							 &checkPoint.oldestMultiDB);
-
-	/*
-	 * Having constructed the checkpoint record, ensure all shmem disk buffers
-	 * and commit-log buffers are flushed to disk.
-	 *
-	 * This I/O could fail for various reasons.  If so, we will fail to
-	 * complete the checkpoint, but there is no reason to force a system
-	 * panic. Accordingly, exit critical section while doing it.
-	 */
-	END_CRIT_SECTION();
-
-	/*
-	 * In some cases there are groups of actions that must all occur on one
-	 * side or the other of a checkpoint record. Before flushing the
-	 * checkpoint record we must explicitly wait for any backend currently
-	 * performing those groups of actions.
-	 *
-	 * One example is end of transaction, so we must wait for any transactions
-	 * that are currently in commit critical sections.  If an xact inserted
-	 * its commit record into XLOG just before the REDO point, then a crash
-	 * restart from the REDO point would not replay that record, which means
-	 * that our flushing had better include the xact's update of pg_xact.  So
-	 * we wait till he's out of his commit critical section before proceeding.
-	 * See notes in RecordTransactionCommit().
-	 *
-	 * Because we've already released the insertion locks, this test is a bit
-	 * fuzzy: it is possible that we will wait for xacts we didn't really need
-	 * to wait for.  But the delay should be short and it seems better to make
-	 * checkpoint take a bit longer than to hold off insertions longer than
-	 * necessary. (In fact, the whole reason we have this issue is that xact.c
-	 * does commit record XLOG insertion and clog update as two separate steps
-	 * protected by different locks, but again that seems best on grounds of
-	 * minimizing lock contention.)
-	 *
-	 * A transaction that has not yet set delayChkpt when we look cannot be at
-	 * risk, since he's not inserted his commit record yet; and one that's
-	 * already cleared it is not at risk either, since he's done fixing clog
-	 * and we will correctly flush the update below.  So we cannot miss any
-	 * xacts we need to wait for.
-	 */
-	vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
-	if (nvxids > 0)
-	{
-		do
-		{
-			pg_usleep(10000L);	/* wait for 10 msec */
-		} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
-	}
-	pfree(vxids);
-
-	CheckPointGuts(checkPoint.redo, flags);
-
-	/*
-	 * Take a snapshot of running transactions and write this to WAL. This
-	 * allows us to reconstruct the state of running transactions during
-	 * archive recovery, if required. Skip, if this info disabled.
-	 *
-	 * If we are shutting down, or Startup process is completing crash
-	 * recovery we don't need to write running xact data.
-	 */
-	if (!shutdown && XLogStandbyInfoActive())
-		LogStandbySnapshot();
-
-	START_CRIT_SECTION();
-
-	/*
-	 * Now insert the checkpoint record into XLOG.
-	 */
-	XLogBeginInsert();
-	XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
-	recptr = XLogInsert(RM_XLOG_ID,
-						shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
-						XLOG_CHECKPOINT_ONLINE);
-
-	XLogFlush(recptr);
-
-	/*
-	 * We mustn't write any new WAL after a shutdown checkpoint, or it will be
-	 * overwritten at next startup.  No-one should even try, this just allows
-	 * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
-	 * to just temporarily disable writing until the system has exited
-	 * recovery.
-	 */
-	if (shutdown)
-	{
-		if (flags & CHECKPOINT_END_OF_RECOVERY)
-			LocalXLogInsertAllowed = -1;	/* return to "check" state */
-		else
-			LocalXLogInsertAllowed = 0; /* never again write WAL */
-	}
-
-	/*
-	 * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
-	 * = end of actual checkpoint record.
-	 */
-	if (shutdown && checkPoint.redo != ProcLastRecPtr)
-		ereport(PANIC,
-				(errmsg("concurrent write-ahead log activity while database system is shutting down")));
-
-	/*
-	 * Remember the prior checkpoint's redo ptr for
-	 * UpdateCheckPointDistanceEstimate()
-	 */
-	PriorRedoPtr = ControlFile->checkPointCopy.redo;
-
-	/*
-	 * Update the control file.
-	 */
-	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-	if (shutdown)
-		ControlFile->state = DB_SHUTDOWNED;
-	ControlFile->checkPoint = ProcLastRecPtr;
-	ControlFile->checkPointCopy = checkPoint;
-	ControlFile->time = (pg_time_t) time(NULL);
-	/* crash recovery should always recover to the end of WAL */
-	ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
-	ControlFile->minRecoveryPointTLI = 0;
-
-	/*
-	 * Persist unloggedLSN value. It's reset on crash recovery, so this goes
-	 * unused on non-shutdown checkpoints, but seems useful to store it always
-	 * for debugging purposes.
-	 */
-	SpinLockAcquire(&XLogCtl->ulsn_lck);
-	ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
-	SpinLockRelease(&XLogCtl->ulsn_lck);
-
-	UpdateControlFile();
-	LWLockRelease(ControlFileLock);
-
-	/* Update shared-memory copy of checkpoint XID/epoch */
-	SpinLockAcquire(&XLogCtl->info_lck);
-	XLogCtl->ckptFullXid = checkPoint.nextXid;
-	SpinLockRelease(&XLogCtl->info_lck);
-
-	/*
-	 * We are now done with critical updates; no need for system panic if we
-	 * have trouble while fooling with old log segments.
-	 */
-	END_CRIT_SECTION();
-
-	/*
-	 * Let smgr do post-checkpoint cleanup (eg, deleting old files).
-	 */
-	SyncPostCheckpoint();
-
-	/*
-	 * Update the average distance between checkpoints if the prior checkpoint
-	 * exists.
-	 */
-	if (PriorRedoPtr != InvalidXLogRecPtr)
-		UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
-
-	/*
-	 * Delete old log files, those no longer needed for last checkpoint to
-	 * prevent the disk holding the xlog from growing full.
-	 */
-	XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
-	KeepLogSeg(recptr, &_logSegNo);
-	InvalidateObsoleteReplicationSlots(_logSegNo);
-	_logSegNo--;
-	RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr);
-
-	/*
-	 * Make more log segments if needed.  (Do this after recycling old log
-	 * segments, since that may supply some of the needed files.)
-	 */
-	if (!shutdown)
-		PreallocXlogFiles(recptr);
-
-	/*
-	 * Truncate pg_subtrans if possible.  We can throw away all data before
-	 * the oldest XMIN of any running transaction.  No future transaction will
-	 * attempt to reference any pg_subtrans entry older than that (see Asserts
-	 * in subtrans.c).  During recovery, though, we mustn't do this because
-	 * StartupSUBTRANS hasn't been called yet.
-	 */
-	if (!RecoveryInProgress())
-		TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
-
-	/* Real work is done, but log and update stats before releasing lock. */
-	LogCheckpointEnd(false);
-
-	/* Reset the process title */
-	update_checkpoint_display(flags, false, true);
-
-	TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
-									 NBuffers,
-									 CheckpointStats.ckpt_segs_added,
-									 CheckpointStats.ckpt_segs_removed,
-									 CheckpointStats.ckpt_segs_recycled);
-}
-
-/*
- * Mark the end of recovery in WAL though without running a full checkpoint.
- * We can expect that a restartpoint is likely to be in progress as we
- * do this, though we are unwilling to wait for it to complete.
- *
- * CreateRestartPoint() allows for the case where recovery may end before
- * the restartpoint completes so there is no concern of concurrent behaviour.
- */
-static void
-CreateEndOfRecoveryRecord(void)
-{
-	xl_end_of_recovery xlrec;
-	XLogRecPtr	recptr;
-
-	/* sanity check */
-	if (!RecoveryInProgress())
-		elog(ERROR, "can only be used to end recovery");
-
-	xlrec.end_time = GetCurrentTimestamp();
-
-	WALInsertLockAcquireExclusive();
-	xlrec.ThisTimeLineID = ThisTimeLineID;
-	xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
-	WALInsertLockRelease();
-
-	LocalSetXLogInsertAllowed();
-
-	START_CRIT_SECTION();
-
-	XLogBeginInsert();
-	XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
-	recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
-
-	XLogFlush(recptr);
-
-	/*
-	 * Update the control file so that crash recovery can follow the timeline
-	 * changes to this point.
-	 */
-	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-	ControlFile->time = (pg_time_t) time(NULL);
-	ControlFile->minRecoveryPoint = recptr;
-	ControlFile->minRecoveryPointTLI = ThisTimeLineID;
-	UpdateControlFile();
-	LWLockRelease(ControlFileLock);
-
-	END_CRIT_SECTION();
-
-	LocalXLogInsertAllowed = -1;	/* return to "check" state */
-}
-
-/*
- * Flush all data in shared memory to disk, and fsync
- *
- * This is the common code shared between regular checkpoints and
- * recovery restartpoints.
- */
-static void
-CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
-{
-	CheckPointRelationMap();
-	CheckPointReplicationSlots();
-	CheckPointSnapBuild();
-	CheckPointLogicalRewriteHeap();
-	CheckPointReplicationOrigin();
-
-	/* Write out all dirty data in SLRUs and the main buffer pool */
-	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
-	CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
-	CheckPointCLOG();
-	CheckPointCommitTs();
-	CheckPointSUBTRANS();
-	CheckPointMultiXact();
-	CheckPointPredicate();
-	CheckPointBuffers(flags);
-
-	/* Perform all queued up fsyncs */
-	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
-	CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
-	ProcessSyncRequests();
-	CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
-	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
-
-	/* We deliberately delay 2PC checkpointing as long as possible */
-	CheckPointTwoPhase(checkPointRedo);
-}
-
-/*
- * Save a checkpoint for recovery restart if appropriate
- *
- * This function is called each time a checkpoint record is read from XLOG.
- * It must determine whether the checkpoint represents a safe restartpoint or
- * not.  If so, the checkpoint record is stashed in shared memory so that
- * CreateRestartPoint can consult it.  (Note that the latter function is
- * executed by the checkpointer, while this one will be executed by the
- * startup process.)
- */
-static void
-RecoveryRestartPoint(const CheckPoint *checkPoint)
-{
-	/*
-	 * Also refrain from creating a restartpoint if we have seen any
-	 * references to non-existent pages. Restarting recovery from the
-	 * restartpoint would not see the references, so we would lose the
-	 * cross-check that the pages belonged to a relation that was dropped
-	 * later.
-	 */
-	if (XLogHaveInvalidPages())
-	{
-		elog(trace_recovery(DEBUG2),
-			 "could not record restart point at %X/%X because there "
-			 "are unresolved references to invalid pages",
-			 LSN_FORMAT_ARGS(checkPoint->redo));
-		return;
-	}
-
-	/*
-	 * Copy the checkpoint record to shared memory, so that checkpointer can
-	 * work out the next time it wants to perform a restartpoint.
-	 */
-	SpinLockAcquire(&XLogCtl->info_lck);
-	XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
-	XLogCtl->lastCheckPointEndPtr = EndRecPtr;
-	XLogCtl->lastCheckPoint = *checkPoint;
-	SpinLockRelease(&XLogCtl->info_lck);
-}
-
-/*
- * Establish a restartpoint if possible.
- *
- * This is similar to CreateCheckPoint, but is used during WAL recovery
- * to establish a point from which recovery can roll forward without
- * replaying the entire recovery log.
- *
- * Returns true if a new restartpoint was established. We can only establish
- * a restartpoint if we have replayed a safe checkpoint record since last
- * restartpoint.
- */
-bool
-CreateRestartPoint(int flags)
-{
-	XLogRecPtr	lastCheckPointRecPtr;
-	XLogRecPtr	lastCheckPointEndPtr;
-	CheckPoint	lastCheckPoint;
-	XLogRecPtr	PriorRedoPtr;
-	XLogRecPtr	receivePtr;
-	XLogRecPtr	replayPtr;
-	TimeLineID	replayTLI;
-	XLogRecPtr	endptr;
-	XLogSegNo	_logSegNo;
-	TimestampTz xtime;
-
-	/* Get a local copy of the last safe checkpoint record. */
-	SpinLockAcquire(&XLogCtl->info_lck);
-	lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
-	lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
-	lastCheckPoint = XLogCtl->lastCheckPoint;
-	SpinLockRelease(&XLogCtl->info_lck);
-
-	/*
-	 * Check that we're still in recovery mode. It's ok if we exit recovery
-	 * mode after this check, the restart point is valid anyway.
-	 */
-	if (!RecoveryInProgress())
-	{
-		ereport(DEBUG2,
-				(errmsg_internal("skipping restartpoint, recovery has already ended")));
-		return false;
-	}
-
-	/*
-	 * If the last checkpoint record we've replayed is already our last
-	 * restartpoint, we can't perform a new restart point. We still update
-	 * minRecoveryPoint in that case, so that if this is a shutdown restart
-	 * point, we won't start up earlier than before. That's not strictly
-	 * necessary, but when hot standby is enabled, it would be rather weird if
-	 * the database opened up for read-only connections at a point-in-time
-	 * before the last shutdown. Such time travel is still possible in case of
-	 * immediate shutdown, though.
-	 *
-	 * We don't explicitly advance minRecoveryPoint when we do create a
-	 * restartpoint. It's assumed that flushing the buffers will do that as a
-	 * side-effect.
-	 */
-	if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
-		lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
-	{
-		ereport(DEBUG2,
-				(errmsg_internal("skipping restartpoint, already performed at %X/%X",
-								 LSN_FORMAT_ARGS(lastCheckPoint.redo))));
-
-		UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
-		if (flags & CHECKPOINT_IS_SHUTDOWN)
-		{
-			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
-			ControlFile->time = (pg_time_t) time(NULL);
-			UpdateControlFile();
-			LWLockRelease(ControlFileLock);
-		}
-		return false;
-	}
-
-	/*
-	 * Update the shared RedoRecPtr so that the startup process can calculate
-	 * the number of segments replayed since last restartpoint, and request a
-	 * restartpoint if it exceeds CheckPointSegments.
-	 *
-	 * Like in CreateCheckPoint(), hold off insertions to update it, although
-	 * during recovery this is just pro forma, because no WAL insertions are
-	 * happening.
-	 */
-	WALInsertLockAcquireExclusive();
-	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
-	WALInsertLockRelease();
-
-	/* Also update the info_lck-protected copy */
-	SpinLockAcquire(&XLogCtl->info_lck);
-	XLogCtl->RedoRecPtr = lastCheckPoint.redo;
-	SpinLockRelease(&XLogCtl->info_lck);
-
-	/*
-	 * Prepare to accumulate statistics.
-	 *
-	 * Note: because it is possible for log_checkpoints to change while a
-	 * checkpoint proceeds, we always accumulate stats, even if
-	 * log_checkpoints is currently off.
-	 */
-	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
-	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
-
-	if (log_checkpoints)
-		LogCheckpointStart(flags, true);
-
-	/* Update the process title */
-	update_checkpoint_display(flags, true, false);
-
-	CheckPointGuts(lastCheckPoint.redo, flags);
-
-	/*
-	 * Remember the prior checkpoint's redo ptr for
-	 * UpdateCheckPointDistanceEstimate()
-	 */
-	PriorRedoPtr = ControlFile->checkPointCopy.redo;
-
-	/*
-	 * Update pg_control, using current time.  Check that it still shows
-	 * DB_IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
-	 * this is a quick hack to make sure nothing really bad happens if somehow
-	 * we get here after the end-of-recovery checkpoint.
-	 */
-	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
-		ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
-	{
-		ControlFile->checkPoint = lastCheckPointRecPtr;
-		ControlFile->checkPointCopy = lastCheckPoint;
-		ControlFile->time = (pg_time_t) time(NULL);
-
-		/*
-		 * Ensure minRecoveryPoint is past the checkpoint record.  Normally,
-		 * this will have happened already while writing out dirty buffers,
-		 * but not necessarily - e.g. because no buffers were dirtied.  We do
-		 * this because a non-exclusive base backup uses minRecoveryPoint to
-		 * determine which WAL files must be included in the backup, and the
-		 * file (or files) containing the checkpoint record must be included,
-		 * at a minimum. Note that for an ordinary restart of recovery there's
-		 * no value in having the minimum recovery point any earlier than this
-		 * anyway, because redo will begin just after the checkpoint record.
-		 */
-		if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
-		{
-			ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
-			ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
-
-			/* update local copy */
-			minRecoveryPoint = ControlFile->minRecoveryPoint;
-			minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
-		}
-		if (flags & CHECKPOINT_IS_SHUTDOWN)
-			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
-		UpdateControlFile();
-	}
-	LWLockRelease(ControlFileLock);
-
-	/*
-	 * Update the average distance between checkpoints/restartpoints if the
-	 * prior checkpoint exists.
-	 */
-	if (PriorRedoPtr != InvalidXLogRecPtr)
-		UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
-
-	/*
-	 * Delete old log files, those no longer needed for last restartpoint to
-	 * prevent the disk holding the xlog from growing full.
-	 */
-	XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
-
-	/*
-	 * Retreat _logSegNo using the current end of xlog replayed or received,
-	 * whichever is later.
-	 */
-	receivePtr = GetWalRcvFlushRecPtr(NULL, NULL);
-	replayPtr = GetXLogReplayRecPtr(&replayTLI);
-	endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
-	KeepLogSeg(endptr, &_logSegNo);
-	InvalidateObsoleteReplicationSlots(_logSegNo);
-	_logSegNo--;
-
-	/*
-	 * Try to recycle segments on a useful timeline. If we've been promoted
-	 * since the beginning of this restartpoint, use the new timeline chosen
-	 * at end of recovery (RecoveryInProgress() sets ThisTimeLineID in that
-	 * case). If we're still in recovery, use the timeline we're currently
-	 * replaying.
-	 *
-	 * There is no guarantee that the WAL segments will be useful on the
-	 * current timeline; if recovery proceeds to a new timeline right after
-	 * this, the pre-allocated WAL segments on this timeline will not be used,
-	 * and will go wasted until recycled on the next restartpoint. We'll live
-	 * with that.
-	 */
-	if (RecoveryInProgress())
-		ThisTimeLineID = replayTLI;
-
-	RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr);
-
-	/*
-	 * Make more log segments if needed.  (Do this after recycling old log
-	 * segments, since that may supply some of the needed files.)
-	 */
-	PreallocXlogFiles(endptr);
-
-	/*
-	 * ThisTimeLineID is normally not set when we're still in recovery.
-	 * However, recycling/preallocating segments above needed ThisTimeLineID
-	 * to determine which timeline to install the segments on. Reset it now,
-	 * to restore the normal state of affairs for debugging purposes.
-	 */
-	if (RecoveryInProgress())
-		ThisTimeLineID = 0;
-
-	/*
-	 * Truncate pg_subtrans if possible.  We can throw away all data before
-	 * the oldest XMIN of any running transaction.  No future transaction will
-	 * attempt to reference any pg_subtrans entry older than that (see Asserts
-	 * in subtrans.c).  When hot standby is disabled, though, we mustn't do
-	 * this because StartupSUBTRANS hasn't been called yet.
-	 */
-	if (EnableHotStandby)
-		TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
-
-	/* Real work is done, but log and update before releasing lock. */
-	LogCheckpointEnd(true);
-
-	/* Reset the process title */
-	update_checkpoint_display(flags, true, true);
-
-	xtime = GetLatestXTime();
-	ereport((log_checkpoints ? LOG : DEBUG2),
-			(errmsg("recovery restart point at %X/%X",
-					LSN_FORMAT_ARGS(lastCheckPoint.redo)),
-			 xtime ? errdetail("Last completed transaction was at log time %s.",
-							   timestamptz_to_str(xtime)) : 0));
-
-	/*
-	 * Finally, execute archive_cleanup_command, if any.
-	 */
-	if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0)
-		ExecuteRecoveryCommand(archiveCleanupCommand,
-							   "archive_cleanup_command",
-							   false);
-
-	return true;
-}
-
-/*
- * Report availability of WAL for the given target LSN
- *		(typically a slot's restart_lsn)
- *
- * Returns one of the following enum values:
- *
- * * WALAVAIL_RESERVED means targetLSN is available and it is in the range of
- *   max_wal_size.
- *
- * * WALAVAIL_EXTENDED means it is still available by preserving extra
- *   segments beyond max_wal_size. If max_slot_wal_keep_size is smaller
- *   than max_wal_size, this state is not returned.
- *
- * * WALAVAIL_UNRESERVED means it is being lost and the next checkpoint will
- *   remove reserved segments. The walsender using this slot may return to the
- *   above.
- *
- * * WALAVAIL_REMOVED means it has been removed. A replication stream on
- *   a slot with this LSN cannot continue after a restart.
- *
- * * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL.
- */
-WALAvailability
-GetWALAvailability(XLogRecPtr targetLSN)
-{
-	XLogRecPtr	currpos;		/* current write LSN */
-	XLogSegNo	currSeg;		/* segid of currpos */
-	XLogSegNo	targetSeg;		/* segid of targetLSN */
-	XLogSegNo	oldestSeg;		/* actual oldest segid */
-	XLogSegNo	oldestSegMaxWalSize;	/* oldest segid kept by max_wal_size */
-	XLogSegNo	oldestSlotSeg;	/* oldest segid kept by slot */
-	uint64		keepSegs;
-
-	/*
-	 * slot does not reserve WAL. Either deactivated, or has never been active
-	 */
-	if (XLogRecPtrIsInvalid(targetLSN))
-		return WALAVAIL_INVALID_LSN;
-
-	/*
-	 * Calculate the oldest segment currently reserved by all slots,
-	 * considering wal_keep_size and max_slot_wal_keep_size.  Initialize
-	 * oldestSlotSeg to the current segment.
-	 */
-	currpos = GetXLogWriteRecPtr();
-	XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size);
-	KeepLogSeg(currpos, &oldestSlotSeg);
-
-	/*
-	 * Find the oldest extant segment file. We get 1 until checkpoint removes
-	 * the first WAL segment file since startup, which causes the status being
-	 * wrong under certain abnormal conditions but that doesn't actually harm.
-	 */
-	oldestSeg = XLogGetLastRemovedSegno() + 1;
-
-	/* calculate oldest segment by max_wal_size */
-	XLByteToSeg(currpos, currSeg, wal_segment_size);
-	keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1;
-
-	if (currSeg > keepSegs)
-		oldestSegMaxWalSize = currSeg - keepSegs;
-	else
-		oldestSegMaxWalSize = 1;
-
-	/* the segment we care about */
-	XLByteToSeg(targetLSN, targetSeg, wal_segment_size);
-
-	/*
-	 * No point in returning reserved or extended status values if the
-	 * targetSeg is known to be lost.
-	 */
-	if (targetSeg >= oldestSlotSeg)
-	{
-		/* show "reserved" when targetSeg is within max_wal_size */
-		if (targetSeg >= oldestSegMaxWalSize)
-			return WALAVAIL_RESERVED;
-
-		/* being retained by slots exceeding max_wal_size */
-		return WALAVAIL_EXTENDED;
-	}
-
-	/* WAL segments are no longer retained but haven't been removed yet */
-	if (targetSeg >= oldestSeg)
-		return WALAVAIL_UNRESERVED;
-
-	/* Definitely lost */
-	return WALAVAIL_REMOVED;
-}
-
-
-/*
- * Retreat *logSegNo to the last segment that we need to retain because of
- * either wal_keep_size or replication slots.
- *
- * This is calculated by subtracting wal_keep_size from the given xlog
- * location, recptr and by making sure that that result is below the
- * requirement of replication slots.  For the latter criterion we do consider
- * the effects of max_slot_wal_keep_size: reserve at most that much space back
- * from recptr.
- */
-static void
-KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
-{
-	XLogSegNo	currSegNo;
-	XLogSegNo	segno;
-	XLogRecPtr	keep;
-
-	XLByteToSeg(recptr, currSegNo, wal_segment_size);
-	segno = currSegNo;
-
-	/*
-	 * Calculate how many segments are kept by slots first, adjusting for
-	 * max_slot_wal_keep_size.
-	 */
-	keep = XLogGetReplicationSlotMinimumLSN();
-	if (keep != InvalidXLogRecPtr)
-	{
-		XLByteToSeg(keep, segno, wal_segment_size);
-
-		/* Cap by max_slot_wal_keep_size ... */
-		if (max_slot_wal_keep_size_mb >= 0)
-		{
-			uint64		slot_keep_segs;
-
-			slot_keep_segs =
-				ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size);
-
-			if (currSegNo - segno > slot_keep_segs)
-				segno = currSegNo - slot_keep_segs;
-		}
-	}
-
-	/* but, keep at least wal_keep_size if that's set */
-	if (wal_keep_size_mb > 0)
-	{
-		uint64		keep_segs;
-
-		keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size);
-		if (currSegNo - segno < keep_segs)
-		{
-			/* avoid underflow, don't go below 1 */
-			if (currSegNo <= keep_segs)
-				segno = 1;
-			else
-				segno = currSegNo - keep_segs;
-		}
-	}
-
-	/* don't delete WAL segments newer than the calculated segment */
-	if (segno < *logSegNo)
-		*logSegNo = segno;
-}
-
-/*
- * Write a NEXTOID log record
- */
-void
-XLogPutNextOid(Oid nextOid)
-{
-	XLogBeginInsert();
-	XLogRegisterData((char *) (&nextOid), sizeof(Oid));
-	(void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
-
-	/*
-	 * We need not flush the NEXTOID record immediately, because any of the
-	 * just-allocated OIDs could only reach disk as part of a tuple insert or
-	 * update that would have its own XLOG record that must follow the NEXTOID
-	 * record.  Therefore, the standard buffer LSN interlock applied to those
-	 * records will ensure no such OID reaches disk before the NEXTOID record
-	 * does.
-	 *
-	 * Note, however, that the above statement only covers state "within" the
-	 * database.  When we use a generated OID as a file or directory name, we
-	 * are in a sense violating the basic WAL rule, because that filesystem
-	 * change may reach disk before the NEXTOID WAL record does.  The impact
-	 * of this is that if a database crash occurs immediately afterward, we
-	 * might after restart re-generate the same OID and find that it conflicts
-	 * with the leftover file or directory.  But since for safety's sake we
-	 * always loop until finding a nonconflicting filename, this poses no real
-	 * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
-	 */
-}
-
-/*
- * Write an XLOG SWITCH record.
- *
- * Here we just blindly issue an XLogInsert request for the record.
- * All the magic happens inside XLogInsert.
- *
- * The return value is either the end+1 address of the switch record,
- * or the end+1 address of the prior segment if we did not need to
- * write a switch record because we are already at segment start.
- */
-XLogRecPtr
-RequestXLogSwitch(bool mark_unimportant)
-{
-	XLogRecPtr	RecPtr;
-
-	/* XLOG SWITCH has no data */
-	XLogBeginInsert();
-
-	if (mark_unimportant)
-		XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
-	RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
-
-	return RecPtr;
-}
-
-/*
- * Write a RESTORE POINT record
- */
-XLogRecPtr
-XLogRestorePoint(const char *rpName)
-{
-	XLogRecPtr	RecPtr;
-	xl_restore_point xlrec;
-
-	xlrec.rp_time = GetCurrentTimestamp();
-	strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
-
-	XLogBeginInsert();
-	XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
-
-	RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
-
-	ereport(LOG,
-			(errmsg("restore point \"%s\" created at %X/%X",
-					rpName, LSN_FORMAT_ARGS(RecPtr))));
-
-	return RecPtr;
-}
-
-/*
- * Check if any of the GUC parameters that are critical for hot standby
- * have changed, and update the value in pg_control file if necessary.
- */
-static void
-XLogReportParameters(void)
-{
-	if (wal_level != ControlFile->wal_level ||
-		wal_log_hints != ControlFile->wal_log_hints ||
-		MaxConnections != ControlFile->MaxConnections ||
-		max_worker_processes != ControlFile->max_worker_processes ||
-		max_wal_senders != ControlFile->max_wal_senders ||
-		max_prepared_xacts != ControlFile->max_prepared_xacts ||
-		max_locks_per_xact != ControlFile->max_locks_per_xact ||
-		track_commit_timestamp != ControlFile->track_commit_timestamp)
-	{
-		/*
-		 * The change in number of backend slots doesn't need to be WAL-logged
-		 * if archiving is not enabled, as you can't start archive recovery
-		 * with wal_level=minimal anyway. We don't really care about the
-		 * values in pg_control either if wal_level=minimal, but seems better
-		 * to keep them up-to-date to avoid confusion.
-		 */
-		if (wal_level != ControlFile->wal_level || XLogIsNeeded())
-		{
-			xl_parameter_change xlrec;
-			XLogRecPtr	recptr;
-
-			xlrec.MaxConnections = MaxConnections;
-			xlrec.max_worker_processes = max_worker_processes;
-			xlrec.max_wal_senders = max_wal_senders;
-			xlrec.max_prepared_xacts = max_prepared_xacts;
-			xlrec.max_locks_per_xact = max_locks_per_xact;
-			xlrec.wal_level = wal_level;
-			xlrec.wal_log_hints = wal_log_hints;
-			xlrec.track_commit_timestamp = track_commit_timestamp;
-
-			XLogBeginInsert();
-			XLogRegisterData((char *) &xlrec, sizeof(xlrec));
-
-			recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
-			XLogFlush(recptr);
-		}
-
-		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-
-		ControlFile->MaxConnections = MaxConnections;
-		ControlFile->max_worker_processes = max_worker_processes;
-		ControlFile->max_wal_senders = max_wal_senders;
-		ControlFile->max_prepared_xacts = max_prepared_xacts;
-		ControlFile->max_locks_per_xact = max_locks_per_xact;
-		ControlFile->wal_level = wal_level;
-		ControlFile->wal_log_hints = wal_log_hints;
-		ControlFile->track_commit_timestamp = track_commit_timestamp;
-		UpdateControlFile();
-
-		LWLockRelease(ControlFileLock);
-	}
-}
-
-/*
- * Update full_page_writes in shared memory, and write an
- * XLOG_FPW_CHANGE record if necessary.
- *
- * Note: this function assumes there is no other process running
- * concurrently that could update it.
- */
-void
-UpdateFullPageWrites(void)
-{
-	XLogCtlInsert *Insert = &XLogCtl->Insert;
-	bool		recoveryInProgress;
-
-	/*
-	 * Do nothing if full_page_writes has not been changed.
-	 *
-	 * It's safe to check the shared full_page_writes without the lock,
-	 * because we assume that there is no concurrently running process which
-	 * can update it.
-	 */
-	if (fullPageWrites == Insert->fullPageWrites)
-		return;
-
-	/*
-	 * Perform this outside critical section so that the WAL insert
-	 * initialization done by RecoveryInProgress() doesn't trigger an
-	 * assertion failure.
-	 */
-	recoveryInProgress = RecoveryInProgress();
-
-	START_CRIT_SECTION();
-
-	/*
-	 * It's always safe to take full page images, even when not strictly
-	 * required, but not the other round. So if we're setting full_page_writes
-	 * to true, first set it true and then write the WAL record. If we're
-	 * setting it to false, first write the WAL record and then set the global
-	 * flag.
-	 */
-	if (fullPageWrites)
-	{
-		WALInsertLockAcquireExclusive();
-		Insert->fullPageWrites = true;
-		WALInsertLockRelease();
-	}
-
-	/*
-	 * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
-	 * full_page_writes during archive recovery, if required.
-	 */
-	if (XLogStandbyInfoActive() && !recoveryInProgress)
-	{
-		XLogBeginInsert();
-		XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));
-
-		XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
-	}
-
-	if (!fullPageWrites)
-	{
-		WALInsertLockAcquireExclusive();
-		Insert->fullPageWrites = false;
-		WALInsertLockRelease();
-	}
-	END_CRIT_SECTION();
-}
-
-/*
- * Check that it's OK to switch to new timeline during recovery.
- *
- * 'lsn' is the address of the shutdown checkpoint record we're about to
- * replay. (Currently, timeline can only change at a shutdown checkpoint).
- */
-static void
-checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
-{
-	/* Check that the record agrees on what the current (old) timeline is */
-	if (prevTLI != ThisTimeLineID)
-		ereport(PANIC,
-				(errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
-						prevTLI, ThisTimeLineID)));
-
-	/*
-	 * The new timeline better be in the list of timelines we expect to see,
-	 * according to the timeline history. It should also not decrease.
-	 */
-	if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
-		ereport(PANIC,
-				(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
-						newTLI, ThisTimeLineID)));
-
-	/*
-	 * If we have not yet reached min recovery point, and we're about to
-	 * switch to a timeline greater than the timeline of the min recovery
-	 * point: trouble. After switching to the new timeline, we could not
-	 * possibly visit the min recovery point on the correct timeline anymore.
-	 * This can happen if there is a newer timeline in the archive that
-	 * branched before the timeline the min recovery point is on, and you
-	 * attempt to do PITR to the new timeline.
-	 */
-	if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
-		lsn < minRecoveryPoint &&
-		newTLI > minRecoveryPointTLI)
-		ereport(PANIC,
-				(errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
-						newTLI,
-						LSN_FORMAT_ARGS(minRecoveryPoint),
-						minRecoveryPointTLI)));
-
-	/* Looks good */
-}
-
-/*
- * XLOG resource manager's routines
- *
- * Definitions of info values are in include/catalog/pg_control.h, though
- * not all record types are related to control file updates.
- */
-void
-xlog_redo(XLogReaderState *record)
-{
-	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
-	XLogRecPtr	lsn = record->EndRecPtr;
-
-	/* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */
-	Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
-		   !XLogRecHasAnyBlockRefs(record));
-
-	if (info == XLOG_NEXTOID)
-	{
-		Oid			nextOid;
-
-		/*
-		 * We used to try to take the maximum of ShmemVariableCache->nextOid
-		 * and the recorded nextOid, but that fails if the OID counter wraps
-		 * around.  Since no OID allocation should be happening during replay
-		 * anyway, better to just believe the record exactly.  We still take
-		 * OidGenLock while setting the variable, just in case.
-		 */
-		memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
-		LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
-		ShmemVariableCache->nextOid = nextOid;
-		ShmemVariableCache->oidCount = 0;
-		LWLockRelease(OidGenLock);
-	}
-	else if (info == XLOG_CHECKPOINT_SHUTDOWN)
-	{
-		CheckPoint	checkPoint;
-
-		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
-		/* In a SHUTDOWN checkpoint, believe the counters exactly */
-		LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
-		ShmemVariableCache->nextXid = checkPoint.nextXid;
-		LWLockRelease(XidGenLock);
-		LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
-		ShmemVariableCache->nextOid = checkPoint.nextOid;
-		ShmemVariableCache->oidCount = 0;
-		LWLockRelease(OidGenLock);
-		MultiXactSetNextMXact(checkPoint.nextMulti,
-							  checkPoint.nextMultiOffset);
-
-		MultiXactAdvanceOldest(checkPoint.oldestMulti,
-							   checkPoint.oldestMultiDB);
-
-		/*
-		 * No need to set oldestClogXid here as well; it'll be set when we
-		 * redo an xl_clog_truncate if it changed since initialization.
-		 */
-		SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
-
-		/*
-		 * If we see a shutdown checkpoint while waiting for an end-of-backup
-		 * record, the backup was canceled and the end-of-backup record will
-		 * never arrive.
-		 */
-		if (ArchiveRecoveryRequested &&
-			!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
-			XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
-			ereport(PANIC,
-					(errmsg("online backup was canceled, recovery cannot continue")));
-
-		/*
-		 * If we see a shutdown checkpoint, we know that nothing was running
-		 * on the primary at this point. So fake-up an empty running-xacts
-		 * record and use that here and now. Recover additional standby state
-		 * for prepared transactions.
-		 */
-		if (standbyState >= STANDBY_INITIALIZED)
-		{
-			TransactionId *xids;
-			int			nxids;
-			TransactionId oldestActiveXID;
-			TransactionId latestCompletedXid;
-			RunningTransactionsData running;
-
-			oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
-
-			/*
-			 * Construct a RunningTransactions snapshot representing a shut
-			 * down server, with only prepared transactions still alive. We're
-			 * never overflowed at this point because all subxids are listed
-			 * with their parent prepared transactions.
-			 */
-			running.xcnt = nxids;
-			running.subxcnt = 0;
-			running.subxid_overflow = false;
-			running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
-			running.oldestRunningXid = oldestActiveXID;
-			latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
-			TransactionIdRetreat(latestCompletedXid);
-			Assert(TransactionIdIsNormal(latestCompletedXid));
-			running.latestCompletedXid = latestCompletedXid;
-			running.xids = xids;
-
-			ProcArrayApplyRecoveryInfo(&running);
-
-			StandbyRecoverPreparedTransactions();
-		}
-
-		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
-		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
-		LWLockRelease(ControlFileLock);
-
-		/* Update shared-memory copy of checkpoint XID/epoch */
-		SpinLockAcquire(&XLogCtl->info_lck);
-		XLogCtl->ckptFullXid = checkPoint.nextXid;
-		SpinLockRelease(&XLogCtl->info_lck);
-
-		/*
-		 * We should've already switched to the new TLI before replaying this
-		 * record.
-		 */
-		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
-			ereport(PANIC,
-					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
-							checkPoint.ThisTimeLineID, ThisTimeLineID)));
-
-		RecoveryRestartPoint(&checkPoint);
-	}
-	else if (info == XLOG_CHECKPOINT_ONLINE)
-	{
-		CheckPoint	checkPoint;
-
-		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
-		/* In an ONLINE checkpoint, treat the XID counter as a minimum */
-		LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
-		if (FullTransactionIdPrecedes(ShmemVariableCache->nextXid,
-									  checkPoint.nextXid))
-			ShmemVariableCache->nextXid = checkPoint.nextXid;
-		LWLockRelease(XidGenLock);
-
-		/*
-		 * We ignore the nextOid counter in an ONLINE checkpoint, preferring
-		 * to track OID assignment through XLOG_NEXTOID records.  The nextOid
-		 * counter is from the start of the checkpoint and might well be stale
-		 * compared to later XLOG_NEXTOID records.  We could try to take the
-		 * maximum of the nextOid counter and our latest value, but since
-		 * there's no particular guarantee about the speed with which the OID
-		 * counter wraps around, that's a risky thing to do.  In any case,
-		 * users of the nextOid counter are required to avoid assignment of
-		 * duplicates, so that a somewhat out-of-date value should be safe.
-		 */
-
-		/* Handle multixact */
-		MultiXactAdvanceNextMXact(checkPoint.nextMulti,
-								  checkPoint.nextMultiOffset);
-
-		/*
-		 * NB: This may perform multixact truncation when replaying WAL
-		 * generated by an older primary.
-		 */
-		MultiXactAdvanceOldest(checkPoint.oldestMulti,
-							   checkPoint.oldestMultiDB);
-		if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
-								  checkPoint.oldestXid))
-			SetTransactionIdLimit(checkPoint.oldestXid,
-								  checkPoint.oldestXidDB);
-		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
-		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
-		LWLockRelease(ControlFileLock);
-
-		/* Update shared-memory copy of checkpoint XID/epoch */
-		SpinLockAcquire(&XLogCtl->info_lck);
-		XLogCtl->ckptFullXid = checkPoint.nextXid;
-		SpinLockRelease(&XLogCtl->info_lck);
-
-		/* TLI should not change in an on-line checkpoint */
-		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
-			ereport(PANIC,
-					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
-							checkPoint.ThisTimeLineID, ThisTimeLineID)));
-
-		RecoveryRestartPoint(&checkPoint);
-	}
-	else if (info == XLOG_END_OF_RECOVERY)
-	{
-		xl_end_of_recovery xlrec;
-
-		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
-
-		/*
-		 * For Hot Standby, we could treat this like a Shutdown Checkpoint,
-		 * but this case is rarer and harder to test, so the benefit doesn't
-		 * outweigh the potential extra cost of maintenance.
-		 */
-
-		/*
-		 * We should've already switched to the new TLI before replaying this
-		 * record.
-		 */
-		if (xlrec.ThisTimeLineID != ThisTimeLineID)
-			ereport(PANIC,
-					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
-							xlrec.ThisTimeLineID, ThisTimeLineID)));
-	}
-	else if (info == XLOG_NOOP)
-	{
-		/* nothing to do here */
-	}
-	else if (info == XLOG_SWITCH)
-	{
-		/* nothing to do here */
-	}
-	else if (info == XLOG_RESTORE_POINT)
-	{
-		/* nothing to do here */
-	}
-	else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
-	{
-		/*
-		 * Full-page image (FPI) records contain nothing else but a backup
-		 * block (or multiple backup blocks). Every block reference must
-		 * include a full-page image - otherwise there would be no point in
-		 * this record.
-		 *
-		 * No recovery conflicts are generated by these generic records - if a
-		 * resource manager needs to generate conflicts, it has to define a
-		 * separate WAL record type and redo routine.
-		 *
-		 * XLOG_FPI_FOR_HINT records are generated when a page needs to be
-		 * WAL- logged because of a hint bit update. They are only generated
-		 * when checksums are enabled. There is no difference in handling
-		 * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
-		 * code just to distinguish them for statistics purposes.
-		 */
-		for (uint8 block_id = 0; block_id <= record->max_block_id; block_id++)
-		{
-			Buffer		buffer;
-
-			if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
-				elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
-			UnlockReleaseBuffer(buffer);
-		}
-	}
-	else if (info == XLOG_BACKUP_END)
-	{
-		XLogRecPtr	startpoint;
-
-		memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
-
-		if (ControlFile->backupStartPoint == startpoint)
-		{
-			/*
-			 * We have reached the end of base backup, the point where
-			 * pg_stop_backup() was done. The data on disk is now consistent.
-			 * Reset backupStartPoint, and update minRecoveryPoint to make
-			 * sure we don't allow starting up at an earlier point even if
-			 * recovery is stopped and restarted soon after this.
-			 */
-			elog(DEBUG1, "end of backup reached");
-
-			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-
-			if (ControlFile->minRecoveryPoint < lsn)
-			{
-				ControlFile->minRecoveryPoint = lsn;
-				ControlFile->minRecoveryPointTLI = ThisTimeLineID;
-			}
-			ControlFile->backupStartPoint = InvalidXLogRecPtr;
-			ControlFile->backupEndRequired = false;
-			UpdateControlFile();
-
-			LWLockRelease(ControlFileLock);
-		}
-	}
-	else if (info == XLOG_PARAMETER_CHANGE)
-	{
-		xl_parameter_change xlrec;
-
-		/* Update our copy of the parameters in pg_control */
-		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
-
-		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-		ControlFile->MaxConnections = xlrec.MaxConnections;
-		ControlFile->max_worker_processes = xlrec.max_worker_processes;
-		ControlFile->max_wal_senders = xlrec.max_wal_senders;
-		ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
-		ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
-		ControlFile->wal_level = xlrec.wal_level;
-		ControlFile->wal_log_hints = xlrec.wal_log_hints;
-
-		/*
-		 * Update minRecoveryPoint to ensure that if recovery is aborted, we
-		 * recover back up to this point before allowing hot standby again.
-		 * This is important if the max_* settings are decreased, to ensure
-		 * you don't run queries against the WAL preceding the change. The
-		 * local copies cannot be updated as long as crash recovery is
-		 * happening and we expect all the WAL to be replayed.
-		 */
-		if (InArchiveRecovery)
-		{
-			minRecoveryPoint = ControlFile->minRecoveryPoint;
-			minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
-		}
-		if (minRecoveryPoint != InvalidXLogRecPtr && minRecoveryPoint < lsn)
-		{
-			ControlFile->minRecoveryPoint = lsn;
-			ControlFile->minRecoveryPointTLI = ThisTimeLineID;
-		}
-
-		CommitTsParameterChange(xlrec.track_commit_timestamp,
-								ControlFile->track_commit_timestamp);
-		ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
-
-		UpdateControlFile();
-		LWLockRelease(ControlFileLock);
-
-		/* Check to see if any parameter change gives a problem on recovery */
-		CheckRequiredParameterValues();
-	}
-	else if (info == XLOG_FPW_CHANGE)
-	{
-		bool		fpw;
-
-		memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
-
-		/*
-		 * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
-		 * do_pg_start_backup() and do_pg_stop_backup() can check whether
-		 * full_page_writes has been disabled during online backup.
-		 */
-		if (!fpw)
-		{
-			SpinLockAcquire(&XLogCtl->info_lck);
-			if (XLogCtl->lastFpwDisableRecPtr < ReadRecPtr)
-				XLogCtl->lastFpwDisableRecPtr = ReadRecPtr;
-			SpinLockRelease(&XLogCtl->info_lck);
-		}
-
-		/* Keep track of full_page_writes */
-		lastFullPageWrites = fpw;
-	}
-}
-
-#ifdef WAL_DEBUG
-
-static void
-xlog_outrec(StringInfo buf, XLogReaderState *record)
-{
-	appendStringInfo(buf, "prev %X/%X; xid %u",
-					 LSN_FORMAT_ARGS(XLogRecGetPrev(record)),
-					 XLogRecGetXid(record));
-
-	appendStringInfo(buf, "; len %u",
-					 XLogRecGetDataLen(record));
-
-	xlog_block_info(buf, record);
-}
-#endif							/* WAL_DEBUG */
-
-/*
- * Returns a string giving information about all the blocks in an
- * XLogRecord.
- */
-static void
-xlog_block_info(StringInfo buf, XLogReaderState *record)
-{
-	int			block_id;
-
-	/* decode block references */
-	for (block_id = 0; block_id <= record->max_block_id; block_id++)
-	{
-		RelFileNode rnode;
-		ForkNumber	forknum;
-		BlockNumber blk;
-
-		if (!XLogRecHasBlockRef(record, block_id))
-			continue;
-
-		XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
-		if (forknum != MAIN_FORKNUM)
-			appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u",
-							 block_id,
-							 rnode.spcNode, rnode.dbNode, rnode.relNode,
-							 forknum,
-							 blk);
-		else
-			appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u",
-							 block_id,
-							 rnode.spcNode, rnode.dbNode, rnode.relNode,
-							 blk);
-		if (XLogRecHasBlockImage(record, block_id))
-			appendStringInfoString(buf, " FPW");
-	}
-}
-
-/*
- * Returns a string describing an XLogRecord, consisting of its identity
- * optionally followed by a colon, a space, and a further description.
- */
-static void
-xlog_outdesc(StringInfo buf, XLogReaderState *record)
-{
-	RmgrId		rmid = XLogRecGetRmid(record);
-	uint8		info = XLogRecGetInfo(record);
-	const char *id;
-
-	appendStringInfoString(buf, RmgrTable[rmid].rm_name);
-	appendStringInfoChar(buf, '/');
-
-	id = RmgrTable[rmid].rm_identify(info);
-	if (id == NULL)
-		appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
-	else
-		appendStringInfo(buf, "%s: ", id);
-
-	RmgrTable[rmid].rm_desc(buf, record);
-}
-
-
-/*
- * Return the (possible) sync flag used for opening a file, depending on the
- * value of the GUC wal_sync_method.
- */
-static int
-get_sync_bit(int method)
-{
-	int			o_direct_flag = 0;
-
-	/* If fsync is disabled, never open in sync mode */
-	if (!enableFsync)
-		return 0;
-
-	/*
-	 * Optimize writes by bypassing kernel cache with O_DIRECT when using
-	 * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
-	 * disabled, otherwise the archive command or walsender process will read
-	 * the WAL soon after writing it, which is guaranteed to cause a physical
-	 * read if we bypassed the kernel cache. We also skip the
-	 * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
-	 * reason.
-	 *
-	 * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
-	 * written by walreceiver is normally read by the startup process soon
-	 * after it's written. Also, walreceiver performs unaligned writes, which
-	 * don't work with O_DIRECT, so it is required for correctness too.
-	 */
-	if (!XLogIsNeeded() && !AmWalReceiverProcess())
-		o_direct_flag = PG_O_DIRECT;
-
-	switch (method)
-	{
-			/*
-			 * enum values for all sync options are defined even if they are
-			 * not supported on the current platform.  But if not, they are
-			 * not included in the enum option array, and therefore will never
-			 * be seen here.
-			 */
-		case SYNC_METHOD_FSYNC:
-		case SYNC_METHOD_FSYNC_WRITETHROUGH:
-		case SYNC_METHOD_FDATASYNC:
-			return 0;
-#ifdef OPEN_SYNC_FLAG
-		case SYNC_METHOD_OPEN:
-			return OPEN_SYNC_FLAG | o_direct_flag;
-#endif
-#ifdef OPEN_DATASYNC_FLAG
-		case SYNC_METHOD_OPEN_DSYNC:
-			return OPEN_DATASYNC_FLAG | o_direct_flag;
-#endif
-		default:
-			/* can't happen (unless we are out of sync with option array) */
-			elog(ERROR, "unrecognized wal_sync_method: %d", method);
-			return 0;			/* silence warning */
-	}
-}
-
-/*
- * GUC support
- */
-void
-assign_xlog_sync_method(int new_sync_method, void *extra)
-{
-	if (sync_method != new_sync_method)
-	{
-		/*
-		 * To ensure that no blocks escape unsynced, force an fsync on the
-		 * currently open log segment (if any).  Also, if the open flag is
-		 * changing, close the log file so it will be reopened (with new flag
-		 * bit) at next use.
-		 */
-		if (openLogFile >= 0)
-		{
-			pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
-			if (pg_fsync(openLogFile) != 0)
-			{
-				char		xlogfname[MAXFNAMELEN];
-				int			save_errno;
-
-				save_errno = errno;
-				XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo,
-							 wal_segment_size);
-				errno = save_errno;
-				ereport(PANIC,
-						(errcode_for_file_access(),
-						 errmsg("could not fsync file \"%s\": %m", xlogfname)));
-			}
-
-			pgstat_report_wait_end();
-			if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
-				XLogFileClose();
-		}
-	}
-}
-
-
-/*
- * Issue appropriate kind of fsync (if any) for an XLOG output file.
- *
- * 'fd' is a file descriptor for the XLOG file to be fsync'd.
- * 'segno' is for error reporting purposes.
- */
-void
-issue_xlog_fsync(int fd, XLogSegNo segno)
-{
-	char	   *msg = NULL;
-	instr_time	start;
-
-	/*
-	 * Quick exit if fsync is disabled or write() has already synced the WAL
-	 * file.
-	 */
-	if (!enableFsync ||
-		sync_method == SYNC_METHOD_OPEN ||
-		sync_method == SYNC_METHOD_OPEN_DSYNC)
-		return;
-
-	/* Measure I/O timing to sync the WAL file */
-	if (track_wal_io_timing)
-		INSTR_TIME_SET_CURRENT(start);
-
-	pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
-	switch (sync_method)
-	{
-		case SYNC_METHOD_FSYNC:
-			if (pg_fsync_no_writethrough(fd) != 0)
-				msg = _("could not fsync file \"%s\": %m");
-			break;
-#ifdef HAVE_FSYNC_WRITETHROUGH
-		case SYNC_METHOD_FSYNC_WRITETHROUGH:
-			if (pg_fsync_writethrough(fd) != 0)
-				msg = _("could not fsync write-through file \"%s\": %m");
-			break;
-#endif
-#ifdef HAVE_FDATASYNC
-		case SYNC_METHOD_FDATASYNC:
-			if (pg_fdatasync(fd) != 0)
-				msg = _("could not fdatasync file \"%s\": %m");
-			break;
-#endif
-		case SYNC_METHOD_OPEN:
-		case SYNC_METHOD_OPEN_DSYNC:
-			/* not reachable */
-			Assert(false);
-			break;
-		default:
-			elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
-			break;
-	}
-
-	/* PANIC if failed to fsync */
-	if (msg)
-	{
-		char		xlogfname[MAXFNAMELEN];
-		int			save_errno = errno;
-
-		XLogFileName(xlogfname, ThisTimeLineID, segno,
-					 wal_segment_size);
-		errno = save_errno;
-		ereport(PANIC,
-				(errcode_for_file_access(),
-				 errmsg(msg, xlogfname)));
-	}
-
-	pgstat_report_wait_end();
-
-	/*
-	 * Increment the I/O timing and the number of times WAL files were synced.
-	 */
-	if (track_wal_io_timing)
-	{
-		instr_time	duration;
-
-		INSTR_TIME_SET_CURRENT(duration);
-		INSTR_TIME_SUBTRACT(duration, start);
-		WalStats.m_wal_sync_time += INSTR_TIME_GET_MICROSEC(duration);
-	}
-
-	WalStats.m_wal_sync++;
-}
-
-/*
- * do_pg_start_backup
- *
- * Utility function called at the start of an online backup. It creates the
- * necessary starting checkpoint and constructs the backup label file.
- *
- * There are two kind of backups: exclusive and non-exclusive. An exclusive
- * backup is started with pg_start_backup(), and there can be only one active
- * at a time. The backup and tablespace map files of an exclusive backup are
- * written to $PGDATA/backup_label and $PGDATA/tablespace_map, and they are
- * removed by pg_stop_backup().
- *
- * A non-exclusive backup is used for the streaming base backups (see
- * src/backend/replication/basebackup.c). The difference to exclusive backups
- * is that the backup label and tablespace map files are not written to disk.
- * Instead, their would-be contents are returned in *labelfile and *tblspcmapfile,
- * and the caller is responsible for including them in the backup archive as
- * 'backup_label' and 'tablespace_map'. There can be many non-exclusive backups
- * active at the same time, and they don't conflict with an exclusive backup
- * either.
- *
- * labelfile and tblspcmapfile must be passed as NULL when starting an
- * exclusive backup, and as initially-empty StringInfos for a non-exclusive
- * backup.
- *
- * If "tablespaces" isn't NULL, it receives a list of tablespaceinfo structs
- * describing the cluster's tablespaces.
- *
- * tblspcmapfile is required mainly for tar format in windows as native windows
- * utilities are not able to create symlinks while extracting files from tar.
- * However for consistency, the same is used for all platforms.
- *
- * Returns the minimum WAL location that must be present to restore from this
- * backup, and the corresponding timeline ID in *starttli_p.
- *
- * Every successfully started non-exclusive backup must be stopped by calling
- * do_pg_stop_backup() or do_pg_abort_backup().
- *
- * It is the responsibility of the caller of this function to verify the
- * permissions of the calling user!
- */
-XLogRecPtr
-do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
-				   StringInfo labelfile, List **tablespaces,
-				   StringInfo tblspcmapfile)
-{
-	bool		exclusive = (labelfile == NULL);
-	bool		backup_started_in_recovery = false;
-	XLogRecPtr	checkpointloc;
-	XLogRecPtr	startpoint;
-	TimeLineID	starttli;
-	pg_time_t	stamp_time;
-	char		strfbuf[128];
-	char		xlogfilename[MAXFNAMELEN];
-	XLogSegNo	_logSegNo;
-	struct stat stat_buf;
-	FILE	   *fp;
-
-	backup_started_in_recovery = RecoveryInProgress();
-
-	/*
-	 * Currently only non-exclusive backup can be taken during recovery.
-	 */
-	if (backup_started_in_recovery && exclusive)
-		ereport(ERROR,
-				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				 errmsg("recovery is in progress"),
-				 errhint("WAL control functions cannot be executed during recovery.")));
-
-	/*
-	 * During recovery, we don't need to check WAL level. Because, if WAL
-	 * level is not sufficient, it's impossible to get here during recovery.
-	 */
-	if (!backup_started_in_recovery && !XLogIsNeeded())
-		ereport(ERROR,
-				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				 errmsg("WAL level not sufficient for making an online backup"),
-				 errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
-
-	if (strlen(backupidstr) > MAXPGPATH)
-		ereport(ERROR,
-				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-				 errmsg("backup label too long (max %d bytes)",
-						MAXPGPATH)));
-
-	/*
-	 * Mark backup active in shared memory.  We must do full-page WAL writes
-	 * during an on-line backup even if not doing so at other times, because
-	 * it's quite possible for the backup dump to obtain a "torn" (partially
-	 * written) copy of a database page if it reads the page concurrently with
-	 * our write to the same page.  This can be fixed as long as the first
-	 * write to the page in the WAL sequence is a full-page write. Hence, we
-	 * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
-	 * are no dirty pages in shared memory that might get dumped while the
-	 * backup is in progress without having a corresponding WAL record.  (Once
-	 * the backup is complete, we need not force full-page writes anymore,
-	 * since we expect that any pages not modified during the backup interval
-	 * must have been correctly captured by the backup.)
-	 *
-	 * Note that forcePageWrites has no effect during an online backup from
-	 * the standby.
-	 *
-	 * We must hold all the insertion locks to change the value of
-	 * forcePageWrites, to ensure adequate interlocking against
-	 * XLogInsertRecord().
-	 */
-	WALInsertLockAcquireExclusive();
-	if (exclusive)
-	{
-		/*
-		 * At first, mark that we're now starting an exclusive backup, to
-		 * ensure that there are no other sessions currently running
-		 * pg_start_backup() or pg_stop_backup().
-		 */
-		if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
-		{
-			WALInsertLockRelease();
-			ereport(ERROR,
-					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-					 errmsg("a backup is already in progress"),
-					 errhint("Run pg_stop_backup() and try again.")));
-		}
-		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING;
-	}
-	else
-		XLogCtl->Insert.nonExclusiveBackups++;
-	XLogCtl->Insert.forcePageWrites = true;
-	WALInsertLockRelease();
-
-	/* Ensure we release forcePageWrites if fail below */
-	PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
-	{
-		bool		gotUniqueStartpoint = false;
-		DIR		   *tblspcdir;
-		struct dirent *de;
-		tablespaceinfo *ti;
-		int			datadirpathlen;
-
-		/*
-		 * Force an XLOG file switch before the checkpoint, to ensure that the
-		 * WAL segment the checkpoint is written to doesn't contain pages with
-		 * old timeline IDs.  That would otherwise happen if you called
-		 * pg_start_backup() right after restoring from a PITR archive: the
-		 * first WAL segment containing the startup checkpoint has pages in
-		 * the beginning with the old timeline ID.  That can cause trouble at
-		 * recovery: we won't have a history file covering the old timeline if
-		 * pg_wal directory was not included in the base backup and the WAL
-		 * archive was cleared too before starting the backup.
-		 *
-		 * This also ensures that we have emitted a WAL page header that has
-		 * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
-		 * Therefore, if a WAL archiver (such as pglesslog) is trying to
-		 * compress out removable backup blocks, it won't remove any that
-		 * occur after this point.
-		 *
-		 * During recovery, we skip forcing XLOG file switch, which means that
-		 * the backup taken during recovery is not available for the special
-		 * recovery case described above.
-		 */
-		if (!backup_started_in_recovery)
-			RequestXLogSwitch(false);
-
-		do
-		{
-			bool		checkpointfpw;
-
-			/*
-			 * Force a CHECKPOINT.  Aside from being necessary to prevent torn
-			 * page problems, this guarantees that two successive backup runs
-			 * will have different checkpoint positions and hence different
-			 * history file names, even if nothing happened in between.
-			 *
-			 * During recovery, establish a restartpoint if possible. We use
-			 * the last restartpoint as the backup starting checkpoint. This
-			 * means that two successive backup runs can have same checkpoint
-			 * positions.
-			 *
-			 * Since the fact that we are executing do_pg_start_backup()
-			 * during recovery means that checkpointer is running, we can use
-			 * RequestCheckpoint() to establish a restartpoint.
-			 *
-			 * We use CHECKPOINT_IMMEDIATE only if requested by user (via
-			 * passing fast = true).  Otherwise this can take awhile.
-			 */
-			RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
-							  (fast ? CHECKPOINT_IMMEDIATE : 0));
-
-			/*
-			 * Now we need to fetch the checkpoint record location, and also
-			 * its REDO pointer.  The oldest point in WAL that would be needed
-			 * to restore starting from the checkpoint is precisely the REDO
-			 * pointer.
-			 */
-			LWLockAcquire(ControlFileLock, LW_SHARED);
-			checkpointloc = ControlFile->checkPoint;
-			startpoint = ControlFile->checkPointCopy.redo;
-			starttli = ControlFile->checkPointCopy.ThisTimeLineID;
-			checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
-			LWLockRelease(ControlFileLock);
-
-			if (backup_started_in_recovery)
-			{
-				XLogRecPtr	recptr;
-
-				/*
-				 * Check to see if all WAL replayed during online backup
-				 * (i.e., since last restartpoint used as backup starting
-				 * checkpoint) contain full-page writes.
-				 */
-				SpinLockAcquire(&XLogCtl->info_lck);
-				recptr = XLogCtl->lastFpwDisableRecPtr;
-				SpinLockRelease(&XLogCtl->info_lck);
-
-				if (!checkpointfpw || startpoint <= recptr)
-					ereport(ERROR,
-							(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-							 errmsg("WAL generated with full_page_writes=off was replayed "
-									"since last restartpoint"),
-							 errhint("This means that the backup being taken on the standby "
-									 "is corrupt and should not be used. "
-									 "Enable full_page_writes and run CHECKPOINT on the primary, "
-									 "and then try an online backup again.")));
-
-				/*
-				 * During recovery, since we don't use the end-of-backup WAL
-				 * record and don't write the backup history file, the
-				 * starting WAL location doesn't need to be unique. This means
-				 * that two base backups started at the same time might use
-				 * the same checkpoint as starting locations.
-				 */
-				gotUniqueStartpoint = true;
-			}
-
-			/*
-			 * If two base backups are started at the same time (in WAL sender
-			 * processes), we need to make sure that they use different
-			 * checkpoints as starting locations, because we use the starting
-			 * WAL location as a unique identifier for the base backup in the
-			 * end-of-backup WAL record and when we write the backup history
-			 * file. Perhaps it would be better generate a separate unique ID
-			 * for each backup instead of forcing another checkpoint, but
-			 * taking a checkpoint right after another is not that expensive
-			 * either because only few buffers have been dirtied yet.
-			 */
-			WALInsertLockAcquireExclusive();
-			if (XLogCtl->Insert.lastBackupStart < startpoint)
-			{
-				XLogCtl->Insert.lastBackupStart = startpoint;
-				gotUniqueStartpoint = true;
-			}
-			WALInsertLockRelease();
-		} while (!gotUniqueStartpoint);
-
-		XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
-		XLogFileName(xlogfilename, starttli, _logSegNo, wal_segment_size);
-
-		/*
-		 * Construct tablespace_map file.  If caller isn't interested in this,
-		 * we make a local StringInfo.
-		 */
-		if (tblspcmapfile == NULL)
-			tblspcmapfile = makeStringInfo();
-
-		datadirpathlen = strlen(DataDir);
-
-		/* Collect information about all tablespaces */
-		tblspcdir = AllocateDir("pg_tblspc");
-		while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
-		{
-			char		fullpath[MAXPGPATH + 10];
-			char		linkpath[MAXPGPATH];
-			char	   *relpath = NULL;
-			int			rllen;
-			StringInfoData escapedpath;
-			char	   *s;
-
-			/* Skip anything that doesn't look like a tablespace */
-			if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
-				continue;
-
-			snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name);
-
-#if defined(HAVE_READLINK) || defined(WIN32)
-			rllen = readlink(fullpath, linkpath, sizeof(linkpath));
-			if (rllen < 0)
-			{
-				ereport(WARNING,
-						(errmsg("could not read symbolic link \"%s\": %m",
-								fullpath)));
-				continue;
-			}
-			else if (rllen >= sizeof(linkpath))
-			{
-				ereport(WARNING,
-						(errmsg("symbolic link \"%s\" target is too long",
-								fullpath)));
-				continue;
-			}
-			linkpath[rllen] = '\0';
-
-			/*
-			 * Build a backslash-escaped version of the link path to include
-			 * in the tablespace map file.
-			 */
-			initStringInfo(&escapedpath);
-			for (s = linkpath; *s; s++)
-			{
-				if (*s == '\n' || *s == '\r' || *s == '\\')
-					appendStringInfoChar(&escapedpath, '\\');
-				appendStringInfoChar(&escapedpath, *s);
-			}
+			oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
 
 			/*
-			 * Relpath holds the relative path of the tablespace directory
-			 * when it's located within PGDATA, or NULL if it's located
-			 * elsewhere.
+			 * Construct a RunningTransactions snapshot representing a shut
+			 * down server, with only prepared transactions still alive. We're
+			 * never overflowed at this point because all subxids are listed
+			 * with their parent prepared transactions.
 			 */
-			if (rllen > datadirpathlen &&
-				strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
-				IS_DIR_SEP(linkpath[datadirpathlen]))
-				relpath = linkpath + datadirpathlen + 1;
-
-			ti = palloc(sizeof(tablespaceinfo));
-			ti->oid = pstrdup(de->d_name);
-			ti->path = pstrdup(linkpath);
-			ti->rpath = relpath ? pstrdup(relpath) : NULL;
-			ti->size = -1;
-
-			if (tablespaces)
-				*tablespaces = lappend(*tablespaces, ti);
-
-			appendStringInfo(tblspcmapfile, "%s %s\n",
-							 ti->oid, escapedpath.data);
+			running.xcnt = nxids;
+			running.subxcnt = 0;
+			running.subxid_overflow = false;
+			running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
+			running.oldestRunningXid = oldestActiveXID;
+			latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
+			TransactionIdRetreat(latestCompletedXid);
+			Assert(TransactionIdIsNormal(latestCompletedXid));
+			running.latestCompletedXid = latestCompletedXid;
+			running.xids = xids;
 
-			pfree(escapedpath.data);
-#else
+			ProcArrayApplyRecoveryInfo(&running);
 
-			/*
-			 * If the platform does not have symbolic links, it should not be
-			 * possible to have tablespaces - clearly somebody else created
-			 * them. Warn about it and ignore.
-			 */
-			ereport(WARNING,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("tablespaces are not supported on this platform")));
-#endif
+			StandbyRecoverPreparedTransactions();
 		}
-		FreeDir(tblspcdir);
 
-		/*
-		 * Construct backup label file.  If caller isn't interested in this,
-		 * we make a local StringInfo.
-		 */
-		if (labelfile == NULL)
-			labelfile = makeStringInfo();
+		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
+		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
+		LWLockRelease(ControlFileLock);
 
-		/* Use the log timezone here, not the session timezone */
-		stamp_time = (pg_time_t) time(NULL);
-		pg_strftime(strfbuf, sizeof(strfbuf),
-					"%Y-%m-%d %H:%M:%S %Z",
-					pg_localtime(&stamp_time, log_timezone));
-		appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n",
-						 LSN_FORMAT_ARGS(startpoint), xlogfilename);
-		appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n",
-						 LSN_FORMAT_ARGS(checkpointloc));
-		appendStringInfo(labelfile, "BACKUP METHOD: %s\n",
-						 exclusive ? "pg_start_backup" : "streamed");
-		appendStringInfo(labelfile, "BACKUP FROM: %s\n",
-						 backup_started_in_recovery ? "standby" : "primary");
-		appendStringInfo(labelfile, "START TIME: %s\n", strfbuf);
-		appendStringInfo(labelfile, "LABEL: %s\n", backupidstr);
-		appendStringInfo(labelfile, "START TIMELINE: %u\n", starttli);
+		/* Update shared-memory copy of checkpoint XID/epoch */
+		SpinLockAcquire(&XLogCtl->info_lck);
+		XLogCtl->ckptFullXid = checkPoint.nextXid;
+		SpinLockRelease(&XLogCtl->info_lck);
 
 		/*
-		 * Okay, write the file, or return its contents to caller.
+		 * We should've already switched to the new TLI before replaying this
+		 * record.
 		 */
-		if (exclusive)
-		{
-			/*
-			 * Check for existing backup label --- implies a backup is already
-			 * running.  (XXX given that we checked exclusiveBackupState
-			 * above, maybe it would be OK to just unlink any such label
-			 * file?)
-			 */
-			if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
-			{
-				if (errno != ENOENT)
-					ereport(ERROR,
-							(errcode_for_file_access(),
-							 errmsg("could not stat file \"%s\": %m",
-									BACKUP_LABEL_FILE)));
-			}
-			else
-				ereport(ERROR,
-						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-						 errmsg("a backup is already in progress"),
-						 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
-								 BACKUP_LABEL_FILE)));
+		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
+			ereport(PANIC,
+					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
+							checkPoint.ThisTimeLineID, ThisTimeLineID)));
 
-			fp = AllocateFile(BACKUP_LABEL_FILE, "w");
+		RecoveryRestartPoint(record, &checkPoint);
+	}
+	else if (info == XLOG_CHECKPOINT_ONLINE)
+	{
+		CheckPoint	checkPoint;
 
-			if (!fp)
-				ereport(ERROR,
-						(errcode_for_file_access(),
-						 errmsg("could not create file \"%s\": %m",
-								BACKUP_LABEL_FILE)));
-			if (fwrite(labelfile->data, labelfile->len, 1, fp) != 1 ||
-				fflush(fp) != 0 ||
-				pg_fsync(fileno(fp)) != 0 ||
-				ferror(fp) ||
-				FreeFile(fp))
-				ereport(ERROR,
-						(errcode_for_file_access(),
-						 errmsg("could not write file \"%s\": %m",
-								BACKUP_LABEL_FILE)));
-			/* Allocated locally for exclusive backups, so free separately */
-			pfree(labelfile->data);
-			pfree(labelfile);
+		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
+		/* In an ONLINE checkpoint, treat the XID counter as a minimum */
+		LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
+		if (FullTransactionIdPrecedes(ShmemVariableCache->nextXid,
+									  checkPoint.nextXid))
+			ShmemVariableCache->nextXid = checkPoint.nextXid;
+		LWLockRelease(XidGenLock);
 
-			/* Write backup tablespace_map file. */
-			if (tblspcmapfile->len > 0)
-			{
-				if (stat(TABLESPACE_MAP, &stat_buf) != 0)
-				{
-					if (errno != ENOENT)
-						ereport(ERROR,
-								(errcode_for_file_access(),
-								 errmsg("could not stat file \"%s\": %m",
-										TABLESPACE_MAP)));
-				}
-				else
-					ereport(ERROR,
-							(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-							 errmsg("a backup is already in progress"),
-							 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
-									 TABLESPACE_MAP)));
+		/*
+		 * We ignore the nextOid counter in an ONLINE checkpoint, preferring
+		 * to track OID assignment through XLOG_NEXTOID records.  The nextOid
+		 * counter is from the start of the checkpoint and might well be stale
+		 * compared to later XLOG_NEXTOID records.  We could try to take the
+		 * maximum of the nextOid counter and our latest value, but since
+		 * there's no particular guarantee about the speed with which the OID
+		 * counter wraps around, that's a risky thing to do.  In any case,
+		 * users of the nextOid counter are required to avoid assignment of
+		 * duplicates, so that a somewhat out-of-date value should be safe.
+		 */
 
-				fp = AllocateFile(TABLESPACE_MAP, "w");
+		/* Handle multixact */
+		MultiXactAdvanceNextMXact(checkPoint.nextMulti,
+								  checkPoint.nextMultiOffset);
 
-				if (!fp)
-					ereport(ERROR,
-							(errcode_for_file_access(),
-							 errmsg("could not create file \"%s\": %m",
-									TABLESPACE_MAP)));
-				if (fwrite(tblspcmapfile->data, tblspcmapfile->len, 1, fp) != 1 ||
-					fflush(fp) != 0 ||
-					pg_fsync(fileno(fp)) != 0 ||
-					ferror(fp) ||
-					FreeFile(fp))
-					ereport(ERROR,
-							(errcode_for_file_access(),
-							 errmsg("could not write file \"%s\": %m",
-									TABLESPACE_MAP)));
-			}
+		/*
+		 * NB: This may perform multixact truncation when replaying WAL
+		 * generated by an older primary.
+		 */
+		MultiXactAdvanceOldest(checkPoint.oldestMulti,
+							   checkPoint.oldestMultiDB);
+		if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
+								  checkPoint.oldestXid))
+			SetTransactionIdLimit(checkPoint.oldestXid,
+								  checkPoint.oldestXidDB);
+		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
+		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
+		LWLockRelease(ControlFileLock);
 
-			/* Allocated locally for exclusive backups, so free separately */
-			pfree(tblspcmapfile->data);
-			pfree(tblspcmapfile);
-		}
-	}
-	PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
+		/* Update shared-memory copy of checkpoint XID/epoch */
+		SpinLockAcquire(&XLogCtl->info_lck);
+		XLogCtl->ckptFullXid = checkPoint.nextXid;
+		SpinLockRelease(&XLogCtl->info_lck);
 
-	/*
-	 * Mark that start phase has correctly finished for an exclusive backup.
-	 * Session-level locks are updated as well to reflect that state.
-	 *
-	 * Note that CHECK_FOR_INTERRUPTS() must not occur while updating backup
-	 * counters and session-level lock. Otherwise they can be updated
-	 * inconsistently, and which might cause do_pg_abort_backup() to fail.
-	 */
-	if (exclusive)
-	{
-		WALInsertLockAcquireExclusive();
-		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
+		/* TLI should not change in an on-line checkpoint */
+		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
+			ereport(PANIC,
+					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
+							checkPoint.ThisTimeLineID, ThisTimeLineID)));
 
-		/* Set session-level lock */
-		sessionBackupState = SESSION_BACKUP_EXCLUSIVE;
-		WALInsertLockRelease();
+		RecoveryRestartPoint(record, &checkPoint);
 	}
-	else
-		sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE;
+	else if (info == XLOG_END_OF_RECOVERY)
+	{
+		xl_end_of_recovery xlrec;
 
-	/*
-	 * We're done.  As a convenience, return the starting WAL location.
-	 */
-	if (starttli_p)
-		*starttli_p = starttli;
-	return startpoint;
-}
+		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
 
-/* Error cleanup callback for pg_start_backup */
-static void
-pg_start_backup_callback(int code, Datum arg)
-{
-	bool		exclusive = DatumGetBool(arg);
+		/*
+		 * For Hot Standby, we could treat this like a Shutdown Checkpoint,
+		 * but this case is rarer and harder to test, so the benefit doesn't
+		 * outweigh the potential extra cost of maintenance.
+		 */
 
-	/* Update backup counters and forcePageWrites on failure */
-	WALInsertLockAcquireExclusive();
-	if (exclusive)
+		/*
+		 * We should've already switched to the new TLI before replaying this
+		 * record.
+		 */
+		if (xlrec.ThisTimeLineID != ThisTimeLineID)
+			ereport(PANIC,
+					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
+							xlrec.ThisTimeLineID, ThisTimeLineID)));
+	}
+	else if (info == XLOG_NOOP)
 	{
-		Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING);
-		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
+		/* nothing to do here */
 	}
-	else
+	else if (info == XLOG_SWITCH)
 	{
-		Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
-		XLogCtl->Insert.nonExclusiveBackups--;
+		/* nothing to do here */
 	}
-
-	if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
-		XLogCtl->Insert.nonExclusiveBackups == 0)
+	else if (info == XLOG_RESTORE_POINT)
 	{
-		XLogCtl->Insert.forcePageWrites = false;
+		/* nothing to do here */
 	}
-	WALInsertLockRelease();
-}
+	else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
+	{
+		/*
+		 * Full-page image (FPI) records contain nothing else but a backup
+		 * block (or multiple backup blocks). Every block reference must
+		 * include a full-page image - otherwise there would be no point in
+		 * this record.
+		 *
+		 * No recovery conflicts are generated by these generic records - if a
+		 * resource manager needs to generate conflicts, it has to define a
+		 * separate WAL record type and redo routine.
+		 *
+		 * XLOG_FPI_FOR_HINT records are generated when a page needs to be
+		 * WAL- logged because of a hint bit update. They are only generated
+		 * when checksums are enabled. There is no difference in handling
+		 * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
+		 * code just to distinguish them for statistics purposes.
+		 */
+		for (uint8 block_id = 0; block_id <= record->max_block_id; block_id++)
+		{
+			Buffer		buffer;
 
-/*
- * Error cleanup callback for pg_stop_backup
- */
-static void
-pg_stop_backup_callback(int code, Datum arg)
-{
-	bool		exclusive = DatumGetBool(arg);
+			if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
+				elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
+			UnlockReleaseBuffer(buffer);
+		}
+	}
+	else if (info == XLOG_BACKUP_END)
+	{
+		XLogRecPtr	startpoint;
 
-	/* Update backup status on failure */
-	WALInsertLockAcquireExclusive();
-	if (exclusive)
+		memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
+
+		HandleBackupEndRecord(startpoint, lsn, ThisTimeLineID);
+	}
+	else if (info == XLOG_PARAMETER_CHANGE)
 	{
-		Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING);
-		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
+		xl_parameter_change xlrec;
+
+		/* Update our copy of the parameters in pg_control */
+		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
+
+		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+		ControlFile->MaxConnections = xlrec.MaxConnections;
+		ControlFile->max_worker_processes = xlrec.max_worker_processes;
+		ControlFile->max_wal_senders = xlrec.max_wal_senders;
+		ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
+		ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
+		ControlFile->wal_level = xlrec.wal_level;
+		ControlFile->wal_log_hints = xlrec.wal_log_hints;
+
+		/*
+		 * Update minRecoveryPoint to ensure that if recovery is aborted, we
+		 * recover back up to this point before allowing hot standby again.
+		 * This is important if the max_* settings are decreased, to ensure
+		 * you don't run queries against the WAL preceding the change. The
+		 * local copies cannot be updated as long as crash recovery is
+		 * happening and we expect all the WAL to be replayed.
+		 */
+		if (InArchiveRecovery)
+		{
+			LocalMinRecoveryPoint = ControlFile->minRecoveryPoint;
+			LocalMinRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+		}
+		if (LocalMinRecoveryPoint != InvalidXLogRecPtr && LocalMinRecoveryPoint < lsn)
+		{
+			ControlFile->minRecoveryPoint = lsn;
+			ControlFile->minRecoveryPointTLI = ThisTimeLineID;
+		}
+
+		CommitTsParameterChange(xlrec.track_commit_timestamp,
+								ControlFile->track_commit_timestamp);
+		ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
+
+		UpdateControlFile();
+		LWLockRelease(ControlFileLock);
+
+		/* Check to see if any parameter change gives a problem on recovery */
+		CheckRequiredParameterValues();
 	}
-	WALInsertLockRelease();
-}
+	else if (info == XLOG_FPW_CHANGE)
+	{
+		bool		fpw;
 
-/*
- * Utility routine to fetch the session-level status of a backup running.
- */
-SessionBackupState
-get_backup_status(void)
-{
-	return sessionBackupState;
-}
+		memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
 
-/*
- * do_pg_stop_backup
- *
- * Utility function called at the end of an online backup. It cleans up the
- * backup state and can optionally wait for WAL segments to be archived.
- *
- * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
- * the non-exclusive backup specified by 'labelfile'.
- *
- * Returns the last WAL location that must be present to restore from this
- * backup, and the corresponding timeline ID in *stoptli_p.
- *
- * It is the responsibility of the caller of this function to verify the
- * permissions of the calling user!
- */
-XLogRecPtr
-do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
-{
-	bool		exclusive = (labelfile == NULL);
-	bool		backup_started_in_recovery = false;
-	XLogRecPtr	startpoint;
-	XLogRecPtr	stoppoint;
-	TimeLineID	stoptli;
-	pg_time_t	stamp_time;
-	char		strfbuf[128];
-	char		histfilepath[MAXPGPATH];
-	char		startxlogfilename[MAXFNAMELEN];
-	char		stopxlogfilename[MAXFNAMELEN];
-	char		lastxlogfilename[MAXFNAMELEN];
-	char		histfilename[MAXFNAMELEN];
-	char		backupfrom[20];
-	XLogSegNo	_logSegNo;
-	FILE	   *lfp;
-	FILE	   *fp;
-	char		ch;
-	int			seconds_before_warning;
-	int			waits = 0;
-	bool		reported_waiting = false;
-	char	   *remaining;
-	char	   *ptr;
-	uint32		hi,
-				lo;
+		/*
+		 * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
+		 * do_pg_start_backup() and do_pg_stop_backup() can check whether
+		 * full_page_writes has been disabled during online backup.
+		 */
+		if (!fpw)
+		{
+			SpinLockAcquire(&XLogCtl->info_lck);
+			if (XLogCtl->lastFpwDisableRecPtr < record->ReadRecPtr)
+				XLogCtl->lastFpwDisableRecPtr = record->ReadRecPtr;
+			SpinLockRelease(&XLogCtl->info_lck);
+		}
 
-	backup_started_in_recovery = RecoveryInProgress();
+		/* Keep track of full_page_writes */
+		lastFullPageWrites = fpw;
+	}
+}
 
-	/*
-	 * Currently only non-exclusive backup can be taken during recovery.
-	 */
-	if (backup_started_in_recovery && exclusive)
-		ereport(ERROR,
-				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				 errmsg("recovery is in progress"),
-				 errhint("WAL control functions cannot be executed during recovery.")));
+/*
+ * Return the (possible) sync flag used for opening a file, depending on the
+ * value of the GUC wal_sync_method.
+ */
+static int
+get_sync_bit(int method)
+{
+	int			o_direct_flag = 0;
+
+	/* If fsync is disabled, never open in sync mode */
+	if (!enableFsync)
+		return 0;
 
 	/*
-	 * During recovery, we don't need to check WAL level. Because, if WAL
-	 * level is not sufficient, it's impossible to get here during recovery.
+	 * Optimize writes by bypassing kernel cache with O_DIRECT when using
+	 * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
+	 * disabled, otherwise the archive command or walsender process will read
+	 * the WAL soon after writing it, which is guaranteed to cause a physical
+	 * read if we bypassed the kernel cache. We also skip the
+	 * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
+	 * reason.
+	 *
+	 * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
+	 * written by walreceiver is normally read by the startup process soon
+	 * after it's written. Also, walreceiver performs unaligned writes, which
+	 * don't work with O_DIRECT, so it is required for correctness too.
 	 */
-	if (!backup_started_in_recovery && !XLogIsNeeded())
-		ereport(ERROR,
-				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				 errmsg("WAL level not sufficient for making an online backup"),
-				 errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
+	if (!XLogIsNeeded() && !AmWalReceiverProcess())
+		o_direct_flag = PG_O_DIRECT;
 
-	if (exclusive)
+	switch (method)
 	{
-		/*
-		 * At first, mark that we're now stopping an exclusive backup, to
-		 * ensure that there are no other sessions currently running
-		 * pg_start_backup() or pg_stop_backup().
-		 */
-		WALInsertLockAcquireExclusive();
-		if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS)
-		{
-			WALInsertLockRelease();
-			ereport(ERROR,
-					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-					 errmsg("exclusive backup not in progress")));
-		}
-		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING;
-		WALInsertLockRelease();
+			/*
+			 * enum values for all sync options are defined even if they are
+			 * not supported on the current platform.  But if not, they are
+			 * not included in the enum option array, and therefore will never
+			 * be seen here.
+			 */
+		case SYNC_METHOD_FSYNC:
+		case SYNC_METHOD_FSYNC_WRITETHROUGH:
+		case SYNC_METHOD_FDATASYNC:
+			return 0;
+#ifdef OPEN_SYNC_FLAG
+		case SYNC_METHOD_OPEN:
+			return OPEN_SYNC_FLAG | o_direct_flag;
+#endif
+#ifdef OPEN_DATASYNC_FLAG
+		case SYNC_METHOD_OPEN_DSYNC:
+			return OPEN_DATASYNC_FLAG | o_direct_flag;
+#endif
+		default:
+			/* can't happen (unless we are out of sync with option array) */
+			elog(ERROR, "unrecognized wal_sync_method: %d", method);
+			return 0;			/* silence warning */
+	}
+}
 
+/*
+ * GUC support
+ */
+void
+assign_xlog_sync_method(int new_sync_method, void *extra)
+{
+	if (sync_method != new_sync_method)
+	{
 		/*
-		 * Remove backup_label. In case of failure, the state for an exclusive
-		 * backup is switched back to in-progress.
+		 * To ensure that no blocks escape unsynced, force an fsync on the
+		 * currently open log segment (if any).  Also, if the open flag is
+		 * changing, close the log file so it will be reopened (with new flag
+		 * bit) at next use.
 		 */
-		PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
+		if (openLogFile >= 0)
 		{
-			/*
-			 * Read the existing label file into memory.
-			 */
-			struct stat statbuf;
-			int			r;
-
-			if (stat(BACKUP_LABEL_FILE, &statbuf))
+			pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
+			if (pg_fsync(openLogFile) != 0)
 			{
-				/* should not happen per the upper checks */
-				if (errno != ENOENT)
-					ereport(ERROR,
-							(errcode_for_file_access(),
-							 errmsg("could not stat file \"%s\": %m",
-									BACKUP_LABEL_FILE)));
-				ereport(ERROR,
-						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-						 errmsg("a backup is not in progress")));
-			}
+				char		xlogfname[MAXFNAMELEN];
+				int			save_errno;
 
-			lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
-			if (!lfp)
-			{
-				ereport(ERROR,
+				save_errno = errno;
+				XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo,
+							 wal_segment_size);
+				errno = save_errno;
+				ereport(PANIC,
 						(errcode_for_file_access(),
-						 errmsg("could not read file \"%s\": %m",
-								BACKUP_LABEL_FILE)));
+						 errmsg("could not fsync file \"%s\": %m", xlogfname)));
 			}
-			labelfile = palloc(statbuf.st_size + 1);
-			r = fread(labelfile, statbuf.st_size, 1, lfp);
-			labelfile[statbuf.st_size] = '\0';
 
-			/*
-			 * Close and remove the backup label file
-			 */
-			if (r != 1 || ferror(lfp) || FreeFile(lfp))
-				ereport(ERROR,
-						(errcode_for_file_access(),
-						 errmsg("could not read file \"%s\": %m",
-								BACKUP_LABEL_FILE)));
-			durable_unlink(BACKUP_LABEL_FILE, ERROR);
-
-			/*
-			 * Remove tablespace_map file if present, it is created only if
-			 * there are tablespaces.
-			 */
-			durable_unlink(TABLESPACE_MAP, DEBUG1);
+			pgstat_report_wait_end();
+			if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
+				XLogFileClose();
 		}
-		PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
 	}
+}
+
+
+/*
+ * Issue appropriate kind of fsync (if any) for an XLOG output file.
+ *
+ * 'fd' is a file descriptor for the XLOG file to be fsync'd.
+ * 'segno' is for error reporting purposes.
+ */
+void
+issue_xlog_fsync(int fd, XLogSegNo segno)
+{
+	char	   *msg = NULL;
+	instr_time	start;
 
 	/*
-	 * OK to update backup counters, forcePageWrites and session-level lock.
-	 *
-	 * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them.
-	 * Otherwise they can be updated inconsistently, and which might cause
-	 * do_pg_abort_backup() to fail.
+	 * Quick exit if fsync is disabled or write() has already synced the WAL
+	 * file.
 	 */
-	WALInsertLockAcquireExclusive();
-	if (exclusive)
+	if (!enableFsync ||
+		sync_method == SYNC_METHOD_OPEN ||
+		sync_method == SYNC_METHOD_OPEN_DSYNC)
+		return;
+
+	/* Measure I/O timing to sync the WAL file */
+	if (track_wal_io_timing)
+		INSTR_TIME_SET_CURRENT(start);
+
+	pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
+	switch (sync_method)
 	{
-		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
+		case SYNC_METHOD_FSYNC:
+			if (pg_fsync_no_writethrough(fd) != 0)
+				msg = _("could not fsync file \"%s\": %m");
+			break;
+#ifdef HAVE_FSYNC_WRITETHROUGH
+		case SYNC_METHOD_FSYNC_WRITETHROUGH:
+			if (pg_fsync_writethrough(fd) != 0)
+				msg = _("could not fsync write-through file \"%s\": %m");
+			break;
+#endif
+#ifdef HAVE_FDATASYNC
+		case SYNC_METHOD_FDATASYNC:
+			if (pg_fdatasync(fd) != 0)
+				msg = _("could not fdatasync file \"%s\": %m");
+			break;
+#endif
+		case SYNC_METHOD_OPEN:
+		case SYNC_METHOD_OPEN_DSYNC:
+			/* not reachable */
+			Assert(false);
+			break;
+		default:
+			elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
+			break;
 	}
-	else
+
+	/* PANIC if failed to fsync */
+	if (msg)
 	{
-		/*
-		 * The user-visible pg_start/stop_backup() functions that operate on
-		 * exclusive backups can be called at any time, but for non-exclusive
-		 * backups, it is expected that each do_pg_start_backup() call is
-		 * matched by exactly one do_pg_stop_backup() call.
-		 */
-		Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
-		XLogCtl->Insert.nonExclusiveBackups--;
+		char		xlogfname[MAXFNAMELEN];
+		int			save_errno = errno;
+
+		XLogFileName(xlogfname, ThisTimeLineID, segno,
+					 wal_segment_size);
+		errno = save_errno;
+		ereport(PANIC,
+				(errcode_for_file_access(),
+				 errmsg(msg, xlogfname)));
 	}
 
-	if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
-		XLogCtl->Insert.nonExclusiveBackups == 0)
+	pgstat_report_wait_end();
+
+	/*
+	 * Increment the I/O timing and the number of times WAL files were synced.
+	 */
+	if (track_wal_io_timing)
 	{
-		XLogCtl->Insert.forcePageWrites = false;
+		instr_time	duration;
+
+		INSTR_TIME_SET_CURRENT(duration);
+		INSTR_TIME_SUBTRACT(duration, start);
+		WalStats.m_wal_sync_time += INSTR_TIME_GET_MICROSEC(duration);
 	}
 
-	/*
-	 * Clean up session-level lock.
-	 *
-	 * You might think that WALInsertLockRelease() can be called before
-	 * cleaning up session-level lock because session-level lock doesn't need
-	 * to be protected with WAL insertion lock. But since
-	 * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
-	 * cleaned up before it.
-	 */
-	sessionBackupState = SESSION_BACKUP_NONE;
+	WalStats.m_wal_sync++;
+}
 
-	WALInsertLockRelease();
+/*
+ * do_pg_start_backup
+ *
+ * Utility function called at the start of an online backup. It creates the
+ * necessary starting checkpoint and constructs the backup label file.
+ *
+ * There are two kind of backups: exclusive and non-exclusive. An exclusive
+ * backup is started with pg_start_backup(), and there can be only one active
+ * at a time. The backup and tablespace map files of an exclusive backup are
+ * written to $PGDATA/backup_label and $PGDATA/tablespace_map, and they are
+ * removed by pg_stop_backup().
+ *
+ * A non-exclusive backup is used for the streaming base backups (see
+ * src/backend/replication/basebackup.c). The difference to exclusive backups
+ * is that the backup label and tablespace map files are not written to disk.
+ * Instead, their would-be contents are returned in *labelfile and *tblspcmapfile,
+ * and the caller is responsible for including them in the backup archive as
+ * 'backup_label' and 'tablespace_map'. There can be many non-exclusive backups
+ * active at the same time, and they don't conflict with an exclusive backup
+ * either.
+ *
+ * labelfile and tblspcmapfile must be passed as NULL when starting an
+ * exclusive backup, and as initially-empty StringInfos for a non-exclusive
+ * backup.
+ *
+ * If "tablespaces" isn't NULL, it receives a list of tablespaceinfo structs
+ * describing the cluster's tablespaces.
+ *
+ * tblspcmapfile is required mainly for tar format in windows as native windows
+ * utilities are not able to create symlinks while extracting files from tar.
+ * However for consistency, the same is used for all platforms.
+ *
+ * Returns the minimum WAL location that must be present to restore from this
+ * backup, and the corresponding timeline ID in *starttli_p.
+ *
+ * Every successfully started non-exclusive backup must be stopped by calling
+ * do_pg_stop_backup() or do_pg_abort_backup().
+ *
+ * It is the responsibility of the caller of this function to verify the
+ * permissions of the calling user!
+ */
+XLogRecPtr
+do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
+				   StringInfo labelfile, List **tablespaces,
+				   StringInfo tblspcmapfile)
+{
+	bool		exclusive = (labelfile == NULL);
+	bool		backup_started_in_recovery = false;
+	XLogRecPtr	checkpointloc;
+	XLogRecPtr	startpoint;
+	TimeLineID	starttli;
+	pg_time_t	stamp_time;
+	char		strfbuf[128];
+	char		xlogfilename[MAXFNAMELEN];
+	XLogSegNo	_logSegNo;
+	struct stat stat_buf;
+	FILE	   *fp;
+
+	backup_started_in_recovery = RecoveryInProgress();
 
 	/*
-	 * Read and parse the START WAL LOCATION line (this code is pretty crude,
-	 * but we are not expecting any variability in the file format).
+	 * Currently only non-exclusive backup can be taken during recovery.
 	 */
-	if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
-			   &hi, &lo, startxlogfilename,
-			   &ch) != 4 || ch != '\n')
+	if (backup_started_in_recovery && exclusive)
 		ereport(ERROR,
 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
-	startpoint = ((uint64) hi) << 32 | lo;
-	remaining = strchr(labelfile, '\n') + 1;	/* %n is not portable enough */
+				 errmsg("recovery is in progress"),
+				 errhint("WAL control functions cannot be executed during recovery.")));
 
 	/*
-	 * Parse the BACKUP FROM line. If we are taking an online backup from the
-	 * standby, we confirm that the standby has not been promoted during the
-	 * backup.
+	 * During recovery, we don't need to check WAL level. Because, if WAL
+	 * level is not sufficient, it's impossible to get here during recovery.
 	 */
-	ptr = strstr(remaining, "BACKUP FROM:");
-	if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
+	if (!backup_started_in_recovery && !XLogIsNeeded())
 		ereport(ERROR,
 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
-	if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
+				 errmsg("WAL level not sufficient for making an online backup"),
+				 errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
+
+	if (strlen(backupidstr) > MAXPGPATH)
 		ereport(ERROR,
-				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				 errmsg("the standby was promoted during online backup"),
-				 errhint("This means that the backup being taken is corrupt "
-						 "and should not be used. "
-						 "Try taking another online backup.")));
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("backup label too long (max %d bytes)",
+						MAXPGPATH)));
 
 	/*
-	 * During recovery, we don't write an end-of-backup record. We assume that
-	 * pg_control was backed up last and its minimum recovery point can be
-	 * available as the backup end location. Since we don't have an
-	 * end-of-backup record, we use the pg_control value to check whether
-	 * we've reached the end of backup when starting recovery from this
-	 * backup. We have no way of checking if pg_control wasn't backed up last
-	 * however.
-	 *
-	 * We don't force a switch to new WAL file but it is still possible to
-	 * wait for all the required files to be archived if waitforarchive is
-	 * true. This is okay if we use the backup to start a standby and fetch
-	 * the missing WAL using streaming replication. But in the case of an
-	 * archive recovery, a user should set waitforarchive to true and wait for
-	 * them to be archived to ensure that all the required files are
-	 * available.
+	 * Mark backup active in shared memory.  We must do full-page WAL writes
+	 * during an on-line backup even if not doing so at other times, because
+	 * it's quite possible for the backup dump to obtain a "torn" (partially
+	 * written) copy of a database page if it reads the page concurrently with
+	 * our write to the same page.  This can be fixed as long as the first
+	 * write to the page in the WAL sequence is a full-page write. Hence, we
+	 * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
+	 * are no dirty pages in shared memory that might get dumped while the
+	 * backup is in progress without having a corresponding WAL record.  (Once
+	 * the backup is complete, we need not force full-page writes anymore,
+	 * since we expect that any pages not modified during the backup interval
+	 * must have been correctly captured by the backup.)
 	 *
-	 * We return the current minimum recovery point as the backup end
-	 * location. Note that it can be greater than the exact backup end
-	 * location if the minimum recovery point is updated after the backup of
-	 * pg_control. This is harmless for current uses.
+	 * Note that forcePageWrites has no effect during an online backup from
+	 * the standby.
 	 *
-	 * XXX currently a backup history file is for informational and debug
-	 * purposes only. It's not essential for an online backup. Furthermore,
-	 * even if it's created, it will not be archived during recovery because
-	 * an archiver is not invoked. So it doesn't seem worthwhile to write a
-	 * backup history file during recovery.
+	 * We must hold all the insertion locks to change the value of
+	 * forcePageWrites, to ensure adequate interlocking against
+	 * XLogInsertRecord().
 	 */
-	if (backup_started_in_recovery)
+	WALInsertLockAcquireExclusive();
+	if (exclusive)
 	{
-		XLogRecPtr	recptr;
-
 		/*
-		 * Check to see if all WAL replayed during online backup contain
-		 * full-page writes.
+		 * At first, mark that we're now starting an exclusive backup, to
+		 * ensure that there are no other sessions currently running
+		 * pg_start_backup() or pg_stop_backup().
 		 */
-		SpinLockAcquire(&XLogCtl->info_lck);
-		recptr = XLogCtl->lastFpwDisableRecPtr;
-		SpinLockRelease(&XLogCtl->info_lck);
-
-		if (startpoint <= recptr)
+		if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
+		{
+			WALInsertLockRelease();
 			ereport(ERROR,
 					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-					 errmsg("WAL generated with full_page_writes=off was replayed "
-							"during online backup"),
-					 errhint("This means that the backup being taken on the standby "
-							 "is corrupt and should not be used. "
-							 "Enable full_page_writes and run CHECKPOINT on the primary, "
-							 "and then try an online backup again.")));
-
-
-		LWLockAcquire(ControlFileLock, LW_SHARED);
-		stoppoint = ControlFile->minRecoveryPoint;
-		stoptli = ControlFile->minRecoveryPointTLI;
-		LWLockRelease(ControlFileLock);
+					 errmsg("a backup is already in progress"),
+					 errhint("Run pg_stop_backup() and try again.")));
+		}
+		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING;
 	}
 	else
+		XLogCtl->Insert.nonExclusiveBackups++;
+	XLogCtl->Insert.forcePageWrites = true;
+	WALInsertLockRelease();
+
+	/* Ensure we release forcePageWrites if fail below */
+	PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
 	{
+		bool		gotUniqueStartpoint = false;
+		DIR		   *tblspcdir;
+		struct dirent *de;
+		tablespaceinfo *ti;
+		int			datadirpathlen;
+
 		/*
-		 * Write the backup-end xlog record
+		 * Force an XLOG file switch before the checkpoint, to ensure that the
+		 * WAL segment the checkpoint is written to doesn't contain pages with
+		 * old timeline IDs.  That would otherwise happen if you called
+		 * pg_start_backup() right after restoring from a PITR archive: the
+		 * first WAL segment containing the startup checkpoint has pages in
+		 * the beginning with the old timeline ID.  That can cause trouble at
+		 * recovery: we won't have a history file covering the old timeline if
+		 * pg_wal directory was not included in the base backup and the WAL
+		 * archive was cleared too before starting the backup.
+		 *
+		 * This also ensures that we have emitted a WAL page header that has
+		 * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
+		 * Therefore, if a WAL archiver (such as pglesslog) is trying to
+		 * compress out removable backup blocks, it won't remove any that
+		 * occur after this point.
+		 *
+		 * During recovery, we skip forcing XLOG file switch, which means that
+		 * the backup taken during recovery is not available for the special
+		 * recovery case described above.
 		 */
-		XLogBeginInsert();
-		XLogRegisterData((char *) (&startpoint), sizeof(startpoint));
-		stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
-		stoptli = ThisTimeLineID;
+		if (!backup_started_in_recovery)
+			RequestXLogSwitch(false);
+
+		do
+		{
+			bool		checkpointfpw;
+
+			/*
+			 * Force a CHECKPOINT.  Aside from being necessary to prevent torn
+			 * page problems, this guarantees that two successive backup runs
+			 * will have different checkpoint positions and hence different
+			 * history file names, even if nothing happened in between.
+			 *
+			 * During recovery, establish a restartpoint if possible. We use
+			 * the last restartpoint as the backup starting checkpoint. This
+			 * means that two successive backup runs can have same checkpoint
+			 * positions.
+			 *
+			 * Since the fact that we are executing do_pg_start_backup()
+			 * during recovery means that checkpointer is running, we can use
+			 * RequestCheckpoint() to establish a restartpoint.
+			 *
+			 * We use CHECKPOINT_IMMEDIATE only if requested by user (via
+			 * passing fast = true).  Otherwise this can take awhile.
+			 */
+			RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
+							  (fast ? CHECKPOINT_IMMEDIATE : 0));
+
+			/*
+			 * Now we need to fetch the checkpoint record location, and also
+			 * its REDO pointer.  The oldest point in WAL that would be needed
+			 * to restore starting from the checkpoint is precisely the REDO
+			 * pointer.
+			 */
+			LWLockAcquire(ControlFileLock, LW_SHARED);
+			checkpointloc = ControlFile->checkPoint;
+			startpoint = ControlFile->checkPointCopy.redo;
+			starttli = ControlFile->checkPointCopy.ThisTimeLineID;
+			checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
+			LWLockRelease(ControlFileLock);
 
-		/*
-		 * Force a switch to a new xlog segment file, so that the backup is
-		 * valid as soon as archiver moves out the current segment file.
-		 */
-		RequestXLogSwitch(false);
+			if (backup_started_in_recovery)
+			{
+				XLogRecPtr	recptr;
 
-		XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
-		XLogFileName(stopxlogfilename, stoptli, _logSegNo, wal_segment_size);
+				/*
+				 * Check to see if all WAL replayed during online backup
+				 * (i.e., since last restartpoint used as backup starting
+				 * checkpoint) contain full-page writes.
+				 */
+				SpinLockAcquire(&XLogCtl->info_lck);
+				recptr = XLogCtl->lastFpwDisableRecPtr;
+				SpinLockRelease(&XLogCtl->info_lck);
 
-		/* Use the log timezone here, not the session timezone */
-		stamp_time = (pg_time_t) time(NULL);
-		pg_strftime(strfbuf, sizeof(strfbuf),
-					"%Y-%m-%d %H:%M:%S %Z",
-					pg_localtime(&stamp_time, log_timezone));
+				if (!checkpointfpw || startpoint <= recptr)
+					ereport(ERROR,
+							(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+							 errmsg("WAL generated with full_page_writes=off was replayed "
+									"since last restartpoint"),
+							 errhint("This means that the backup being taken on the standby "
+									 "is corrupt and should not be used. "
+									 "Enable full_page_writes and run CHECKPOINT on the primary, "
+									 "and then try an online backup again.")));
 
-		/*
-		 * Write the backup history file
-		 */
-		XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
-		BackupHistoryFilePath(histfilepath, stoptli, _logSegNo,
-							  startpoint, wal_segment_size);
-		fp = AllocateFile(histfilepath, "w");
-		if (!fp)
-			ereport(ERROR,
-					(errcode_for_file_access(),
-					 errmsg("could not create file \"%s\": %m",
-							histfilepath)));
-		fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
-				LSN_FORMAT_ARGS(startpoint), startxlogfilename);
-		fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
-				LSN_FORMAT_ARGS(stoppoint), stopxlogfilename);
+				/*
+				 * During recovery, since we don't use the end-of-backup WAL
+				 * record and don't write the backup history file, the
+				 * starting WAL location doesn't need to be unique. This means
+				 * that two base backups started at the same time might use
+				 * the same checkpoint as starting locations.
+				 */
+				gotUniqueStartpoint = true;
+			}
 
-		/*
-		 * Transfer remaining lines including label and start timeline to
-		 * history file.
-		 */
-		fprintf(fp, "%s", remaining);
-		fprintf(fp, "STOP TIME: %s\n", strfbuf);
-		fprintf(fp, "STOP TIMELINE: %u\n", stoptli);
-		if (fflush(fp) || ferror(fp) || FreeFile(fp))
-			ereport(ERROR,
-					(errcode_for_file_access(),
-					 errmsg("could not write file \"%s\": %m",
-							histfilepath)));
+			/*
+			 * If two base backups are started at the same time (in WAL sender
+			 * processes), we need to make sure that they use different
+			 * checkpoints as starting locations, because we use the starting
+			 * WAL location as a unique identifier for the base backup in the
+			 * end-of-backup WAL record and when we write the backup history
+			 * file. Perhaps it would be better generate a separate unique ID
+			 * for each backup instead of forcing another checkpoint, but
+			 * taking a checkpoint right after another is not that expensive
+			 * either because only few buffers have been dirtied yet.
+			 */
+			WALInsertLockAcquireExclusive();
+			if (XLogCtl->Insert.lastBackupStart < startpoint)
+			{
+				XLogCtl->Insert.lastBackupStart = startpoint;
+				gotUniqueStartpoint = true;
+			}
+			WALInsertLockRelease();
+		} while (!gotUniqueStartpoint);
+
+		XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
+		XLogFileName(xlogfilename, starttli, _logSegNo, wal_segment_size);
 
 		/*
-		 * Clean out any no-longer-needed history files.  As a side effect,
-		 * this will post a .ready file for the newly created history file,
-		 * notifying the archiver that history file may be archived
-		 * immediately.
+		 * Construct tablespace_map file.  If caller isn't interested in this,
+		 * we make a local StringInfo.
 		 */
-		CleanupBackupHistory();
-	}
-
-	/*
-	 * If archiving is enabled, wait for all the required WAL files to be
-	 * archived before returning. If archiving isn't enabled, the required WAL
-	 * needs to be transported via streaming replication (hopefully with
-	 * wal_keep_size set high enough), or some more exotic mechanism like
-	 * polling and copying files from pg_wal with script. We have no knowledge
-	 * of those mechanisms, so it's up to the user to ensure that he gets all
-	 * the required WAL.
-	 *
-	 * We wait until both the last WAL file filled during backup and the
-	 * history file have been archived, and assume that the alphabetic sorting
-	 * property of the WAL files ensures any earlier WAL files are safely
-	 * archived as well.
-	 *
-	 * We wait forever, since archive_command is supposed to work and we
-	 * assume the admin wanted his backup to work completely. If you don't
-	 * wish to wait, then either waitforarchive should be passed in as false,
-	 * or you can set statement_timeout.  Also, some notices are issued to
-	 * clue in anyone who might be doing this interactively.
-	 */
+		if (tblspcmapfile == NULL)
+			tblspcmapfile = makeStringInfo();
 
-	if (waitforarchive &&
-		((!backup_started_in_recovery && XLogArchivingActive()) ||
-		 (backup_started_in_recovery && XLogArchivingAlways())))
-	{
-		XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
-		XLogFileName(lastxlogfilename, stoptli, _logSegNo, wal_segment_size);
+		datadirpathlen = strlen(DataDir);
 
-		XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
-		BackupHistoryFileName(histfilename, stoptli, _logSegNo,
-							  startpoint, wal_segment_size);
+		/* Collect information about all tablespaces */
+		tblspcdir = AllocateDir("pg_tblspc");
+		while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
+		{
+			char		fullpath[MAXPGPATH + 10];
+			char		linkpath[MAXPGPATH];
+			char	   *relpath = NULL;
+			int			rllen;
+			StringInfoData escapedpath;
+			char	   *s;
 
-		seconds_before_warning = 60;
-		waits = 0;
+			/* Skip anything that doesn't look like a tablespace */
+			if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
+				continue;
 
-		while (XLogArchiveIsBusy(lastxlogfilename) ||
-			   XLogArchiveIsBusy(histfilename))
-		{
-			CHECK_FOR_INTERRUPTS();
+			snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name);
 
-			if (!reported_waiting && waits > 5)
+#if defined(HAVE_READLINK) || defined(WIN32)
+			rllen = readlink(fullpath, linkpath, sizeof(linkpath));
+			if (rllen < 0)
 			{
-				ereport(NOTICE,
-						(errmsg("base backup done, waiting for required WAL segments to be archived")));
-				reported_waiting = true;
+				ereport(WARNING,
+						(errmsg("could not read symbolic link \"%s\": %m",
+								fullpath)));
+				continue;
 			}
-
-			pgstat_report_wait_start(WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE);
-			pg_usleep(1000000L);
-			pgstat_report_wait_end();
-
-			if (++waits >= seconds_before_warning)
+			else if (rllen >= sizeof(linkpath))
 			{
-				seconds_before_warning *= 2;	/* This wraps in >10 years... */
 				ereport(WARNING,
-						(errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)",
-								waits),
-						 errhint("Check that your archive_command is executing properly.  "
-								 "You can safely cancel this backup, "
-								 "but the database backup will not be usable without all the WAL segments.")));
+						(errmsg("symbolic link \"%s\" target is too long",
+								fullpath)));
+				continue;
 			}
-		}
-
-		ereport(NOTICE,
-				(errmsg("all required WAL segments have been archived")));
-	}
-	else if (waitforarchive)
-		ereport(NOTICE,
-				(errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
-
-	/*
-	 * We're done.  As a convenience, return the ending WAL location.
-	 */
-	if (stoptli_p)
-		*stoptli_p = stoptli;
-	return stoppoint;
-}
-
-
-/*
- * do_pg_abort_backup: abort a running backup
- *
- * This does just the most basic steps of do_pg_stop_backup(), by taking the
- * system out of backup mode, thus making it a lot more safe to call from
- * an error handler.
- *
- * The caller can pass 'arg' as 'true' or 'false' to control whether a warning
- * is emitted.
- *
- * NB: This is only for aborting a non-exclusive backup that doesn't write
- * backup_label. A backup started with pg_start_backup() needs to be finished
- * with pg_stop_backup().
- *
- * NB: This gets used as a before_shmem_exit handler, hence the odd-looking
- * signature.
- */
-void
-do_pg_abort_backup(int code, Datum arg)
-{
-	bool		emit_warning = DatumGetBool(arg);
-
-	/*
-	 * Quick exit if session is not keeping around a non-exclusive backup
-	 * already started.
-	 */
-	if (sessionBackupState != SESSION_BACKUP_NON_EXCLUSIVE)
-		return;
-
-	WALInsertLockAcquireExclusive();
-	Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
-	XLogCtl->Insert.nonExclusiveBackups--;
+			linkpath[rllen] = '\0';
 
-	if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
-		XLogCtl->Insert.nonExclusiveBackups == 0)
-	{
-		XLogCtl->Insert.forcePageWrites = false;
-	}
-	WALInsertLockRelease();
+			/*
+			 * Build a backslash-escaped version of the link path to include
+			 * in the tablespace map file.
+			 */
+			initStringInfo(&escapedpath);
+			for (s = linkpath; *s; s++)
+			{
+				if (*s == '\n' || *s == '\r' || *s == '\\')
+					appendStringInfoChar(&escapedpath, '\\');
+				appendStringInfoChar(&escapedpath, *s);
+			}
 
-	if (emit_warning)
-		ereport(WARNING,
-				(errmsg("aborting backup due to backend exiting before pg_stop_backup was called")));
-}
+			/*
+			 * Relpath holds the relative path of the tablespace directory
+			 * when it's located within PGDATA, or NULL if it's located
+			 * elsewhere.
+			 */
+			if (rllen > datadirpathlen &&
+				strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
+				IS_DIR_SEP(linkpath[datadirpathlen]))
+				relpath = linkpath + datadirpathlen + 1;
 
-/*
- * Register a handler that will warn about unterminated backups at end of
- * session, unless this has already been done.
- */
-void
-register_persistent_abort_backup_handler(void)
-{
-	static bool already_done = false;
+			ti = palloc(sizeof(tablespaceinfo));
+			ti->oid = pstrdup(de->d_name);
+			ti->path = pstrdup(linkpath);
+			ti->rpath = relpath ? pstrdup(relpath) : NULL;
+			ti->size = -1;
 
-	if (already_done)
-		return;
-	before_shmem_exit(do_pg_abort_backup, DatumGetBool(true));
-	already_done = true;
-}
+			if (tablespaces)
+				*tablespaces = lappend(*tablespaces, ti);
 
-/*
- * Get latest redo apply position.
- *
- * Exported to allow WALReceiver to read the pointer directly.
- */
-XLogRecPtr
-GetXLogReplayRecPtr(TimeLineID *replayTLI)
-{
-	XLogRecPtr	recptr;
-	TimeLineID	tli;
+			appendStringInfo(tblspcmapfile, "%s %s\n",
+							 ti->oid, escapedpath.data);
 
-	SpinLockAcquire(&XLogCtl->info_lck);
-	recptr = XLogCtl->lastReplayedEndRecPtr;
-	tli = XLogCtl->lastReplayedTLI;
-	SpinLockRelease(&XLogCtl->info_lck);
+			pfree(escapedpath.data);
+#else
 
-	if (replayTLI)
-		*replayTLI = tli;
-	return recptr;
-}
+			/*
+			 * If the platform does not have symbolic links, it should not be
+			 * possible to have tablespaces - clearly somebody else created
+			 * them. Warn about it and ignore.
+			 */
+			ereport(WARNING,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("tablespaces are not supported on this platform")));
+#endif
+		}
+		FreeDir(tblspcdir);
 
-/*
- * Get latest WAL insert pointer
- */
-XLogRecPtr
-GetXLogInsertRecPtr(void)
-{
-	XLogCtlInsert *Insert = &XLogCtl->Insert;
-	uint64		current_bytepos;
+		/*
+		 * Construct backup label file.  If caller isn't interested in this,
+		 * we make a local StringInfo.
+		 */
+		if (labelfile == NULL)
+			labelfile = makeStringInfo();
 
-	SpinLockAcquire(&Insert->insertpos_lck);
-	current_bytepos = Insert->CurrBytePos;
-	SpinLockRelease(&Insert->insertpos_lck);
+		/* Use the log timezone here, not the session timezone */
+		stamp_time = (pg_time_t) time(NULL);
+		pg_strftime(strfbuf, sizeof(strfbuf),
+					"%Y-%m-%d %H:%M:%S %Z",
+					pg_localtime(&stamp_time, log_timezone));
+		appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n",
+						 LSN_FORMAT_ARGS(startpoint), xlogfilename);
+		appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n",
+						 LSN_FORMAT_ARGS(checkpointloc));
+		appendStringInfo(labelfile, "BACKUP METHOD: %s\n",
+						 exclusive ? "pg_start_backup" : "streamed");
+		appendStringInfo(labelfile, "BACKUP FROM: %s\n",
+						 backup_started_in_recovery ? "standby" : "primary");
+		appendStringInfo(labelfile, "START TIME: %s\n", strfbuf);
+		appendStringInfo(labelfile, "LABEL: %s\n", backupidstr);
+		appendStringInfo(labelfile, "START TIMELINE: %u\n", starttli);
 
-	return XLogBytePosToRecPtr(current_bytepos);
-}
+		/*
+		 * Okay, write the file, or return its contents to caller.
+		 */
+		if (exclusive)
+		{
+			/*
+			 * Check for existing backup label --- implies a backup is already
+			 * running.  (XXX given that we checked exclusiveBackupState
+			 * above, maybe it would be OK to just unlink any such label
+			 * file?)
+			 */
+			if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
+			{
+				if (errno != ENOENT)
+					ereport(ERROR,
+							(errcode_for_file_access(),
+							 errmsg("could not stat file \"%s\": %m",
+									BACKUP_LABEL_FILE)));
+			}
+			else
+				ereport(ERROR,
+						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+						 errmsg("a backup is already in progress"),
+						 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
+								 BACKUP_LABEL_FILE)));
 
-/*
- * Get latest WAL write pointer
- */
-XLogRecPtr
-GetXLogWriteRecPtr(void)
-{
-	SpinLockAcquire(&XLogCtl->info_lck);
-	LogwrtResult = XLogCtl->LogwrtResult;
-	SpinLockRelease(&XLogCtl->info_lck);
+			fp = AllocateFile(BACKUP_LABEL_FILE, "w");
 
-	return LogwrtResult.Write;
-}
+			if (!fp)
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not create file \"%s\": %m",
+								BACKUP_LABEL_FILE)));
+			if (fwrite(labelfile->data, labelfile->len, 1, fp) != 1 ||
+				fflush(fp) != 0 ||
+				pg_fsync(fileno(fp)) != 0 ||
+				ferror(fp) ||
+				FreeFile(fp))
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not write file \"%s\": %m",
+								BACKUP_LABEL_FILE)));
+			/* Allocated locally for exclusive backups, so free separately */
+			pfree(labelfile->data);
+			pfree(labelfile);
 
-/*
- * Returns the redo pointer of the last checkpoint or restartpoint. This is
- * the oldest point in WAL that we still need, if we have to restart recovery.
- */
-void
-GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
-{
-	LWLockAcquire(ControlFileLock, LW_SHARED);
-	*oldrecptr = ControlFile->checkPointCopy.redo;
-	*oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
-	LWLockRelease(ControlFileLock);
-}
+			/* Write backup tablespace_map file. */
+			if (tblspcmapfile->len > 0)
+			{
+				if (stat(TABLESPACE_MAP, &stat_buf) != 0)
+				{
+					if (errno != ENOENT)
+						ereport(ERROR,
+								(errcode_for_file_access(),
+								 errmsg("could not stat file \"%s\": %m",
+										TABLESPACE_MAP)));
+				}
+				else
+					ereport(ERROR,
+							(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+							 errmsg("a backup is already in progress"),
+							 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
+									 TABLESPACE_MAP)));
 
-/*
- * read_backup_label: check to see if a backup_label file is present
- *
- * If we see a backup_label during recovery, we assume that we are recovering
- * from a backup dump file, and we therefore roll forward from the checkpoint
- * identified by the label file, NOT what pg_control says.  This avoids the
- * problem that pg_control might have been archived one or more checkpoints
- * later than the start of the dump, and so if we rely on it as the start
- * point, we will fail to restore a consistent database state.
- *
- * Returns true if a backup_label was found (and fills the checkpoint
- * location and its REDO location into *checkPointLoc and RedoStartLSN,
- * respectively); returns false if not. If this backup_label came from a
- * streamed backup, *backupEndRequired is set to true. If this backup_label
- * was created during recovery, *backupFromStandby is set to true.
- */
-static bool
-read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
-				  bool *backupFromStandby)
-{
-	char		startxlogfilename[MAXFNAMELEN];
-	TimeLineID	tli_from_walseg,
-				tli_from_file;
-	FILE	   *lfp;
-	char		ch;
-	char		backuptype[20];
-	char		backupfrom[20];
-	char		backuplabel[MAXPGPATH];
-	char		backuptime[128];
-	uint32		hi,
-				lo;
+				fp = AllocateFile(TABLESPACE_MAP, "w");
 
-	*backupEndRequired = false;
-	*backupFromStandby = false;
+				if (!fp)
+					ereport(ERROR,
+							(errcode_for_file_access(),
+							 errmsg("could not create file \"%s\": %m",
+									TABLESPACE_MAP)));
+				if (fwrite(tblspcmapfile->data, tblspcmapfile->len, 1, fp) != 1 ||
+					fflush(fp) != 0 ||
+					pg_fsync(fileno(fp)) != 0 ||
+					ferror(fp) ||
+					FreeFile(fp))
+					ereport(ERROR,
+							(errcode_for_file_access(),
+							 errmsg("could not write file \"%s\": %m",
+									TABLESPACE_MAP)));
+			}
 
-	/*
-	 * See if label file is present
-	 */
-	lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
-	if (!lfp)
-	{
-		if (errno != ENOENT)
-			ereport(FATAL,
-					(errcode_for_file_access(),
-					 errmsg("could not read file \"%s\": %m",
-							BACKUP_LABEL_FILE)));
-		return false;			/* it's not there, all is fine */
+			/* Allocated locally for exclusive backups, so free separately */
+			pfree(tblspcmapfile->data);
+			pfree(tblspcmapfile);
+		}
 	}
+	PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
 
 	/*
-	 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
-	 * is pretty crude, but we are not expecting any variability in the file
-	 * format).
-	 */
-	if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
-			   &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
-		ereport(FATAL,
-				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
-	RedoStartLSN = ((uint64) hi) << 32 | lo;
-	if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
-			   &hi, &lo, &ch) != 3 || ch != '\n')
-		ereport(FATAL,
-				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
-	*checkPointLoc = ((uint64) hi) << 32 | lo;
-
-	/*
-	 * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
-	 * from an older backup anyway, but since the information on it is not
-	 * strictly required, don't error out if it's missing for some reason.
+	 * Mark that start phase has correctly finished for an exclusive backup.
+	 * Session-level locks are updated as well to reflect that state.
+	 *
+	 * Note that CHECK_FOR_INTERRUPTS() must not occur while updating backup
+	 * counters and session-level lock. Otherwise they can be updated
+	 * inconsistently, and which might cause do_pg_abort_backup() to fail.
 	 */
-	if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
+	if (exclusive)
 	{
-		if (strcmp(backuptype, "streamed") == 0)
-			*backupEndRequired = true;
-	}
+		WALInsertLockAcquireExclusive();
+		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
 
-	if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
-	{
-		if (strcmp(backupfrom, "standby") == 0)
-			*backupFromStandby = true;
+		/* Set session-level lock */
+		sessionBackupState = SESSION_BACKUP_EXCLUSIVE;
+		WALInsertLockRelease();
 	}
+	else
+		sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE;
 
 	/*
-	 * Parse START TIME and LABEL. Those are not mandatory fields for recovery
-	 * but checking for their presence is useful for debugging and the next
-	 * sanity checks. Cope also with the fact that the result buffers have a
-	 * pre-allocated size, hence if the backup_label file has been generated
-	 * with strings longer than the maximum assumed here an incorrect parsing
-	 * happens. That's fine as only minor consistency checks are done
-	 * afterwards.
-	 */
-	if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
-		ereport(DEBUG1,
-				(errmsg_internal("backup time %s in file \"%s\"",
-								 backuptime, BACKUP_LABEL_FILE)));
-
-	if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
-		ereport(DEBUG1,
-				(errmsg_internal("backup label %s in file \"%s\"",
-								 backuplabel, BACKUP_LABEL_FILE)));
-
-	/*
-	 * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
-	 * it as a sanity check if present.
+	 * We're done.  As a convenience, return the starting WAL location.
 	 */
-	if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
-	{
-		if (tli_from_walseg != tli_from_file)
-			ereport(FATAL,
-					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-					 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
-					 errdetail("Timeline ID parsed is %u, but expected %u.",
-							   tli_from_file, tli_from_walseg)));
-
-		ereport(DEBUG1,
-				(errmsg_internal("backup timeline %u in file \"%s\"",
-								 tli_from_file, BACKUP_LABEL_FILE)));
-	}
-
-	if (ferror(lfp) || FreeFile(lfp))
-		ereport(FATAL,
-				(errcode_for_file_access(),
-				 errmsg("could not read file \"%s\": %m",
-						BACKUP_LABEL_FILE)));
-
-	return true;
+	if (starttli_p)
+		*starttli_p = starttli;
+	return startpoint;
 }
 
-/*
- * read_tablespace_map: check to see if a tablespace_map file is present
- *
- * If we see a tablespace_map file during recovery, we assume that we are
- * recovering from a backup dump file, and we therefore need to create symlinks
- * as per the information present in tablespace_map file.
- *
- * Returns true if a tablespace_map file was found (and fills *tablespaces
- * with a tablespaceinfo struct for each tablespace listed in the file);
- * returns false if not.
- */
-static bool
-read_tablespace_map(List **tablespaces)
+/* Error cleanup callback for pg_start_backup */
+static void
+pg_start_backup_callback(int code, Datum arg)
 {
-	tablespaceinfo *ti;
-	FILE	   *lfp;
-	char		str[MAXPGPATH];
-	int			ch,
-				i,
-				n;
-	bool		was_backslash;
+	bool		exclusive = DatumGetBool(arg);
 
-	/*
-	 * See if tablespace_map file is present
-	 */
-	lfp = AllocateFile(TABLESPACE_MAP, "r");
-	if (!lfp)
+	/* Update backup counters and forcePageWrites on failure */
+	WALInsertLockAcquireExclusive();
+	if (exclusive)
 	{
-		if (errno != ENOENT)
-			ereport(FATAL,
-					(errcode_for_file_access(),
-					 errmsg("could not read file \"%s\": %m",
-							TABLESPACE_MAP)));
-		return false;			/* it's not there, all is fine */
+		Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING);
+		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
 	}
-
-	/*
-	 * Read and parse the link name and path lines from tablespace_map file
-	 * (this code is pretty crude, but we are not expecting any variability in
-	 * the file format).  De-escape any backslashes that were inserted.
-	 */
-	i = 0;
-	was_backslash = false;
-	while ((ch = fgetc(lfp)) != EOF)
+	else
 	{
-		if (!was_backslash && (ch == '\n' || ch == '\r'))
-		{
-			if (i == 0)
-				continue;		/* \r immediately followed by \n */
-
-			/*
-			 * The de-escaped line should contain an OID followed by exactly
-			 * one space followed by a path.  The path might start with
-			 * spaces, so don't be too liberal about parsing.
-			 */
-			str[i] = '\0';
-			n = 0;
-			while (str[n] && str[n] != ' ')
-				n++;
-			if (n < 1 || n >= i - 1)
-				ereport(FATAL,
-						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-						 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
-			str[n++] = '\0';
-
-			ti = palloc0(sizeof(tablespaceinfo));
-			ti->oid = pstrdup(str);
-			ti->path = pstrdup(str + n);
-			*tablespaces = lappend(*tablespaces, ti);
-
-			i = 0;
-			continue;
-		}
-		else if (!was_backslash && ch == '\\')
-			was_backslash = true;
-		else
-		{
-			if (i < sizeof(str) - 1)
-				str[i++] = ch;
-			was_backslash = false;
-		}
+		Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
+		XLogCtl->Insert.nonExclusiveBackups--;
 	}
 
-	if (i != 0 || was_backslash)	/* last line not terminated? */
-		ereport(FATAL,
-				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
-
-	if (ferror(lfp) || FreeFile(lfp))
-		ereport(FATAL,
-				(errcode_for_file_access(),
-				 errmsg("could not read file \"%s\": %m",
-						TABLESPACE_MAP)));
-
-	return true;
+	if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
+		XLogCtl->Insert.nonExclusiveBackups == 0)
+	{
+		XLogCtl->Insert.forcePageWrites = false;
+	}
+	WALInsertLockRelease();
 }
 
 /*
- * Error context callback for errors occurring during rm_redo().
+ * Error cleanup callback for pg_stop_backup
  */
 static void
-rm_redo_error_callback(void *arg)
+pg_stop_backup_callback(int code, Datum arg)
 {
-	XLogReaderState *record = (XLogReaderState *) arg;
-	StringInfoData buf;
-
-	initStringInfo(&buf);
-	xlog_outdesc(&buf, record);
-	xlog_block_info(&buf, record);
-
-	/* translator: %s is a WAL record description */
-	errcontext("WAL redo at %X/%X for %s",
-			   LSN_FORMAT_ARGS(record->ReadRecPtr),
-			   buf.data);
+	bool		exclusive = DatumGetBool(arg);
 
-	pfree(buf.data);
+	/* Update backup status on failure */
+	WALInsertLockAcquireExclusive();
+	if (exclusive)
+	{
+		Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING);
+		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
+	}
+	WALInsertLockRelease();
 }
 
 /*
- * BackupInProgress: check if online backup mode is active
- *
- * This is done by checking for existence of the "backup_label" file.
+ * Utility routine to fetch the session-level status of a backup running.
  */
-bool
-BackupInProgress(void)
+SessionBackupState
+get_backup_status(void)
 {
-	struct stat stat_buf;
-
-	return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
+	return sessionBackupState;
 }
 
 /*
- * CancelBackup: rename the "backup_label" and "tablespace_map"
- *				 files to cancel backup mode
+ * do_pg_stop_backup
  *
- * If the "backup_label" file exists, it will be renamed to "backup_label.old".
- * Similarly, if the "tablespace_map" file exists, it will be renamed to
- * "tablespace_map.old".
+ * Utility function called at the end of an online backup. It cleans up the
+ * backup state and can optionally wait for WAL segments to be archived.
  *
- * Note that this will render an online backup in progress
- * useless. To correctly finish an online backup, pg_stop_backup must be
- * called.
+ * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
+ * the non-exclusive backup specified by 'labelfile'.
+ *
+ * Returns the last WAL location that must be present to restore from this
+ * backup, and the corresponding timeline ID in *stoptli_p.
+ *
+ * It is the responsibility of the caller of this function to verify the
+ * permissions of the calling user!
  */
-void
-CancelBackup(void)
+XLogRecPtr
+do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
 {
-	struct stat stat_buf;
-
-	/* if the backup_label file is not there, return */
-	if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
-		return;
-
-	/* remove leftover file from previously canceled backup if it exists */
-	unlink(BACKUP_LABEL_OLD);
-
-	if (durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, DEBUG1) != 0)
-	{
-		ereport(WARNING,
-				(errcode_for_file_access(),
-				 errmsg("online backup mode was not canceled"),
-				 errdetail("File \"%s\" could not be renamed to \"%s\": %m.",
-						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
-		return;
-	}
-
-	/* if the tablespace_map file is not there, return */
-	if (stat(TABLESPACE_MAP, &stat_buf) < 0)
-	{
-		ereport(LOG,
-				(errmsg("online backup mode canceled"),
-				 errdetail("File \"%s\" was renamed to \"%s\".",
-						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
-		return;
-	}
-
-	/* remove leftover file from previously canceled backup if it exists */
-	unlink(TABLESPACE_MAP_OLD);
-
-	if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
-	{
-		ereport(LOG,
-				(errmsg("online backup mode canceled"),
-				 errdetail("Files \"%s\" and \"%s\" were renamed to "
-						   "\"%s\" and \"%s\", respectively.",
-						   BACKUP_LABEL_FILE, TABLESPACE_MAP,
-						   BACKUP_LABEL_OLD, TABLESPACE_MAP_OLD)));
-	}
-	else
-	{
-		ereport(WARNING,
-				(errcode_for_file_access(),
-				 errmsg("online backup mode canceled"),
-				 errdetail("File \"%s\" was renamed to \"%s\", but "
-						   "file \"%s\" could not be renamed to \"%s\": %m.",
-						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD,
-						   TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
-	}
-}
+	bool		exclusive = (labelfile == NULL);
+	bool		backup_started_in_recovery = false;
+	XLogRecPtr	startpoint;
+	XLogRecPtr	stoppoint;
+	TimeLineID	stoptli;
+	pg_time_t	stamp_time;
+	char		strfbuf[128];
+	char		histfilepath[MAXPGPATH];
+	char		startxlogfilename[MAXFNAMELEN];
+	char		stopxlogfilename[MAXFNAMELEN];
+	char		lastxlogfilename[MAXFNAMELEN];
+	char		histfilename[MAXFNAMELEN];
+	char		backupfrom[20];
+	XLogSegNo	_logSegNo;
+	FILE	   *lfp;
+	FILE	   *fp;
+	char		ch;
+	int			seconds_before_warning;
+	int			waits = 0;
+	bool		reported_waiting = false;
+	char	   *remaining;
+	char	   *ptr;
+	uint32		hi,
+				lo;
 
-/*
- * Read the XLOG page containing RecPtr into readBuf (if not read already).
- * Returns number of bytes read, if the page is read successfully, or -1
- * in case of errors.  When errors occur, they are ereport'ed, but only
- * if they have not been previously reported.
- *
- * This is responsible for restoring files from archive as needed, as well
- * as for waiting for the requested WAL record to arrive in standby mode.
- *
- * 'emode' specifies the log level used for reporting "file not found" or
- * "end of WAL" situations in archive recovery, or in standby mode when a
- * trigger file is found. If set to WARNING or below, XLogPageRead() returns
- * false in those situations, on higher log levels the ereport() won't
- * return.
- *
- * In standby mode, if after a successful return of XLogPageRead() the
- * caller finds the record it's interested in to be broken, it should
- * ereport the error with the level determined by
- * emode_for_corrupt_record(), and then set lastSourceFailed
- * and call XLogPageRead() again with the same arguments. This lets
- * XLogPageRead() to try fetching the record from another source, or to
- * sleep and retry.
- */
-static int
-XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
-			 XLogRecPtr targetRecPtr, char *readBuf)
-{
-	XLogPageReadPrivate *private =
-	(XLogPageReadPrivate *) xlogreader->private_data;
-	int			emode = private->emode;
-	uint32		targetPageOff;
-	XLogSegNo	targetSegNo PG_USED_FOR_ASSERTS_ONLY;
-	int			r;
+	backup_started_in_recovery = RecoveryInProgress();
 
-	XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
-	targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
+	/*
+	 * Currently only non-exclusive backup can be taken during recovery.
+	 */
+	if (backup_started_in_recovery && exclusive)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("recovery is in progress"),
+				 errhint("WAL control functions cannot be executed during recovery.")));
 
 	/*
-	 * See if we need to switch to a new segment because the requested record
-	 * is not in the currently open one.
+	 * During recovery, we don't need to check WAL level. Because, if WAL
+	 * level is not sufficient, it's impossible to get here during recovery.
 	 */
-	if (readFile >= 0 &&
-		!XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
+	if (!backup_started_in_recovery && !XLogIsNeeded())
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("WAL level not sufficient for making an online backup"),
+				 errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
+
+	if (exclusive)
 	{
 		/*
-		 * Request a restartpoint if we've replayed too much xlog since the
-		 * last one.
+		 * At first, mark that we're now stopping an exclusive backup, to
+		 * ensure that there are no other sessions currently running
+		 * pg_start_backup() or pg_stop_backup().
 		 */
-		if (bgwriterLaunched)
+		WALInsertLockAcquireExclusive();
+		if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS)
 		{
-			if (XLogCheckpointNeeded(readSegNo))
-			{
-				(void) GetRedoRecPtr();
-				if (XLogCheckpointNeeded(readSegNo))
-					RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
-			}
+			WALInsertLockRelease();
+			ereport(ERROR,
+					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+					 errmsg("exclusive backup not in progress")));
 		}
+		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING;
+		WALInsertLockRelease();
 
-		close(readFile);
-		readFile = -1;
-		readSource = XLOG_FROM_ANY;
-	}
+		/*
+		 * Remove backup_label. In case of failure, the state for an exclusive
+		 * backup is switched back to in-progress.
+		 */
+		PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
+		{
+			/*
+			 * Read the existing label file into memory.
+			 */
+			struct stat statbuf;
+			int			r;
+
+			if (stat(BACKUP_LABEL_FILE, &statbuf))
+			{
+				/* should not happen per the upper checks */
+				if (errno != ENOENT)
+					ereport(ERROR,
+							(errcode_for_file_access(),
+							 errmsg("could not stat file \"%s\": %m",
+									BACKUP_LABEL_FILE)));
+				ereport(ERROR,
+						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+						 errmsg("a backup is not in progress")));
+			}
 
-	XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
+			lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
+			if (!lfp)
+			{
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not read file \"%s\": %m",
+								BACKUP_LABEL_FILE)));
+			}
+			labelfile = palloc(statbuf.st_size + 1);
+			r = fread(labelfile, statbuf.st_size, 1, lfp);
+			labelfile[statbuf.st_size] = '\0';
 
-retry:
-	/* See if we need to retrieve more data */
-	if (readFile < 0 ||
-		(readSource == XLOG_FROM_STREAM &&
-		 flushedUpto < targetPagePtr + reqLen))
-	{
-		if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
-										 private->randAccess,
-										 private->fetching_ckpt,
-										 targetRecPtr))
-		{
-			if (readFile >= 0)
-				close(readFile);
-			readFile = -1;
-			readLen = 0;
-			readSource = XLOG_FROM_ANY;
+			/*
+			 * Close and remove the backup label file
+			 */
+			if (r != 1 || ferror(lfp) || FreeFile(lfp))
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not read file \"%s\": %m",
+								BACKUP_LABEL_FILE)));
+			durable_unlink(BACKUP_LABEL_FILE, ERROR);
 
-			return -1;
+			/*
+			 * Remove tablespace_map file if present, it is created only if
+			 * there are tablespaces.
+			 */
+			durable_unlink(TABLESPACE_MAP, DEBUG1);
 		}
+		PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
 	}
 
 	/*
-	 * At this point, we have the right segment open and if we're streaming we
-	 * know the requested record is in it.
-	 */
-	Assert(readFile != -1);
-
-	/*
-	 * If the current segment is being streamed from the primary, calculate
-	 * how much of the current page we have received already. We know the
-	 * requested record has been received, but this is for the benefit of
-	 * future calls, to allow quick exit at the top of this function.
+	 * OK to update backup counters, forcePageWrites and session-level lock.
+	 *
+	 * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them.
+	 * Otherwise they can be updated inconsistently, and which might cause
+	 * do_pg_abort_backup() to fail.
 	 */
-	if (readSource == XLOG_FROM_STREAM)
+	WALInsertLockAcquireExclusive();
+	if (exclusive)
 	{
-		if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
-			readLen = XLOG_BLCKSZ;
-		else
-			readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
-				targetPageOff;
+		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
 	}
 	else
-		readLen = XLOG_BLCKSZ;
-
-	/* Read the requested page */
-	readOff = targetPageOff;
-
-	pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
-	r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
-	if (r != XLOG_BLCKSZ)
 	{
-		char		fname[MAXFNAMELEN];
-		int			save_errno = errno;
-
-		pgstat_report_wait_end();
-		XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
-		if (r < 0)
-		{
-			errno = save_errno;
-			ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
-					(errcode_for_file_access(),
-					 errmsg("could not read from log segment %s, offset %u: %m",
-							fname, readOff)));
-		}
-		else
-			ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
-					(errcode(ERRCODE_DATA_CORRUPTED),
-					 errmsg("could not read from log segment %s, offset %u: read %d of %zu",
-							fname, readOff, r, (Size) XLOG_BLCKSZ)));
-		goto next_record_is_invalid;
+		/*
+		 * The user-visible pg_start/stop_backup() functions that operate on
+		 * exclusive backups can be called at any time, but for non-exclusive
+		 * backups, it is expected that each do_pg_start_backup() call is
+		 * matched by exactly one do_pg_stop_backup() call.
+		 */
+		Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
+		XLogCtl->Insert.nonExclusiveBackups--;
 	}
-	pgstat_report_wait_end();
 
-	Assert(targetSegNo == readSegNo);
-	Assert(targetPageOff == readOff);
-	Assert(reqLen <= readLen);
-
-	xlogreader->seg.ws_tli = curFileTLI;
-
-	/*
-	 * Check the page header immediately, so that we can retry immediately if
-	 * it's not valid. This may seem unnecessary, because XLogReadRecord()
-	 * validates the page header anyway, and would propagate the failure up to
-	 * ReadRecord(), which would retry. However, there's a corner case with
-	 * continuation records, if a record is split across two pages such that
-	 * we would need to read the two pages from different sources. For
-	 * example, imagine a scenario where a streaming replica is started up,
-	 * and replay reaches a record that's split across two WAL segments. The
-	 * first page is only available locally, in pg_wal, because it's already
-	 * been recycled on the primary. The second page, however, is not present
-	 * in pg_wal, and we should stream it from the primary. There is a
-	 * recycled WAL segment present in pg_wal, with garbage contents, however.
-	 * We would read the first page from the local WAL segment, but when
-	 * reading the second page, we would read the bogus, recycled, WAL
-	 * segment. If we didn't catch that case here, we would never recover,
-	 * because ReadRecord() would retry reading the whole record from the
-	 * beginning.
-	 *
-	 * Of course, this only catches errors in the page header, which is what
-	 * happens in the case of a recycled WAL segment. Other kinds of errors or
-	 * corruption still has the same problem. But this at least fixes the
-	 * common case, which can happen as part of normal operation.
-	 *
-	 * Validating the page header is cheap enough that doing it twice
-	 * shouldn't be a big deal from a performance point of view.
-	 */
-	if (!XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
+	if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
+		XLogCtl->Insert.nonExclusiveBackups == 0)
 	{
-		/* reset any error XLogReaderValidatePageHeader() might have set */
-		xlogreader->errormsg_buf[0] = '\0';
-		goto next_record_is_invalid;
+		XLogCtl->Insert.forcePageWrites = false;
 	}
 
-	return readLen;
-
-next_record_is_invalid:
-	lastSourceFailed = true;
+	/*
+	 * Clean up session-level lock.
+	 *
+	 * You might think that WALInsertLockRelease() can be called before
+	 * cleaning up session-level lock because session-level lock doesn't need
+	 * to be protected with WAL insertion lock. But since
+	 * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
+	 * cleaned up before it.
+	 */
+	sessionBackupState = SESSION_BACKUP_NONE;
 
-	if (readFile >= 0)
-		close(readFile);
-	readFile = -1;
-	readLen = 0;
-	readSource = XLOG_FROM_ANY;
+	WALInsertLockRelease();
 
-	/* In standby-mode, keep trying */
-	if (StandbyMode)
-		goto retry;
-	else
-		return -1;
-}
+	/*
+	 * Read and parse the START WAL LOCATION line (this code is pretty crude,
+	 * but we are not expecting any variability in the file format).
+	 */
+	if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
+			   &hi, &lo, startxlogfilename,
+			   &ch) != 4 || ch != '\n')
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
+	startpoint = ((uint64) hi) << 32 | lo;
+	remaining = strchr(labelfile, '\n') + 1;	/* %n is not portable enough */
 
-/*
- * Open the WAL segment containing WAL location 'RecPtr'.
- *
- * The segment can be fetched via restore_command, or via walreceiver having
- * streamed the record, or it can already be present in pg_wal. Checking
- * pg_wal is mainly for crash recovery, but it will be polled in standby mode
- * too, in case someone copies a new segment directly to pg_wal. That is not
- * documented or recommended, though.
- *
- * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
- * prepare to read WAL starting from RedoStartLSN after this.
- *
- * 'RecPtr' might not point to the beginning of the record we're interested
- * in, it might also point to the page or segment header. In that case,
- * 'tliRecPtr' is the position of the WAL record we're interested in. It is
- * used to decide which timeline to stream the requested WAL from.
- *
- * If the record is not immediately available, the function returns false
- * if we're not in standby mode. In standby mode, waits for it to become
- * available.
- *
- * When the requested record becomes available, the function opens the file
- * containing it (if not open already), and returns true. When end of standby
- * mode is triggered by the user, and there is no more WAL available, returns
- * false.
- */
-static bool
-WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
-							bool fetching_ckpt, XLogRecPtr tliRecPtr)
-{
-	static TimestampTz last_fail_time = 0;
-	TimestampTz now;
-	bool		streaming_reply_sent = false;
+	/*
+	 * Parse the BACKUP FROM line. If we are taking an online backup from the
+	 * standby, we confirm that the standby has not been promoted during the
+	 * backup.
+	 */
+	ptr = strstr(remaining, "BACKUP FROM:");
+	if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
+	if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("the standby was promoted during online backup"),
+				 errhint("This means that the backup being taken is corrupt "
+						 "and should not be used. "
+						 "Try taking another online backup.")));
 
-	/*-------
-	 * Standby mode is implemented by a state machine:
-	 *
-	 * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
-	 *	  pg_wal (XLOG_FROM_PG_WAL)
-	 * 2. Check trigger file
-	 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
-	 * 4. Rescan timelines
-	 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
+	/*
+	 * During recovery, we don't write an end-of-backup record. We assume that
+	 * pg_control was backed up last and its minimum recovery point can be
+	 * available as the backup end location. Since we don't have an
+	 * end-of-backup record, we use the pg_control value to check whether
+	 * we've reached the end of backup when starting recovery from this
+	 * backup. We have no way of checking if pg_control wasn't backed up last
+	 * however.
 	 *
-	 * Failure to read from the current source advances the state machine to
-	 * the next state.
+	 * We don't force a switch to new WAL file but it is still possible to
+	 * wait for all the required files to be archived if waitforarchive is
+	 * true. This is okay if we use the backup to start a standby and fetch
+	 * the missing WAL using streaming replication. But in the case of an
+	 * archive recovery, a user should set waitforarchive to true and wait for
+	 * them to be archived to ensure that all the required files are
+	 * available.
 	 *
-	 * 'currentSource' indicates the current state. There are no currentSource
-	 * values for "check trigger", "rescan timelines", and "sleep" states,
-	 * those actions are taken when reading from the previous source fails, as
-	 * part of advancing to the next state.
+	 * We return the current minimum recovery point as the backup end
+	 * location. Note that it can be greater than the exact backup end
+	 * location if the minimum recovery point is updated after the backup of
+	 * pg_control. This is harmless for current uses.
 	 *
-	 * If standby mode is turned off while reading WAL from stream, we move
-	 * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
-	 * the files (which would be required at end of recovery, e.g., timeline
-	 * history file) from archive or pg_wal. We don't need to kill WAL receiver
-	 * here because it's already stopped when standby mode is turned off at
-	 * the end of recovery.
-	 *-------
+	 * XXX currently a backup history file is for informational and debug
+	 * purposes only. It's not essential for an online backup. Furthermore,
+	 * even if it's created, it will not be archived during recovery because
+	 * an archiver is not invoked. So it doesn't seem worthwhile to write a
+	 * backup history file during recovery.
 	 */
-	if (!InArchiveRecovery)
-		currentSource = XLOG_FROM_PG_WAL;
-	else if (currentSource == XLOG_FROM_ANY ||
-			 (!StandbyMode && currentSource == XLOG_FROM_STREAM))
-	{
-		lastSourceFailed = false;
-		currentSource = XLOG_FROM_ARCHIVE;
-	}
-
-	for (;;)
+	if (backup_started_in_recovery)
 	{
-		XLogSource	oldSource = currentSource;
-		bool		startWalReceiver = false;
+		XLogRecPtr	recptr;
 
 		/*
-		 * First check if we failed to read from the current source, and
-		 * advance the state machine if so. The failure to read might've
-		 * happened outside this function, e.g when a CRC check fails on a
-		 * record, or within this loop.
+		 * Check to see if all WAL replayed during online backup contain
+		 * full-page writes.
 		 */
-		if (lastSourceFailed)
-		{
-			switch (currentSource)
-			{
-				case XLOG_FROM_ARCHIVE:
-				case XLOG_FROM_PG_WAL:
-
-					/*
-					 * Check to see if the trigger file exists. Note that we
-					 * do this only after failure, so when you create the
-					 * trigger file, we still finish replaying as much as we
-					 * can from archive and pg_wal before failover.
-					 */
-					if (StandbyMode && CheckForStandbyTrigger())
-					{
-						ShutdownWalRcv();
-						return false;
-					}
-
-					/*
-					 * Not in standby mode, and we've now tried the archive
-					 * and pg_wal.
-					 */
-					if (!StandbyMode)
-						return false;
-
-					/*
-					 * Move to XLOG_FROM_STREAM state, and set to start a
-					 * walreceiver if necessary.
-					 */
-					currentSource = XLOG_FROM_STREAM;
-					startWalReceiver = true;
-					break;
-
-				case XLOG_FROM_STREAM:
-
-					/*
-					 * Failure while streaming. Most likely, we got here
-					 * because streaming replication was terminated, or
-					 * promotion was triggered. But we also get here if we
-					 * find an invalid record in the WAL streamed from the
-					 * primary, in which case something is seriously wrong.
-					 * There's little chance that the problem will just go
-					 * away, but PANIC is not good for availability either,
-					 * especially in hot standby mode. So, we treat that the
-					 * same as disconnection, and retry from archive/pg_wal
-					 * again. The WAL in the archive should be identical to
-					 * what was streamed, so it's unlikely that it helps, but
-					 * one can hope...
-					 */
-
-					/*
-					 * We should be able to move to XLOG_FROM_STREAM only in
-					 * standby mode.
-					 */
-					Assert(StandbyMode);
-
-					/*
-					 * Before we leave XLOG_FROM_STREAM state, make sure that
-					 * walreceiver is not active, so that it won't overwrite
-					 * WAL that we restore from archive.
-					 */
-					if (WalRcvStreaming())
-						ShutdownWalRcv();
-
-					/*
-					 * Before we sleep, re-scan for possible new timelines if
-					 * we were requested to recover to the latest timeline.
-					 */
-					if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
-					{
-						if (rescanLatestTimeLine())
-						{
-							currentSource = XLOG_FROM_ARCHIVE;
-							break;
-						}
-					}
+		SpinLockAcquire(&XLogCtl->info_lck);
+		recptr = XLogCtl->lastFpwDisableRecPtr;
+		SpinLockRelease(&XLogCtl->info_lck);
 
-					/*
-					 * XLOG_FROM_STREAM is the last state in our state
-					 * machine, so we've exhausted all the options for
-					 * obtaining the requested WAL. We're going to loop back
-					 * and retry from the archive, but if it hasn't been long
-					 * since last attempt, sleep wal_retrieve_retry_interval
-					 * milliseconds to avoid busy-waiting.
-					 */
-					now = GetCurrentTimestamp();
-					if (!TimestampDifferenceExceeds(last_fail_time, now,
-													wal_retrieve_retry_interval))
-					{
-						long		wait_time;
-
-						wait_time = wal_retrieve_retry_interval -
-							TimestampDifferenceMilliseconds(last_fail_time, now);
-
-						(void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
-										 WL_LATCH_SET | WL_TIMEOUT |
-										 WL_EXIT_ON_PM_DEATH,
-										 wait_time,
-										 WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
-						ResetLatch(&XLogCtl->recoveryWakeupLatch);
-						now = GetCurrentTimestamp();
-
-						/* Handle interrupt signals of startup process */
-						HandleStartupProcInterrupts();
-					}
-					last_fail_time = now;
-					currentSource = XLOG_FROM_ARCHIVE;
-					break;
-
-				default:
-					elog(ERROR, "unexpected WAL source %d", currentSource);
-			}
-		}
-		else if (currentSource == XLOG_FROM_PG_WAL)
-		{
-			/*
-			 * We just successfully read a file in pg_wal. We prefer files in
-			 * the archive over ones in pg_wal, so try the next file again
-			 * from the archive first.
-			 */
-			if (InArchiveRecovery)
-				currentSource = XLOG_FROM_ARCHIVE;
-		}
+		if (startpoint <= recptr)
+			ereport(ERROR,
+					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+					 errmsg("WAL generated with full_page_writes=off was replayed "
+							"during online backup"),
+					 errhint("This means that the backup being taken on the standby "
+							 "is corrupt and should not be used. "
+							 "Enable full_page_writes and run CHECKPOINT on the primary, "
+							 "and then try an online backup again.")));
 
-		if (currentSource != oldSource)
-			elog(DEBUG2, "switched WAL source from %s to %s after %s",
-				 xlogSourceNames[oldSource], xlogSourceNames[currentSource],
-				 lastSourceFailed ? "failure" : "success");
 
+		LWLockAcquire(ControlFileLock, LW_SHARED);
+		stoppoint = ControlFile->minRecoveryPoint;
+		stoptli = ControlFile->minRecoveryPointTLI;
+		LWLockRelease(ControlFileLock);
+	}
+	else
+	{
 		/*
-		 * We've now handled possible failure. Try to read from the chosen
-		 * source.
+		 * Write the backup-end xlog record
 		 */
-		lastSourceFailed = false;
-
-		switch (currentSource)
-		{
-			case XLOG_FROM_ARCHIVE:
-			case XLOG_FROM_PG_WAL:
+		XLogBeginInsert();
+		XLogRegisterData((char *) (&startpoint), sizeof(startpoint));
+		stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
+		stoptli = ThisTimeLineID;
 
-				/*
-				 * WAL receiver must not be running when reading WAL from
-				 * archive or pg_wal.
-				 */
-				Assert(!WalRcvStreaming());
+		/*
+		 * Force a switch to a new xlog segment file, so that the backup is
+		 * valid as soon as archiver moves out the current segment file.
+		 */
+		RequestXLogSwitch(false);
 
-				/* Close any old file we might have open. */
-				if (readFile >= 0)
-				{
-					close(readFile);
-					readFile = -1;
-				}
-				/* Reset curFileTLI if random fetch. */
-				if (randAccess)
-					curFileTLI = 0;
+		XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
+		XLogFileName(stopxlogfilename, stoptli, _logSegNo, wal_segment_size);
 
-				/*
-				 * Try to restore the file from archive, or read an existing
-				 * file from pg_wal.
-				 */
-				readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
-											  currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
-											  currentSource);
-				if (readFile >= 0)
-					return true;	/* success! */
+		/* Use the log timezone here, not the session timezone */
+		stamp_time = (pg_time_t) time(NULL);
+		pg_strftime(strfbuf, sizeof(strfbuf),
+					"%Y-%m-%d %H:%M:%S %Z",
+					pg_localtime(&stamp_time, log_timezone));
 
-				/*
-				 * Nope, not found in archive or pg_wal.
-				 */
-				lastSourceFailed = true;
-				break;
+		/*
+		 * Write the backup history file
+		 */
+		XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
+		BackupHistoryFilePath(histfilepath, stoptli, _logSegNo,
+							  startpoint, wal_segment_size);
+		fp = AllocateFile(histfilepath, "w");
+		if (!fp)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not create file \"%s\": %m",
+							histfilepath)));
+		fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
+				LSN_FORMAT_ARGS(startpoint), startxlogfilename);
+		fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
+				LSN_FORMAT_ARGS(stoppoint), stopxlogfilename);
 
-			case XLOG_FROM_STREAM:
-				{
-					bool		havedata;
+		/*
+		 * Transfer remaining lines including label and start timeline to
+		 * history file.
+		 */
+		fprintf(fp, "%s", remaining);
+		fprintf(fp, "STOP TIME: %s\n", strfbuf);
+		fprintf(fp, "STOP TIMELINE: %u\n", stoptli);
+		if (fflush(fp) || ferror(fp) || FreeFile(fp))
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not write file \"%s\": %m",
+							histfilepath)));
 
-					/*
-					 * We should be able to move to XLOG_FROM_STREAM only in
-					 * standby mode.
-					 */
-					Assert(StandbyMode);
+		/*
+		 * Clean out any no-longer-needed history files.  As a side effect,
+		 * this will post a .ready file for the newly created history file,
+		 * notifying the archiver that history file may be archived
+		 * immediately.
+		 */
+		CleanupBackupHistory();
+	}
 
-					/*
-					 * First, shutdown walreceiver if its restart has been
-					 * requested -- but no point if we're already slated for
-					 * starting it.
-					 */
-					if (pendingWalRcvRestart && !startWalReceiver)
-					{
-						ShutdownWalRcv();
-
-						/*
-						 * Re-scan for possible new timelines if we were
-						 * requested to recover to the latest timeline.
-						 */
-						if (recoveryTargetTimeLineGoal ==
-							RECOVERY_TARGET_TIMELINE_LATEST)
-							rescanLatestTimeLine();
-
-						startWalReceiver = true;
-					}
-					pendingWalRcvRestart = false;
+	/*
+	 * If archiving is enabled, wait for all the required WAL files to be
+	 * archived before returning. If archiving isn't enabled, the required WAL
+	 * needs to be transported via streaming replication (hopefully with
+	 * wal_keep_size set high enough), or some more exotic mechanism like
+	 * polling and copying files from pg_wal with script. We have no knowledge
+	 * of those mechanisms, so it's up to the user to ensure that he gets all
+	 * the required WAL.
+	 *
+	 * We wait until both the last WAL file filled during backup and the
+	 * history file have been archived, and assume that the alphabetic sorting
+	 * property of the WAL files ensures any earlier WAL files are safely
+	 * archived as well.
+	 *
+	 * We wait forever, since archive_command is supposed to work and we
+	 * assume the admin wanted his backup to work completely. If you don't
+	 * wish to wait, then either waitforarchive should be passed in as false,
+	 * or you can set statement_timeout.  Also, some notices are issued to
+	 * clue in anyone who might be doing this interactively.
+	 */
 
-					/*
-					 * Launch walreceiver if needed.
-					 *
-					 * If fetching_ckpt is true, RecPtr points to the initial
-					 * checkpoint location. In that case, we use RedoStartLSN
-					 * as the streaming start position instead of RecPtr, so
-					 * that when we later jump backwards to start redo at
-					 * RedoStartLSN, we will have the logs streamed already.
-					 */
-					if (startWalReceiver &&
-						PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
-					{
-						XLogRecPtr	ptr;
-						TimeLineID	tli;
-
-						if (fetching_ckpt)
-						{
-							ptr = RedoStartLSN;
-							tli = ControlFile->checkPointCopy.ThisTimeLineID;
-						}
-						else
-						{
-							ptr = RecPtr;
-
-							/*
-							 * Use the record begin position to determine the
-							 * TLI, rather than the position we're reading.
-							 */
-							tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
-
-							if (curFileTLI > 0 && tli < curFileTLI)
-								elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
-									 LSN_FORMAT_ARGS(tliRecPtr),
-									 tli, curFileTLI);
-						}
-						curFileTLI = tli;
-						RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
-											 PrimarySlotName,
-											 wal_receiver_create_temp_slot);
-						flushedUpto = 0;
-					}
+	if (waitforarchive &&
+		((!backup_started_in_recovery && XLogArchivingActive()) ||
+		 (backup_started_in_recovery && XLogArchivingAlways())))
+	{
+		XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
+		XLogFileName(lastxlogfilename, stoptli, _logSegNo, wal_segment_size);
 
-					/*
-					 * Check if WAL receiver is active or wait to start up.
-					 */
-					if (!WalRcvStreaming())
-					{
-						lastSourceFailed = true;
-						break;
-					}
+		XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
+		BackupHistoryFileName(histfilename, stoptli, _logSegNo,
+							  startpoint, wal_segment_size);
 
-					/*
-					 * Walreceiver is active, so see if new data has arrived.
-					 *
-					 * We only advance XLogReceiptTime when we obtain fresh
-					 * WAL from walreceiver and observe that we had already
-					 * processed everything before the most recent "chunk"
-					 * that it flushed to disk.  In steady state where we are
-					 * keeping up with the incoming data, XLogReceiptTime will
-					 * be updated on each cycle. When we are behind,
-					 * XLogReceiptTime will not advance, so the grace time
-					 * allotted to conflicting queries will decrease.
-					 */
-					if (RecPtr < flushedUpto)
-						havedata = true;
-					else
-					{
-						XLogRecPtr	latestChunkStart;
-
-						flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
-						if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
-						{
-							havedata = true;
-							if (latestChunkStart <= RecPtr)
-							{
-								XLogReceiptTime = GetCurrentTimestamp();
-								SetCurrentChunkStartTime(XLogReceiptTime);
-							}
-						}
-						else
-							havedata = false;
-					}
-					if (havedata)
-					{
-						/*
-						 * Great, streamed far enough.  Open the file if it's
-						 * not open already.  Also read the timeline history
-						 * file if we haven't initialized timeline history
-						 * yet; it should be streamed over and present in
-						 * pg_wal by now.  Use XLOG_FROM_STREAM so that source
-						 * info is set correctly and XLogReceiptTime isn't
-						 * changed.
-						 *
-						 * NB: We must set readTimeLineHistory based on
-						 * recoveryTargetTLI, not receiveTLI. Normally they'll
-						 * be the same, but if recovery_target_timeline is
-						 * 'latest' and archiving is configured, then it's
-						 * possible that we managed to retrieve one or more
-						 * new timeline history files from the archive,
-						 * updating recoveryTargetTLI.
-						 */
-						if (readFile < 0)
-						{
-							if (!expectedTLEs)
-								expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
-							readFile = XLogFileRead(readSegNo, PANIC,
-													receiveTLI,
-													XLOG_FROM_STREAM, false);
-							Assert(readFile >= 0);
-						}
-						else
-						{
-							/* just make sure source info is correct... */
-							readSource = XLOG_FROM_STREAM;
-							XLogReceiptSource = XLOG_FROM_STREAM;
-							return true;
-						}
-						break;
-					}
+		seconds_before_warning = 60;
+		waits = 0;
 
-					/*
-					 * Data not here yet. Check for trigger, then wait for
-					 * walreceiver to wake us up when new WAL arrives.
-					 */
-					if (CheckForStandbyTrigger())
-					{
-						/*
-						 * Note that we don't "return false" immediately here.
-						 * After being triggered, we still want to replay all
-						 * the WAL that was already streamed. It's in pg_wal
-						 * now, so we just treat this as a failure, and the
-						 * state machine will move on to replay the streamed
-						 * WAL from pg_wal, and then recheck the trigger and
-						 * exit replay.
-						 */
-						lastSourceFailed = true;
-						break;
-					}
+		while (XLogArchiveIsBusy(lastxlogfilename) ||
+			   XLogArchiveIsBusy(histfilename))
+		{
+			CHECK_FOR_INTERRUPTS();
 
-					/*
-					 * Since we have replayed everything we have received so
-					 * far and are about to start waiting for more WAL, let's
-					 * tell the upstream server our replay location now so
-					 * that pg_stat_replication doesn't show stale
-					 * information.
-					 */
-					if (!streaming_reply_sent)
-					{
-						WalRcvForceReply();
-						streaming_reply_sent = true;
-					}
+			if (!reported_waiting && waits > 5)
+			{
+				ereport(NOTICE,
+						(errmsg("base backup done, waiting for required WAL segments to be archived")));
+				reported_waiting = true;
+			}
 
-					/*
-					 * Wait for more WAL to arrive. Time out after 5 seconds
-					 * to react to a trigger file promptly and to check if the
-					 * WAL receiver is still active.
-					 */
-					(void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
-									 WL_LATCH_SET | WL_TIMEOUT |
-									 WL_EXIT_ON_PM_DEATH,
-									 5000L, WAIT_EVENT_RECOVERY_WAL_STREAM);
-					ResetLatch(&XLogCtl->recoveryWakeupLatch);
-					break;
-				}
+			pgstat_report_wait_start(WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE);
+			pg_usleep(1000000L);
+			pgstat_report_wait_end();
 
-			default:
-				elog(ERROR, "unexpected WAL source %d", currentSource);
+			if (++waits >= seconds_before_warning)
+			{
+				seconds_before_warning *= 2;	/* This wraps in >10 years... */
+				ereport(WARNING,
+						(errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)",
+								waits),
+						 errhint("Check that your archive_command is executing properly.  "
+								 "You can safely cancel this backup, "
+								 "but the database backup will not be usable without all the WAL segments.")));
+			}
 		}
 
-		/*
-		 * Check for recovery pause here so that we can confirm more quickly
-		 * that a requested pause has actually taken effect.
-		 */
-		if (((volatile XLogCtlData *) XLogCtl)->recoveryPauseState !=
-			RECOVERY_NOT_PAUSED)
-			recoveryPausesHere(false);
-
-		/*
-		 * This possibly-long loop needs to handle interrupts of startup
-		 * process.
-		 */
-		HandleStartupProcInterrupts();
+		ereport(NOTICE,
+				(errmsg("all required WAL segments have been archived")));
 	}
+	else if (waitforarchive)
+		ereport(NOTICE,
+				(errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
 
-	return false;				/* not reached */
+	/*
+	 * We're done.  As a convenience, return the ending WAL location.
+	 */
+	if (stoptli_p)
+		*stoptli_p = stoptli;
+	return stoppoint;
 }
 
+
 /*
- * Set flag to signal the walreceiver to restart.  (The startup process calls
- * this on noticing a relevant configuration change.)
+ * do_pg_abort_backup: abort a running backup
+ *
+ * This does just the most basic steps of do_pg_stop_backup(), by taking the
+ * system out of backup mode, thus making it a lot more safe to call from
+ * an error handler.
+ *
+ * The caller can pass 'arg' as 'true' or 'false' to control whether a warning
+ * is emitted.
+ *
+ * NB: This is only for aborting a non-exclusive backup that doesn't write
+ * backup_label. A backup started with pg_start_backup() needs to be finished
+ * with pg_stop_backup().
+ *
+ * NB: This gets used as a before_shmem_exit handler, hence the odd-looking
+ * signature.
  */
 void
-StartupRequestWalReceiverRestart(void)
+do_pg_abort_backup(int code, Datum arg)
 {
-	if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
-	{
-		ereport(LOG,
-				(errmsg("WAL receiver process shutdown requested")));
+	bool		emit_warning = DatumGetBool(arg);
 
-		pendingWalRcvRestart = true;
-	}
-}
+	/*
+	 * Quick exit if session is not keeping around a non-exclusive backup
+	 * already started.
+	 */
+	if (sessionBackupState != SESSION_BACKUP_NON_EXCLUSIVE)
+		return;
 
-/*
- * Determine what log level should be used to report a corrupt WAL record
- * in the current WAL page, previously read by XLogPageRead().
- *
- * 'emode' is the error mode that would be used to report a file-not-found
- * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
- * we're retrying the exact same record that we've tried previously, only
- * complain the first time to keep the noise down.  However, we only do when
- * reading from pg_wal, because we don't expect any invalid records in archive
- * or in records streamed from the primary. Files in the archive should be complete,
- * and we should never hit the end of WAL because we stop and wait for more WAL
- * to arrive before replaying it.
- *
- * NOTE: This function remembers the RecPtr value it was last called with,
- * to suppress repeated messages about the same record. Only call this when
- * you are about to ereport(), or you might cause a later message to be
- * erroneously suppressed.
- */
-static int
-emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
-{
-	static XLogRecPtr lastComplaint = 0;
+	WALInsertLockAcquireExclusive();
+	Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
+	XLogCtl->Insert.nonExclusiveBackups--;
 
-	if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
+	if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
+		XLogCtl->Insert.nonExclusiveBackups == 0)
 	{
-		if (RecPtr == lastComplaint)
-			emode = DEBUG1;
-		else
-			lastComplaint = RecPtr;
+		XLogCtl->Insert.forcePageWrites = false;
 	}
-	return emode;
+	WALInsertLockRelease();
+
+	if (emit_warning)
+		ereport(WARNING,
+				(errmsg("aborting backup due to backend exiting before pg_stop_backup was called")));
 }
 
 /*
- * Has a standby promotion already been triggered?
- *
- * Unlike CheckForStandbyTrigger(), this works in any process
- * that's connected to shared memory.
+ * Register a handler that will warn about unterminated backups at end of
+ * session, unless this has already been done.
  */
-bool
-PromoteIsTriggered(void)
+void
+register_persistent_abort_backup_handler(void)
 {
-	/*
-	 * We check shared state each time only until a standby promotion is
-	 * triggered. We can't trigger a promotion again, so there's no need to
-	 * keep checking after the shared variable has once been seen true.
-	 */
-	if (LocalPromoteIsTriggered)
-		return true;
-
-	SpinLockAcquire(&XLogCtl->info_lck);
-	LocalPromoteIsTriggered = XLogCtl->SharedPromoteIsTriggered;
-	SpinLockRelease(&XLogCtl->info_lck);
+	static bool already_done = false;
 
-	return LocalPromoteIsTriggered;
+	if (already_done)
+		return;
+	before_shmem_exit(do_pg_abort_backup, DatumGetBool(true));
+	already_done = true;
 }
 
-static void
-SetPromoteIsTriggered(void)
+/*
+ * Get latest WAL insert pointer
+ */
+XLogRecPtr
+GetXLogInsertRecPtr(void)
 {
-	SpinLockAcquire(&XLogCtl->info_lck);
-	XLogCtl->SharedPromoteIsTriggered = true;
-	SpinLockRelease(&XLogCtl->info_lck);
+	XLogCtlInsert *Insert = &XLogCtl->Insert;
+	uint64		current_bytepos;
 
-	/*
-	 * Mark the recovery pause state as 'not paused' because the paused state
-	 * ends and promotion continues if a promotion is triggered while recovery
-	 * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
-	 * return 'paused' while a promotion is ongoing.
-	 */
-	SetRecoveryPause(false);
+	SpinLockAcquire(&Insert->insertpos_lck);
+	current_bytepos = Insert->CurrBytePos;
+	SpinLockRelease(&Insert->insertpos_lck);
 
-	LocalPromoteIsTriggered = true;
+	return XLogBytePosToRecPtr(current_bytepos);
 }
 
 /*
- * Check to see whether the user-specified trigger file exists and whether a
- * promote request has arrived.  If either condition holds, return true.
+ * Get latest WAL write pointer
  */
-static bool
-CheckForStandbyTrigger(void)
+XLogRecPtr
+GetXLogWriteRecPtr(void)
 {
-	struct stat stat_buf;
-
-	if (LocalPromoteIsTriggered)
-		return true;
-
-	if (IsPromoteSignaled() && CheckPromoteSignal())
-	{
-		ereport(LOG, (errmsg("received promote request")));
-		RemovePromoteSignalFiles();
-		ResetPromoteSignaled();
-		SetPromoteIsTriggered();
-		return true;
-	}
-
-	if (PromoteTriggerFile == NULL || strcmp(PromoteTriggerFile, "") == 0)
-		return false;
-
-	if (stat(PromoteTriggerFile, &stat_buf) == 0)
-	{
-		ereport(LOG,
-				(errmsg("promote trigger file found: %s", PromoteTriggerFile)));
-		unlink(PromoteTriggerFile);
-		SetPromoteIsTriggered();
-		return true;
-	}
-	else if (errno != ENOENT)
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not stat promote trigger file \"%s\": %m",
-						PromoteTriggerFile)));
+	SpinLockAcquire(&XLogCtl->info_lck);
+	LogwrtResult = XLogCtl->LogwrtResult;
+	SpinLockRelease(&XLogCtl->info_lck);
 
-	return false;
+	return LogwrtResult.Write;
 }
 
 /*
- * Remove the files signaling a standby promotion request.
+ * Returns the redo pointer of the last checkpoint or restartpoint. This is
+ * the oldest point in WAL that we still need, if we have to restart recovery.
  */
 void
-RemovePromoteSignalFiles(void)
+GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
 {
-	unlink(PROMOTE_SIGNAL_FILE);
+	LWLockAcquire(ControlFileLock, LW_SHARED);
+	*oldrecptr = ControlFile->checkPointCopy.redo;
+	*oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
+	LWLockRelease(ControlFileLock);
 }
 
 /*
- * Check to see if a promote request has arrived.
+ * BackupInProgress: check if online backup mode is active
+ *
+ * This is done by checking for existence of the "backup_label" file.
  */
 bool
-CheckPromoteSignal(void)
+BackupInProgress(void)
 {
 	struct stat stat_buf;
 
-	if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
-		return true;
-
-	return false;
+	return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
 }
 
 /*
- * Wake up startup process to replay newly arrived WAL, or to notice that
- * failover has been requested.
+ * CancelBackup: rename the "backup_label" and "tablespace_map"
+ *				 files to cancel backup mode
+ *
+ * If the "backup_label" file exists, it will be renamed to "backup_label.old".
+ * Similarly, if the "tablespace_map" file exists, it will be renamed to
+ * "tablespace_map.old".
+ *
+ * Note that this will render an online backup in progress
+ * useless. To correctly finish an online backup, pg_stop_backup must be
+ * called.
  */
 void
-WakeupRecovery(void)
+CancelBackup(void)
 {
-	SetLatch(&XLogCtl->recoveryWakeupLatch);
+	struct stat stat_buf;
+
+	/* if the backup_label file is not there, return */
+	if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
+		return;
+
+	/* remove leftover file from previously canceled backup if it exists */
+	unlink(BACKUP_LABEL_OLD);
+
+	if (durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, DEBUG1) != 0)
+	{
+		ereport(WARNING,
+				(errcode_for_file_access(),
+				 errmsg("online backup mode was not canceled"),
+				 errdetail("File \"%s\" could not be renamed to \"%s\": %m.",
+						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
+		return;
+	}
+
+	/* if the tablespace_map file is not there, return */
+	if (stat(TABLESPACE_MAP, &stat_buf) < 0)
+	{
+		ereport(LOG,
+				(errmsg("online backup mode canceled"),
+				 errdetail("File \"%s\" was renamed to \"%s\".",
+						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
+		return;
+	}
+
+	/* remove leftover file from previously canceled backup if it exists */
+	unlink(TABLESPACE_MAP_OLD);
+
+	if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
+	{
+		ereport(LOG,
+				(errmsg("online backup mode canceled"),
+				 errdetail("Files \"%s\" and \"%s\" were renamed to "
+						   "\"%s\" and \"%s\", respectively.",
+						   BACKUP_LABEL_FILE, TABLESPACE_MAP,
+						   BACKUP_LABEL_OLD, TABLESPACE_MAP_OLD)));
+	}
+	else
+	{
+		ereport(WARNING,
+				(errcode_for_file_access(),
+				 errmsg("online backup mode canceled"),
+				 errdetail("File \"%s\" was renamed to \"%s\", but "
+						   "file \"%s\" could not be renamed to \"%s\": %m.",
+						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD,
+						   TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
+	}
 }
 
 /*
@@ -12921,12 +8928,3 @@ SetWalWriterSleeping(bool sleeping)
 	XLogCtl->WalWriterSleeping = sleeping;
 	SpinLockRelease(&XLogCtl->info_lck);
 }
-
-/*
- * Schedule a walreceiver wakeup in the main recovery loop.
- */
-void
-XLogRequestWalReceiverReply(void)
-{
-	doRequestWalReceiverReply = true;
-}
diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c
index b98deb72ec6..ce380b355e0 100644
--- a/src/backend/access/transam/xlogfuncs.c
+++ b/src/backend/access/transam/xlogfuncs.c
@@ -19,8 +19,8 @@
 #include <unistd.h>
 
 #include "access/htup_details.h"
-#include "access/xlog.h"
 #include "access/xlog_internal.h"
+#include "access/xlogrecovery.h"
 #include "access/xlogutils.h"
 #include "catalog/pg_type.h"
 #include "funcapi.h"
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
new file mode 100644
index 00000000000..726ab4d55a5
--- /dev/null
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -0,0 +1,4422 @@
+/*-------------------------------------------------------------------------
+ *
+ * xlogrecovery.c
+ *		Functions for WAL recovery, standby mode
+ *
+ *
+ * - Functions for figuring out whether recovery is needed
+ * - whether it's crash or archive recovery
+ * - where to start
+ * - reading backup label
+ * - handling recovery target
+ * - main redo loop
+ * - figuring out when we're consistent
+ * - standby mode, start wal receiver, restore from WAL archive
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/access/transam/xlogrecovery.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <ctype.h>
+#include <math.h>
+#include <time.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "access/timeline.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "access/xlog_internal.h"
+#include "access/xlogarchive.h"
+#include "access/xlogreader.h"
+#include "access/xlogrecovery.h"
+#include "access/xlogutils.h"
+#include "catalog/pg_control.h"
+#include "commands/tablespace.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/bgwriter.h"
+#include "postmaster/startup.h"
+#include "replication/basebackup.h"
+#include "replication/walreceiver.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/pmsignal.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/spin.h"
+#include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/ps_status.h"
+#include "utils/pg_rusage.h"
+
+/* Unsupported old recovery command file names (relative to $PGDATA) */
+#define RECOVERY_COMMAND_FILE	"recovery.conf"
+#define RECOVERY_COMMAND_DONE	"recovery.done"
+
+/* options formerly taken from recovery.conf for archive recovery */
+char	   *recoveryRestoreCommand = NULL;
+char	   *recoveryEndCommand = NULL;
+char	   *archiveCleanupCommand = NULL;
+RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
+bool		recoveryTargetInclusive = true;
+int			recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
+TransactionId recoveryTargetXid;
+char	   *recovery_target_time_string;
+TimestampTz recoveryTargetTime;
+const char *recoveryTargetName;
+XLogRecPtr	recoveryTargetLSN;
+int			recovery_min_apply_delay = 0;
+
+/* options formerly taken from recovery.conf for XLOG streaming */
+char	   *PrimaryConnInfo = NULL;
+char	   *PrimarySlotName = NULL;
+char	   *PromoteTriggerFile = NULL;
+bool		wal_receiver_create_temp_slot = false;
+
+/*
+ * GUC support
+ */
+const struct config_enum_entry recovery_target_action_options[] = {
+	{"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
+	{"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
+	{"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
+	{NULL, 0, false}
+};
+
+/*
+ * During normal operation, the only timeline we care about is ThisTimeLineID.
+ * During recovery, however, things are more complicated.  To simplify life
+ * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
+ * scan through the WAL history (that is, it is the line that was active when
+ * the currently-scanned WAL record was generated).  We also need these
+ * timeline values:
+ *
+ * recoveryTargetTimeLineGoal: what the user requested, if any
+ *
+ * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
+ *
+ * recoveryTargetTLI: the currently understood target timeline; changes
+ *
+ * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
+ * the timelines of its known parents, newest first (so recoveryTargetTLI is
+ * always the first list member).  Only these TLIs are expected to be seen in
+ * the WAL segments we read, and indeed only these TLIs will be considered as
+ * candidate WAL files to open at all.
+ *
+ * curFileTLI: the TLI appearing in the name of the current input WAL file.
+ * (This is not necessarily the same as ThisTimeLineID, because we could
+ * be scanning data that was copied from an ancestor timeline when the current
+ * file was created.)  During a sequential scan we do not allow this value
+ * to decrease.
+ */
+RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
+TimeLineID	recoveryTargetTLIRequested = 0;
+TimeLineID	recoveryTargetTLI = 0;
+static List *expectedTLEs;
+static TimeLineID curFileTLI;
+
+/*
+ * When ArchiveRecoveryRequested is set, archive recovery was requested,
+ * ie. signal files were present. When InArchiveRecovery is set, we are
+ * currently recovering using offline XLOG archives. These variables are only
+ * valid in the startup process.
+ *
+ * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
+ * currently performing crash recovery using only XLOG files in pg_wal, but
+ * will switch to using offline XLOG archives as soon as we reach the end of
+ * WAL in pg_wal.
+*/
+bool		ArchiveRecoveryRequested = false;
+bool		InArchiveRecovery = false;
+
+/* option set locally in startup process only when signal files exist */
+static bool StandbyModeRequested = false;
+/* are we currently in standby mode? */
+bool		StandbyMode = false;
+
+/* was a signal file present at startup? */
+static bool standby_signal_file_found = false;
+static bool recovery_signal_file_found = false;
+
+/*
+ * RedoStartLSN points to the checkpoint's REDO location which is specified
+ * in a backup label file, backup history file or control file.  In standby
+ * mode, XLOG streaming usually starts from the position where an invalid
+ * record was found.  But if we fail to read even the initial checkpoint
+ * record, we use the REDO location instead of the checkpoint location as
+ * the start position of XLOG streaming.  Otherwise we would have to jump
+ * backwards to the REDO location after reading the checkpoint record,
+ * because the REDO record can precede the checkpoint record.
+ */
+static XLogRecPtr CheckPointLoc = InvalidXLogRecPtr;
+static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
+static TimeLineID RedoStartTLI = 0;
+
+/* have we launched bgwriter during recovery? */
+static bool bgwriterLaunched = false;
+
+/*
+ * Have we reached a consistent database state? In crash recovery, we have
+ * to replay all the WAL, so reachedConsistency is never set. During archive
+ * recovery, the database is consistent once minRecoveryPoint is reached.
+ *
+ * Consistent state means that the system is internally consistent, all
+ * the WAL has been replayed up to a certain point, and importantly, there
+ * is no trace of later actions on disk.
+ */
+bool		reachedConsistency = false;
+
+/*
+ * Local copy of SharedHotStandbyActive variable. False actually means "not
+ * known, need to check the shared state".
+ */
+static bool LocalHotStandbyActive = false;
+
+/*
+ * Local copy of SharedPromoteIsTriggered variable. False actually means "not
+ * known, need to check the shared state".
+ */
+static bool LocalPromoteIsTriggered = false;
+
+/* Has the recovery code requested a walreceiver wakeup? */
+static bool doRequestWalReceiverReply;
+
+/* XLogReader object used to parse the WAL records */
+static XLogReaderState *xlogreader = NULL;
+
+/* Parameters passed down from ReadRecord to the XLogPageRead callback. */
+typedef struct XLogPageReadPrivate
+{
+	int			emode;
+	bool		fetching_ckpt;	/* are we fetching a checkpoint record? */
+	bool		randAccess;
+} XLogPageReadPrivate;
+
+/* flag to tell XLogPageRead that we have started replaying */
+static bool InRedo = false;
+
+/*
+ * Codes indicating where we got a WAL file from during recovery, or where
+ * to attempt to get one.
+ */
+typedef enum
+{
+	XLOG_FROM_ANY = 0,			/* request to read WAL from any source */
+	XLOG_FROM_ARCHIVE,			/* restored using restore_command */
+	XLOG_FROM_PG_WAL,			/* existing file in pg_wal */
+	XLOG_FROM_STREAM			/* streamed from primary */
+} XLogSource;
+
+/* human-readable names for XLogSources, for debugging output */
+static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
+
+/*
+ * readFile is -1 or a kernel FD for the log file segment that's currently
+ * open for reading.  readSegNo identifies the segment.  readOff is the offset
+ * of the page just read, readLen indicates how much of it has been read into
+ * readBuf, and readSource indicates where we got the currently open file from.
+ *
+ * Note: we could use Reserve/ReleaseExternalFD to track consumption of this
+ * FD too (like for openLogFile in xlog.c); but it doesn't currently seem
+ * worthwhile, since the XLOG is not read by general-purpose sessions.
+ */
+static int	readFile = -1;
+static XLogSegNo readSegNo = 0;
+static uint32 readOff = 0;
+static uint32 readLen = 0;
+static XLogSource readSource = XLOG_FROM_ANY;
+
+/*
+ * Keeps track of which source we're currently reading from. This is
+ * different from readSource in that this is always set, even when we don't
+ * currently have a WAL file open. If lastSourceFailed is set, our last
+ * attempt to read from currentSource failed, and we should try another source
+ * next.
+ *
+ * pendingWalRcvRestart is set when a config change occurs that requires a
+ * walreceiver restart.  This is only valid in XLOG_FROM_STREAM state.
+ */
+static XLogSource currentSource = XLOG_FROM_ANY;
+static bool lastSourceFailed = false;
+static bool pendingWalRcvRestart = false;
+
+/*
+ * These variables track when we last obtained some WAL data to process,
+ * and where we got it from.  (XLogReceiptSource is initially the same as
+ * readSource, but readSource gets reset to zero when we don't have data
+ * to process right now.  It is also different from currentSource, which
+ * also changes when we try to read from a source and fail, while
+ * XLogReceiptSource tracks where we last successfully read some WAL.)
+ */
+static TimestampTz XLogReceiptTime = 0;
+static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
+
+/* Local copy of WalRcv->flushedUpto */
+static XLogRecPtr flushedUpto = 0;
+static TimeLineID receiveTLI = 0;
+
+/*
+ * Copy of minRecoveryPoint and backupEndPoint from the control file.
+ *
+ * In order to reach consistency, we must replay the WAL up to
+ * minRecoveryPoint.  If backupEndRequired is true, we must also reach
+ * backupEndPoint, or if it's invalid, an end-of-backup record corresponding
+ * to backupStartPoint.
+ *
+ * Note: In archive recovery, after consistency has been reached, the
+ * functions in xlog.c will start updating minRecoveryPoint in the control
+ * file.  But this copy of minRecoveryPoint variable reflects the value at the
+ * beginning of recovery, and is *not* updated after consistency is reached.
+ */
+static XLogRecPtr minRecoveryPoint;
+static TimeLineID minRecoveryPointTLI;
+
+static XLogRecPtr backupStartPoint;
+static XLogRecPtr backupEndPoint;
+static bool backupEndRequired = false;
+
+/* Buffers dedicated to consistency checks of size BLCKSZ */
+static char *replay_image_masked = NULL;
+static char *primary_image_masked = NULL;
+
+
+/*
+ * Shared-memory state for WAL recovery.
+ */
+typedef struct XLogRecoveryCtlData
+{
+	/*
+	 * SharedHotStandbyActive indicates if we allow hot standby queries to be
+	 * run.  Protected by info_lck.
+	 */
+	bool		SharedHotStandbyActive;
+
+	/*
+	 * SharedPromoteIsTriggered indicates if a standby promotion has been
+	 * triggered.  Protected by info_lck.
+	 */
+	bool		SharedPromoteIsTriggered;
+
+	/*
+	 * recoveryWakeupLatch is used to wake up the startup process to continue
+	 * WAL replay, if it is waiting for WAL to arrive or failover trigger file
+	 * to appear.
+	 *
+	 * Note that the startup process also uses another latch, its procLatch,
+	 * to wait for recovery conflict. If we get rid of recoveryWakeupLatch for
+	 * signaling the startup process in favor of using its procLatch, which
+	 * comports better with possible generic signal handlers using that latch.
+	 * But we should not do that because the startup process doesn't assume
+	 * that it's waken up by walreceiver process or SIGHUP signal handler
+	 * while it's waiting for recovery conflict. The separate latches,
+	 * recoveryWakeupLatch and procLatch, should be used for inter-process
+	 * communication for WAL replay and recovery conflict, respectively.
+	 */
+	Latch		recoveryWakeupLatch;
+
+	/*
+	 * lastReplayedEndRecPtr points to end+1 of the last record successfully
+	 * replayed. When we're currently replaying a record, ie. in a redo
+	 * function, replayEndRecPtr points to the end+1 of the record being
+	 * replayed, otherwise it's equal to lastReplayedEndRecPtr.
+	 */
+	XLogRecPtr	lastReplayedEndRecPtr;
+	TimeLineID	lastReplayedTLI;
+	XLogRecPtr	replayEndRecPtr;
+	TimeLineID	replayEndTLI;
+	/* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
+	TimestampTz recoveryLastXTime;
+
+	/*
+	 * timestamp of when we started replaying the current chunk of WAL data,
+	 * only relevant for replication or archive recovery
+	 */
+	TimestampTz currentChunkStartTime;
+	/* Recovery pause state */
+	RecoveryPauseState recoveryPauseState;
+	ConditionVariable recoveryNotPausedCV;
+
+	slock_t		info_lck;		/* locks shared variables shown above */
+} XLogRecoveryCtlData;
+
+static XLogRecoveryCtlData *XLogRecCtl = NULL;
+
+/* start position of the last replayed record */
+static XLogRecPtr LastReplayedReadRecPtr;
+
+/*
+ * if recoveryStopsBefore/After returns true, it saves information of the stop
+ * point here
+ */
+static TransactionId recoveryStopXid;
+static TimestampTz recoveryStopTime;
+static XLogRecPtr recoveryStopLSN;
+static char recoveryStopName[MAXFNAMELEN];
+static bool recoveryStopAfter;
+
+/* prototypes for local functions */
+static void xlog_block_info(StringInfo buf, XLogReaderState *record);
+
+static void readRecoverySignalFile(void);
+static void validateRecoveryParameters(void);
+static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
+								TimeLineID prevTLI);
+static void checkXLogConsistency(XLogReaderState *record);
+
+static void rm_redo_error_callback(void *arg);
+
+static bool getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime);
+static bool recoveryStopsBefore(XLogReaderState *record);
+static bool recoveryStopsAfter(XLogReaderState *record);
+static void recoveryPausesHere(bool endOfRecovery);
+static bool recoveryApplyDelay(XLogReaderState *record);
+
+static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
+										bool fetching_ckpt, XLogRecPtr tliRecPtr);
+static int	emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
+static void SetCurrentChunkStartTime(TimestampTz xtime);
+static void SetLatestXTime(TimestampTz xtime);
+static bool rescanLatestTimeLine(void);
+
+static bool read_backup_label(XLogRecPtr *checkPointLoc,
+							  bool *backupEndRequired, bool *backupFromStandby);
+static bool read_tablespace_map(List **tablespaces);
+static char *getRecoveryStopReason(void);
+
+static void ConfirmRecoveryPaused(void);
+
+static void CheckRecoveryConsistency(void);
+static bool CheckForStandbyTrigger(void);
+
+static void SetPromoteIsTriggered(void);
+
+static bool HotStandbyActiveInReplay(void);
+
+static XLogRecord *ReadRecord(XLogReaderState *xlogreader,
+							  int emode, bool fetching_ckpt);
+
+static int	XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
+						 XLogSource source, bool notfoundOk);
+static int	XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source);
+static int	XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
+						 int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
+
+/*
+ * Initialization of shared memory for WAL recovery
+ */
+Size
+XLogRecoveryShmemSize(void)
+{
+	Size		size;
+
+	/* XLogRecCtl */
+	size = sizeof(XLogRecoveryCtlData);
+
+	return size;
+}
+
+void
+XLogRecoveryShmemInit(void)
+{
+	bool		found;
+
+	XLogRecCtl = (XLogRecoveryCtlData *)
+		ShmemInitStruct("XLOG Recovery Ctl", XLogRecoveryShmemSize(), &found);
+	if (found)
+		return;
+	memset(XLogRecCtl, 0, sizeof(XLogRecoveryCtlData));
+
+	SpinLockInit(&XLogRecCtl->info_lck);
+	InitSharedLatch(&XLogRecCtl->recoveryWakeupLatch);
+	ConditionVariableInit(&XLogRecCtl->recoveryNotPausedCV);
+}
+
+/*
+ * Prepare the system for WAL recovery, if needed.
+ *
+ * This is called by StartupXLOG() which coordinates the server startup
+ * sequence.  This function analyzes the control file and the backup label
+ * file, if any, and figures out whether we need to perform crash recovery or
+ * archive recovery, and how far we need to replay the WAL to reach a
+ * consistent state.
+ *
+ * This doesn't yet change the on-disk state, except for creating the symlinks
+ * from table space map file if any, and for fetching WAL files needed to find
+ * the checkpoint record.  On entry, the caller has already read the control
+ * file into memory, and passes it as argument.  This function updates it to
+ * reflect the recovery state, and the caller is expected to write it back to
+ * disk does after initializing other subsystems, but before calling
+ * PerformWalRecovery().
+ *
+ * This initializes some global variables like ArchiveModeRequested, and
+ * StandbyModeRequested and InRecovery.
+ */
+void
+InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
+				bool *haveBackupLabel_ptr, bool *haveTblspcMap_ptr)
+{
+	XLogPageReadPrivate *private;
+	struct stat st;
+	bool		wasShutdown;
+	XLogRecord *record;
+	DBState		dbstate_at_startup;
+	bool		haveTblspcMap = false;
+	bool		haveBackupLabel = false;
+	CheckPoint	checkPoint;
+	bool backupFromStandby = false;
+
+	dbstate_at_startup = ControlFile->state;
+
+	/*
+	 * Initialize on the assumption we want to recover to the latest timeline
+	 * that's active according to pg_control.
+	 */
+	if (ControlFile->minRecoveryPointTLI >
+		ControlFile->checkPointCopy.ThisTimeLineID)
+		recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
+	else
+		recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
+
+	/*
+	 * Check for signal files, and if so set up state for offline recovery
+	 */
+	readRecoverySignalFile();
+	validateRecoveryParameters();
+
+	if (ArchiveRecoveryRequested)
+	{
+		if (StandbyModeRequested)
+			ereport(LOG,
+					(errmsg("entering standby mode")));
+		else if (recoveryTarget == RECOVERY_TARGET_XID)
+			ereport(LOG,
+					(errmsg("starting point-in-time recovery to XID %u",
+							recoveryTargetXid)));
+		else if (recoveryTarget == RECOVERY_TARGET_TIME)
+			ereport(LOG,
+					(errmsg("starting point-in-time recovery to %s",
+							timestamptz_to_str(recoveryTargetTime))));
+		else if (recoveryTarget == RECOVERY_TARGET_NAME)
+			ereport(LOG,
+					(errmsg("starting point-in-time recovery to \"%s\"",
+							recoveryTargetName)));
+		else if (recoveryTarget == RECOVERY_TARGET_LSN)
+			ereport(LOG,
+					(errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
+							LSN_FORMAT_ARGS(recoveryTargetLSN))));
+		else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
+			ereport(LOG,
+					(errmsg("starting point-in-time recovery to earliest consistent point")));
+		else
+			ereport(LOG,
+					(errmsg("starting archive recovery")));
+	}
+
+	/*
+	 * Take ownership of the wakeup latch if we're going to sleep during
+	 * recovery.
+	 */
+	if (ArchiveRecoveryRequested)
+		OwnLatch(&XLogRecCtl->recoveryWakeupLatch);
+
+	private = palloc0(sizeof(XLogPageReadPrivate));
+	xlogreader =
+		XLogReaderAllocate(wal_segment_size, NULL,
+						   XL_ROUTINE(.page_read = &XLogPageRead,
+									  .segment_open = NULL,
+									  .segment_close = wal_segment_close),
+						   private);
+	if (!xlogreader)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory"),
+				 errdetail("Failed while allocating a WAL reading processor.")));
+	xlogreader->system_identifier = ControlFile->system_identifier;
+
+	/*
+	 * Allocate two page buffers dedicated to WAL consistency checks.  We do
+	 * it this way, rather than just making static arrays, for two reasons:
+	 * (1) no need to waste the storage in most instantiations of the backend;
+	 * (2) a static char array isn't guaranteed to have any particular
+	 * alignment, whereas palloc() will provide MAXALIGN'd storage.
+	 */
+	replay_image_masked = (char *) palloc(BLCKSZ);
+	primary_image_masked = (char *) palloc(BLCKSZ);
+
+	if (read_backup_label(&CheckPointLoc, &backupEndRequired,
+						  &backupFromStandby))
+	{
+		List	   *tablespaces = NIL;
+
+		/*
+		 * Archive recovery was requested, and thanks to the backup label
+		 * file, we know how far we need to replay to reach consistency. Enter
+		 * archive recovery directly.
+		 */
+		InArchiveRecovery = true;
+		if (StandbyModeRequested)
+			StandbyMode = true;
+
+		/*
+		 * When a backup_label file is present, we want to roll forward from
+		 * the checkpoint it identifies, rather than using pg_control.
+		 */
+		record = ReadCheckpointRecord(CheckPointLoc, 0, true);
+		if (record != NULL)
+		{
+			memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
+			wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
+			ereport(DEBUG1,
+					(errmsg_internal("checkpoint record is at %X/%X",
+									 LSN_FORMAT_ARGS(CheckPointLoc))));
+			InRecovery = true;	/* force recovery even if SHUTDOWNED */
+
+			/*
+			 * Make sure that REDO location exists. This may not be the case
+			 * if there was a crash during an online backup, which left a
+			 * backup_label around that references a WAL segment that's
+			 * already been archived.
+			 */
+			if (checkPoint.redo < CheckPointLoc)
+			{
+				XLogBeginRead(xlogreader, checkPoint.redo);
+				if (!ReadRecord(xlogreader, LOG, false))
+					ereport(FATAL,
+							(errmsg("could not find redo location referenced by checkpoint record"),
+							 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
+									 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
+									 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
+									 DataDir, DataDir, DataDir)));
+			}
+		}
+		else
+		{
+			ereport(FATAL,
+					(errmsg("could not locate required checkpoint record"),
+					 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
+							 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
+							 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
+							 DataDir, DataDir, DataDir)));
+			wasShutdown = false;	/* keep compiler quiet */
+		}
+
+		/* Read the tablespace_map file if present and create symlinks. */
+		if (read_tablespace_map(&tablespaces))
+		{
+			ListCell   *lc;
+
+			foreach(lc, tablespaces)
+			{
+				tablespaceinfo *ti = lfirst(lc);
+				char	   *linkloc;
+
+				linkloc = psprintf("pg_tblspc/%s", ti->oid);
+
+				/*
+				 * Remove the existing symlink if any and Create the symlink
+				 * under PGDATA.
+				 */
+				remove_tablespace_symlink(linkloc);
+
+				if (symlink(ti->path, linkloc) < 0)
+					ereport(ERROR,
+							(errcode_for_file_access(),
+							 errmsg("could not create symbolic link \"%s\": %m",
+									linkloc)));
+
+				pfree(ti->oid);
+				pfree(ti->path);
+				pfree(ti);
+			}
+
+			/* tell the caller to delete it later */
+			haveTblspcMap = true;
+		}
+
+		/* tell the caller to delete it later */
+		haveBackupLabel = true;
+	}
+	else
+	{
+		/*
+		 * If tablespace_map file is present without backup_label file, there
+		 * is no use of such file.  There is no harm in retaining it, but it
+		 * is better to get rid of the map file so that we don't have any
+		 * redundant file in data directory and it will avoid any sort of
+		 * confusion.  It seems prudent though to just rename the file out of
+		 * the way rather than delete it completely, also we ignore any error
+		 * that occurs in rename operation as even if map file is present
+		 * without backup_label file, it is harmless.
+		 */
+		if (stat(TABLESPACE_MAP, &st) == 0)
+		{
+			unlink(TABLESPACE_MAP_OLD);
+			if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
+				ereport(LOG,
+						(errmsg("ignoring file \"%s\" because no file \"%s\" exists",
+								TABLESPACE_MAP, BACKUP_LABEL_FILE),
+						 errdetail("File \"%s\" was renamed to \"%s\".",
+								   TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
+			else
+				ereport(LOG,
+						(errmsg("ignoring file \"%s\" because no file \"%s\" exists",
+								TABLESPACE_MAP, BACKUP_LABEL_FILE),
+						 errdetail("Could not rename file \"%s\" to \"%s\": %m.",
+								   TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
+		}
+
+		/*
+		 * It's possible that archive recovery was requested, but we don't
+		 * know how far we need to replay the WAL before we reach consistency.
+		 * This can happen for example if a base backup is taken from a
+		 * running server using an atomic filesystem snapshot, without calling
+		 * pg_start/stop_backup. Or if you just kill a running primary server
+		 * and put it into archive recovery by creating a recovery signal
+		 * file.
+		 *
+		 * Our strategy in that case is to perform crash recovery first,
+		 * replaying all the WAL present in pg_wal, and only enter archive
+		 * recovery after that.
+		 *
+		 * But usually we already know how far we need to replay the WAL (up
+		 * to minRecoveryPoint, up to backupEndPoint, or until we see an
+		 * end-of-backup record), and we can enter archive recovery directly.
+		 */
+		if (ArchiveRecoveryRequested &&
+			(ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
+			 ControlFile->backupEndRequired ||
+			 ControlFile->backupEndPoint != InvalidXLogRecPtr ||
+			 ControlFile->state == DB_SHUTDOWNED))
+		{
+			InArchiveRecovery = true;
+			if (StandbyModeRequested)
+				StandbyMode = true;
+		}
+
+		/* Get the last valid checkpoint record. */
+		CheckPointLoc = ControlFile->checkPoint;
+		RedoStartLSN = ControlFile->checkPointCopy.redo;
+		RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID;
+		record = ReadCheckpointRecord(CheckPointLoc, 1, true);
+		if (record != NULL)
+		{
+			ereport(DEBUG1,
+					(errmsg_internal("checkpoint record is at %X/%X",
+									 LSN_FORMAT_ARGS(CheckPointLoc))));
+		}
+		else
+		{
+			/*
+			 * We used to attempt to go back to a secondary checkpoint record
+			 * here, but only when not in standby mode. We now just fail if we
+			 * can't read the last checkpoint because this allows us to
+			 * simplify processing around checkpoints.
+			 */
+			ereport(PANIC,
+					(errmsg("could not locate a valid checkpoint record")));
+		}
+		memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
+		wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
+	}
+
+	/*
+	 * If the location of the checkpoint record is not on the expected
+	 * timeline in the history of the requested timeline, we cannot proceed:
+	 * the backup is not part of the history of the requested timeline.
+	 */
+	Assert(expectedTLEs);		/* was initialized by reading checkpoint
+								 * record */
+	if (tliOfPointInHistory(CheckPointLoc, expectedTLEs) !=
+		checkPoint.ThisTimeLineID)
+	{
+		XLogRecPtr	switchpoint;
+
+		/*
+		 * tliSwitchPoint will throw an error if the checkpoint's timeline is
+		 * not in expectedTLEs at all.
+		 */
+		switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
+		ereport(FATAL,
+				(errmsg("requested timeline %u is not a child of this server's history",
+						recoveryTargetTLI),
+				 errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
+						   LSN_FORMAT_ARGS(ControlFile->checkPoint),
+						   ControlFile->checkPointCopy.ThisTimeLineID,
+						   LSN_FORMAT_ARGS(switchpoint))));
+	}
+
+	/*
+	 * The min recovery point should be part of the requested timeline's
+	 * history, too.
+	 */
+	if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
+		tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
+		ControlFile->minRecoveryPointTLI)
+		ereport(FATAL,
+				(errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
+						recoveryTargetTLI,
+						LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
+						ControlFile->minRecoveryPointTLI)));
+
+	ereport(DEBUG1,
+			(errmsg_internal("redo record is at %X/%X; shutdown %s",
+							 LSN_FORMAT_ARGS(checkPoint.redo),
+							 wasShutdown ? "true" : "false")));
+	ereport(DEBUG1,
+			(errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
+							 U64FromFullTransactionId(checkPoint.nextXid),
+							 checkPoint.nextOid)));
+	ereport(DEBUG1,
+			(errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
+							 checkPoint.nextMulti, checkPoint.nextMultiOffset)));
+	ereport(DEBUG1,
+			(errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
+							 checkPoint.oldestXid, checkPoint.oldestXidDB)));
+	ereport(DEBUG1,
+			(errmsg_internal("oldest MultiXactId: %u, in database %u",
+							 checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
+	ereport(DEBUG1,
+			(errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
+							 checkPoint.oldestCommitTsXid,
+							 checkPoint.newestCommitTsXid)));
+	if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
+		ereport(PANIC,
+				(errmsg("invalid next transaction ID")));
+
+	/* sanity check */
+	if (checkPoint.redo > CheckPointLoc)
+		ereport(PANIC,
+				(errmsg("invalid redo in checkpoint record")));
+
+	/*
+	 * Check whether we need to force recovery from WAL.  If it appears to
+	 * have been a clean shutdown and we did not have a recovery signal file,
+	 * then assume no recovery needed.
+	 */
+	if (checkPoint.redo < CheckPointLoc)
+	{
+		if (wasShutdown)
+			ereport(PANIC,
+					(errmsg("invalid redo record in shutdown checkpoint")));
+		InRecovery = true;
+	}
+	else if (ControlFile->state != DB_SHUTDOWNED)
+		InRecovery = true;
+	else if (ArchiveRecoveryRequested)
+	{
+		/* force recovery due to presence of recovery signal file */
+		InRecovery = true;
+	}
+
+	/*
+	 * Update pg_control to show that we are recovering and to show the
+	 * selected checkpoint as the place we are starting from. We also mark
+	 * pg_control with any minimum recovery stop point obtained from a
+	 * backup history file.
+	 */
+	if (InArchiveRecovery)
+	{
+		ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
+	}
+	else
+	{
+		ereport(LOG,
+				(errmsg("database system was not properly shut down; "
+						"automatic recovery in progress")));
+		if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
+			ereport(LOG,
+					(errmsg("crash recovery starts in timeline %u "
+							"and has target timeline %u",
+							ControlFile->checkPointCopy.ThisTimeLineID,
+							recoveryTargetTLI)));
+		ControlFile->state = DB_IN_CRASH_RECOVERY;
+	}
+	ControlFile->checkPoint = CheckPointLoc;
+	ControlFile->checkPointCopy = checkPoint;
+	if (InArchiveRecovery)
+	{
+		/* initialize minRecoveryPoint if not set yet */
+		if (ControlFile->minRecoveryPoint < checkPoint.redo)
+		{
+			ControlFile->minRecoveryPoint = checkPoint.redo;
+			ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
+		}
+	}
+
+	/*
+	 * Set backupStartPoint if we're starting recovery from a base backup.
+	 *
+	 * Also set backupEndPoint and use minRecoveryPoint as the backup end
+	 * location if we're starting recovery from a base backup which was
+	 * taken from a standby. In this case, the database system status in
+	 * pg_control must indicate that the database was already in recovery.
+	 * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
+	 * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
+	 * before reaching this point; e.g. because restore_command or
+	 * primary_conninfo were faulty.
+	 *
+	 * Any other state indicates that the backup somehow became corrupted
+	 * and we can't sensibly continue with recovery.
+	 */
+	if (haveBackupLabel)
+	{
+		ControlFile->backupStartPoint = checkPoint.redo;
+		ControlFile->backupEndRequired = backupEndRequired;
+
+		if (backupFromStandby)
+		{
+			if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
+				dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
+				ereport(FATAL,
+						(errmsg("backup_label contains data inconsistent with control file"),
+						 errhint("This means that the backup is corrupted and you will "
+								 "have to use another backup for recovery.")));
+			ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
+		}
+	}
+	ControlFile->time = (pg_time_t) time(NULL);
+
+	/* remember these, so that we know when we have reached consistency */
+	backupStartPoint = ControlFile->backupStartPoint;
+	backupEndRequired = ControlFile->backupEndRequired;
+	backupEndPoint = ControlFile->backupEndPoint;
+	if (InArchiveRecovery)
+	{
+		minRecoveryPoint = ControlFile->minRecoveryPoint;
+		minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
+	}
+	else
+	{
+		minRecoveryPoint = InvalidXLogRecPtr;
+		minRecoveryPointTLI = 0;
+	}
+
+	LastReplayedReadRecPtr = CheckPointLoc;
+
+	*wasShutdown_ptr = wasShutdown;
+	*haveBackupLabel_ptr = haveBackupLabel;
+	*haveTblspcMap_ptr = haveTblspcMap;
+}
+
+/*
+ * read_backup_label: check to see if a backup_label file is present
+ *
+ * If we see a backup_label during recovery, we assume that we are recovering
+ * from a backup dump file, and we therefore roll forward from the checkpoint
+ * identified by the label file, NOT what pg_control says.  This avoids the
+ * problem that pg_control might have been archived one or more checkpoints
+ * later than the start of the dump, and so if we rely on it as the start
+ * point, we will fail to restore a consistent database state.
+ *
+ * Returns true if a backup_label was found (and fills the checkpoint
+ * location and its REDO location into *checkPointLoc and RedoStartLSN,
+ * respectively); returns false if not. If this backup_label came from a
+ * streamed backup, *backupEndRequired is set to true. If this backup_label
+ * was created during recovery, *backupFromStandby is set to true.
+ */
+static bool
+read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
+				  bool *backupFromStandby)
+{
+	char		startxlogfilename[MAXFNAMELEN];
+	TimeLineID	tli_from_walseg,
+				tli_from_file;
+	FILE	   *lfp;
+	char		ch;
+	char		backuptype[20];
+	char		backupfrom[20];
+	char		backuplabel[MAXPGPATH];
+	char		backuptime[128];
+	uint32		hi,
+				lo;
+
+	*backupEndRequired = false;
+	*backupFromStandby = false;
+
+	/*
+	 * See if label file is present
+	 */
+	lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
+	if (!lfp)
+	{
+		if (errno != ENOENT)
+			ereport(FATAL,
+					(errcode_for_file_access(),
+					 errmsg("could not read file \"%s\": %m",
+							BACKUP_LABEL_FILE)));
+		return false;			/* it's not there, all is fine */
+	}
+
+	/*
+	 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
+	 * is pretty crude, but we are not expecting any variability in the file
+	 * format).
+	 */
+	if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
+			   &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
+		ereport(FATAL,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
+	RedoStartLSN = ((uint64) hi) << 32 | lo;
+	RedoStartTLI = tli_from_walseg;
+	if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
+			   &hi, &lo, &ch) != 3 || ch != '\n')
+		ereport(FATAL,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
+	*checkPointLoc = ((uint64) hi) << 32 | lo;
+
+	/*
+	 * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
+	 * from an older backup anyway, but since the information on it is not
+	 * strictly required, don't error out if it's missing for some reason.
+	 */
+	if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
+	{
+		if (strcmp(backuptype, "streamed") == 0)
+			*backupEndRequired = true;
+	}
+
+	if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
+	{
+		if (strcmp(backupfrom, "standby") == 0)
+			*backupFromStandby = true;
+	}
+
+	/*
+	 * Parse START TIME and LABEL. Those are not mandatory fields for recovery
+	 * but checking for their presence is useful for debugging and the next
+	 * sanity checks. Cope also with the fact that the result buffers have a
+	 * pre-allocated size, hence if the backup_label file has been generated
+	 * with strings longer than the maximum assumed here an incorrect parsing
+	 * happens. That's fine as only minor consistency checks are done
+	 * afterwards.
+	 */
+	if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
+		ereport(DEBUG1,
+				(errmsg_internal("backup time %s in file \"%s\"",
+								 backuptime, BACKUP_LABEL_FILE)));
+
+	if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
+		ereport(DEBUG1,
+				(errmsg_internal("backup label %s in file \"%s\"",
+								 backuplabel, BACKUP_LABEL_FILE)));
+
+	/*
+	 * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
+	 * it as a sanity check if present.
+	 */
+	if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
+	{
+		if (tli_from_walseg != tli_from_file)
+			ereport(FATAL,
+					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+					 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
+					 errdetail("Timeline ID parsed is %u, but expected %u.",
+							   tli_from_file, tli_from_walseg)));
+
+		ereport(DEBUG1,
+				(errmsg_internal("backup timeline %u in file \"%s\"",
+								 tli_from_file, BACKUP_LABEL_FILE)));
+	}
+
+	if (ferror(lfp) || FreeFile(lfp))
+		ereport(FATAL,
+				(errcode_for_file_access(),
+				 errmsg("could not read file \"%s\": %m",
+						BACKUP_LABEL_FILE)));
+
+	return true;
+}
+
+/*
+ * read_tablespace_map: check to see if a tablespace_map file is present
+ *
+ * If we see a tablespace_map file during recovery, we assume that we are
+ * recovering from a backup dump file, and we therefore need to create symlinks
+ * as per the information present in tablespace_map file.
+ *
+ * Returns true if a tablespace_map file was found (and fills *tablespaces
+ * with a tablespaceinfo struct for each tablespace listed in the file);
+ * returns false if not.
+ */
+static bool
+read_tablespace_map(List **tablespaces)
+{
+	tablespaceinfo *ti;
+	FILE	   *lfp;
+	char		str[MAXPGPATH];
+	int			ch,
+				i,
+				n;
+	bool		was_backslash;
+
+	/*
+	 * See if tablespace_map file is present
+	 */
+	lfp = AllocateFile(TABLESPACE_MAP, "r");
+	if (!lfp)
+	{
+		if (errno != ENOENT)
+			ereport(FATAL,
+					(errcode_for_file_access(),
+					 errmsg("could not read file \"%s\": %m",
+							TABLESPACE_MAP)));
+		return false;			/* it's not there, all is fine */
+	}
+
+	/*
+	 * Read and parse the link name and path lines from tablespace_map file
+	 * (this code is pretty crude, but we are not expecting any variability in
+	 * the file format).  De-escape any backslashes that were inserted.
+	 */
+	i = 0;
+	was_backslash = false;
+	while ((ch = fgetc(lfp)) != EOF)
+	{
+		if (!was_backslash && (ch == '\n' || ch == '\r'))
+		{
+			if (i == 0)
+				continue;		/* \r immediately followed by \n */
+
+			/*
+			 * The de-escaped line should contain an OID followed by exactly
+			 * one space followed by a path.  The path might start with
+			 * spaces, so don't be too liberal about parsing.
+			 */
+			str[i] = '\0';
+			n = 0;
+			while (str[n] && str[n] != ' ')
+				n++;
+			if (n < 1 || n >= i - 1)
+				ereport(FATAL,
+						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+						 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
+			str[n++] = '\0';
+
+			ti = palloc0(sizeof(tablespaceinfo));
+			ti->oid = pstrdup(str);
+			ti->path = pstrdup(str + n);
+			*tablespaces = lappend(*tablespaces, ti);
+
+			i = 0;
+			continue;
+		}
+		else if (!was_backslash && ch == '\\')
+			was_backslash = true;
+		else
+		{
+			if (i < sizeof(str) - 1)
+				str[i++] = ch;
+			was_backslash = false;
+		}
+	}
+
+	if (i != 0 || was_backslash)	/* last line not terminated? */
+		ereport(FATAL,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
+
+	if (ferror(lfp) || FreeFile(lfp))
+		ereport(FATAL,
+				(errcode_for_file_access(),
+				 errmsg("could not read file \"%s\": %m",
+						TABLESPACE_MAP)));
+
+	return true;
+}
+
+/*
+ * Finish WAL recovery.
+ *
+ * Output values:
+ *
+ * LastRec - start position of the last valid or applied record, after which
+ * new WAL can be appended.
+ *
+ * EndOfLog/EndOfLogTLI - end position of the last valid or applied record.
+ *
+ * lastPageBeginPtr - LSN position of the page that contains EndOfLog
+ *
+ * lastPage - copy of the last page, up to EndOfLog
+ *
+ * reason - a short human-readable string describing why recovery was ended
+ *
+ * bgwriterLaunched - set to true if the bgwriter process was launched
+ *
+ * standby/recovery_signal_file_found - set to true if the signal file was found
+ *
+ * Returns the position of the last valid or applied record, after which new
+ * WAL should be appended. **reason is filled with a short human-readable text
+ * describing the recovery stop condition that was reached (the caller writes
+ * it to the timeline history file for informative purposes.)
+ *
+ * *lastPage is a filled with a palloc'd copy of the last partial page, the
+ * one containing EndOfLog.
+ *
+ * This does not close the 'xlogreader' yet, because in some cases the caller
+ * still wants to re-read the last checkpoint record by calling
+ * ReadCheckPointRecord().
+ */
+void
+EndWalRecovery(XLogRecPtr *LastRec, XLogRecPtr *EndOfLog_p, TimeLineID *EndOfLogTLI_p,
+			   XLogRecPtr *lastPageBeginPtr, char **lastPage, char **reason,
+			   bool *bgwriterLaunched_p,
+			   bool *standby_signal_file_found_p,
+			   bool *recovery_signal_file_found_p)
+{
+	XLogRecPtr EndOfLog;
+	TimeLineID EndOfLogTLI;
+
+	/*
+	 * Kill WAL receiver, if it's still running, before we continue to write
+	 * the startup checkpoint record. It will trump over the checkpoint and
+	 * subsequent records if it's still alive when we start writing WAL.
+	 */
+	ShutdownWalRcv();
+
+	/*
+	 * We are now done reading the xlog from stream. Turn off streaming
+	 * recovery to force fetching the files (which would be required at end of
+	 * recovery, e.g., timeline history file) from archive or pg_wal.
+	 *
+	 * Note that standby mode must be turned off after killing WAL receiver,
+	 * i.e., calling ShutdownWalRcv().
+	 */
+	Assert(!WalRcvStreaming());
+	StandbyMode = false;
+
+	/*
+	 * Re-fetch the last valid or last applied record, so we can identify the
+	 * exact endpoint of what we consider the valid portion of WAL.
+	 *
+	 * An important side-effect of this is to load the last page into xlogreader.
+	 * The caller uses it to initialize the WAL for writing.
+	 */
+	XLogBeginRead(xlogreader, LastReplayedReadRecPtr);
+	(void) ReadRecord(xlogreader, PANIC, false);
+	*LastRec = xlogreader->ReadRecPtr;
+	EndOfLog = xlogreader->EndRecPtr;
+
+	/*
+	 * EndOfLogTLI is the TLI in the filename of the XLOG segment containing
+	 * the end-of-log. It could be different from the timeline that EndOfLog
+	 * nominally belongs to, if there was a timeline switch in that segment,
+	 * and we were reading the old WAL from a segment belonging to a higher
+	 * timeline.
+	 */
+	EndOfLogTLI = xlogreader->seg.ws_tli;
+
+	if (ArchiveRecoveryRequested)
+	{
+		/*
+		 * We are no longer in archive recovery state.
+		 *
+		 * We are now done reading the old WAL.  Turn off archive fetching if
+		 * it was active.
+		 */
+		Assert(InArchiveRecovery);
+		InArchiveRecovery = false;
+
+		/*
+		 * If the ending log segment is still open, close it (to avoid problems on
+		 * Windows with trying to rename or delete an open file).
+		 */
+		if (readFile >= 0)
+		{
+			close(readFile);
+			readFile = -1;
+		}
+	}
+
+	/*
+	 * Copy the last partial block to the caller, for initializing the WAL
+	 * buffer for appending new WAL.
+	 */
+	if (EndOfLog % XLOG_BLCKSZ != 0)
+	{
+		char	   *page;
+		int			len;
+		XLogRecPtr	pageBeginPtr;
+
+		pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
+		Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
+
+		/* Copy the valid part of the last block */
+		len = EndOfLog % XLOG_BLCKSZ;
+		page = palloc(len);
+		memcpy(page, xlogreader->readBuf, len);
+
+		*lastPageBeginPtr = pageBeginPtr;
+		*lastPage = page;
+	}
+	else
+	{
+		/* There is no partial block to copy. */
+		*lastPageBeginPtr = EndOfLog;
+		*lastPage = NULL;
+	}
+
+	/*
+	 * Create a comment for the history file to explain why and where
+	 * timeline changed.
+	 */
+	*reason = getRecoveryStopReason();
+
+	*EndOfLog_p = EndOfLog;
+	*EndOfLogTLI_p = EndOfLogTLI;
+
+	*bgwriterLaunched_p = bgwriterLaunched;
+	*standby_signal_file_found_p = standby_signal_file_found;
+	*recovery_signal_file_found_p = recovery_signal_file_found;
+}
+
+/*
+ * Clean up the WAL reader and leftovers from restoring WAL from archive
+ */
+void
+FreeWalRecovery(void)
+{
+	char		recoveryPath[MAXPGPATH];
+
+	/* Shut down xlogreader */
+	if (readFile >= 0)
+	{
+		close(readFile);
+		readFile = -1;
+	}
+	XLogReaderFree(xlogreader);
+
+	if (ArchiveRecoveryRequested)
+	{
+		/*
+		 * Since there might be a partial WAL segment named RECOVERYXLOG, get
+		 * rid of it.
+		 */
+		snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
+		unlink(recoveryPath);	/* ignore any error */
+
+		/* Get rid of any remaining recovered timeline-history file, too */
+		snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
+		unlink(recoveryPath);	/* ignore any error */
+	}
+
+	/*
+	 * We don't need the latch anymore. It's not strictly necessary to disown
+	 * it, but let's do it for the sake of tidiness.
+	 */
+	if (ArchiveRecoveryRequested)
+		DisownLatch(&XLogRecCtl->recoveryWakeupLatch);
+}
+
+/*
+ * Perform WAL recovery.
+ *
+ * If the system was shut down cleanly, this is never called.
+ */
+void
+PerformWalRecovery(void)
+{
+	int			rmid;
+	XLogRecord *record;
+	bool		reachedRecoveryTarget = false;
+
+	/*
+	 * Initialize shared variables for tracking progress of WAL replay, as
+	 * if we had just replayed the record before the REDO location (or the
+	 * checkpoint record itself, if it's a shutdown checkpoint).
+	 */
+	SpinLockAcquire(&XLogRecCtl->info_lck);
+
+	if (RedoStartLSN < CheckPointLoc)
+		XLogRecCtl->lastReplayedEndRecPtr = RedoStartLSN;
+	else
+		XLogRecCtl->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
+	XLogRecCtl->lastReplayedTLI = ThisTimeLineID;
+	XLogRecCtl->replayEndRecPtr = XLogRecCtl->lastReplayedEndRecPtr;
+	XLogRecCtl->replayEndTLI = XLogRecCtl->lastReplayedTLI;
+	XLogRecCtl->recoveryLastXTime = 0;
+	XLogRecCtl->currentChunkStartTime = 0;
+	XLogRecCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
+	SpinLockRelease(&XLogRecCtl->info_lck);
+
+	/* Also ensure XLogReceiptTime has a sane value */
+	XLogReceiptTime = GetCurrentTimestamp();
+
+	/*
+	 * Let postmaster know we've started redo now, so that it can launch
+	 * checkpointer to perform restartpoints.  We don't bother during
+	 * crash recovery as restartpoints can only be performed during
+	 * archive recovery.  And we'd like to keep crash recovery simple, to
+	 * avoid introducing bugs that could affect you when recovering after
+	 * crash.
+	 *
+	 * After this point, we can no longer assume that we're the only
+	 * process in addition to postmaster!  Also, fsync requests are
+	 * subsequently to be handled by the checkpointer, not locally.
+	 */
+	if (ArchiveRecoveryRequested && IsUnderPostmaster)
+	{
+		PublishStartupProcessInformation();
+		EnableSyncRequestForwarding();
+		SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
+		bgwriterLaunched = true;
+	}
+
+	/*
+	 * Allow read-only connections immediately if we're consistent
+	 * already.
+	 */
+	CheckRecoveryConsistency();
+
+	/*
+	 * Find the first record that logically follows the checkpoint --- it
+	 * might physically precede it, though.
+	 */
+	if (RedoStartLSN < CheckPointLoc)
+	{
+		/* back up to find the record */
+		XLogBeginRead(xlogreader, RedoStartLSN);
+		record = ReadRecord(xlogreader, PANIC, false);
+	}
+	else
+	{
+		/* just have to read next record after CheckPoint */
+		record = ReadRecord(xlogreader, LOG, false);
+	}
+
+	if (record != NULL)
+	{
+		ErrorContextCallback errcallback;
+		TimestampTz xtime;
+		PGRUsage	ru0;
+		XLogRecPtr	ReadRecPtr;
+		XLogRecPtr	EndRecPtr;
+
+		pg_rusage_init(&ru0);
+
+		InRedo = true;
+
+		/* Initialize resource managers */
+		for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
+		{
+			if (RmgrTable[rmid].rm_startup != NULL)
+				RmgrTable[rmid].rm_startup();
+		}
+
+		ereport(LOG,
+				(errmsg("redo starts at %X/%X",
+						LSN_FORMAT_ARGS(xlogreader->ReadRecPtr))));
+
+		/*
+		 * main redo apply loop
+		 */
+		do
+		{
+			bool		switchedTLI = false;
+
+			ReadRecPtr = xlogreader->ReadRecPtr;
+			EndRecPtr = xlogreader->EndRecPtr;
+
+#ifdef WAL_DEBUG
+			if (XLOG_DEBUG ||
+				(rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
+				(rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
+			{
+				StringInfoData buf;
+
+				initStringInfo(&buf);
+				appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
+								 LSN_FORMAT_ARGS(ReadRecPtr),
+								 LSN_FORMAT_ARGS(EndRecPtr));
+				xlog_outrec(&buf, xlogreader);
+				appendStringInfoString(&buf, " - ");
+				xlog_outdesc(&buf, xlogreader);
+				elog(LOG, "%s", buf.data);
+				pfree(buf.data);
+			}
+#endif
+
+			/* Handle interrupt signals of startup process */
+			HandleStartupProcInterrupts();
+
+			/*
+			 * Pause WAL replay, if requested by a hot-standby session via
+			 * SetRecoveryPause().
+			 *
+			 * Note that we intentionally don't take the info_lck spinlock
+			 * here.  We might therefore read a slightly stale value of
+			 * the recoveryPause flag, but it can't be very stale (no
+			 * worse than the last spinlock we did acquire).  Since a
+			 * pause request is a pretty asynchronous thing anyway,
+			 * possibly responding to it one WAL record later than we
+			 * otherwise would is a minor issue, so it doesn't seem worth
+			 * adding another spinlock cycle to prevent that.
+			 */
+			if (((volatile XLogRecoveryCtlData *) XLogRecCtl)->recoveryPauseState !=
+				RECOVERY_NOT_PAUSED)
+				recoveryPausesHere(false);
+
+			/*
+			 * Have we reached our recovery target?
+			 */
+			if (recoveryStopsBefore(xlogreader))
+			{
+				reachedRecoveryTarget = true;
+				break;
+			}
+
+			/*
+			 * If we've been asked to lag the primary, wait on latch until
+			 * enough time has passed.
+			 */
+			if (recoveryApplyDelay(xlogreader))
+			{
+				/*
+				 * We test for paused recovery again here. If user sets
+				 * delayed apply, it may be because they expect to pause
+				 * recovery in case of problems, so we must test again
+				 * here otherwise pausing during the delay-wait wouldn't
+				 * work.
+				 */
+				if (((volatile XLogRecoveryCtlData *) XLogRecCtl)->recoveryPauseState !=
+					RECOVERY_NOT_PAUSED)
+					recoveryPausesHere(false);
+			}
+
+			/* Setup error traceback support for ereport() */
+			errcallback.callback = rm_redo_error_callback;
+			errcallback.arg = (void *) xlogreader;
+			errcallback.previous = error_context_stack;
+			error_context_stack = &errcallback;
+
+			/*
+			 * ShmemVariableCache->nextXid must be beyond record's xid.
+			 */
+			AdvanceNextFullTransactionIdPastXid(record->xl_xid);
+
+			/*
+			 * Before replaying this record, check if this record causes
+			 * the current timeline to change. The record is already
+			 * considered to be part of the new timeline, so we update
+			 * ThisTimeLineID before replaying it. That's important so
+			 * that replayEndTLI, which is recorded as the minimum
+			 * recovery point's TLI if recovery stops after this record,
+			 * is set correctly.
+			 */
+			if (record->xl_rmid == RM_XLOG_ID)
+			{
+				TimeLineID	newTLI = ThisTimeLineID;
+				TimeLineID	prevTLI = ThisTimeLineID;
+				uint8		info = record->xl_info & ~XLR_INFO_MASK;
+
+				if (info == XLOG_CHECKPOINT_SHUTDOWN)
+				{
+					CheckPoint	checkPoint;
+
+					memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
+					newTLI = checkPoint.ThisTimeLineID;
+					prevTLI = checkPoint.PrevTimeLineID;
+				}
+				else if (info == XLOG_END_OF_RECOVERY)
+				{
+					xl_end_of_recovery xlrec;
+
+					memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
+					newTLI = xlrec.ThisTimeLineID;
+					prevTLI = xlrec.PrevTimeLineID;
+				}
+
+				if (newTLI != ThisTimeLineID)
+				{
+					/* Check that it's OK to switch to this TLI */
+					checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
+
+					/* Following WAL records should be run with new TLI */
+					ThisTimeLineID = newTLI;
+					switchedTLI = true;
+				}
+			}
+
+			/*
+			 * Update shared replayEndRecPtr before replaying this record,
+			 * so that XLogFlush will update minRecoveryPoint correctly.
+			 */
+			SpinLockAcquire(&XLogRecCtl->info_lck);
+			XLogRecCtl->replayEndRecPtr = EndRecPtr;
+			XLogRecCtl->replayEndTLI = ThisTimeLineID;
+			SpinLockRelease(&XLogRecCtl->info_lck);
+
+			/*
+			 * If we are attempting to enter Hot Standby mode, process
+			 * XIDs we see
+			 */
+			if (standbyState >= STANDBY_INITIALIZED &&
+				TransactionIdIsValid(record->xl_xid))
+				RecordKnownAssignedTransactionIds(record->xl_xid);
+
+			/* Now apply the WAL record itself */
+			RmgrTable[record->xl_rmid].rm_redo(xlogreader);
+
+			/*
+			 * After redo, check whether the backup pages associated with
+			 * the WAL record are consistent with the existing pages. This
+			 * check is done only if consistency check is enabled for this
+			 * record.
+			 */
+			if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
+				checkXLogConsistency(xlogreader);
+
+			/* Pop the error context stack */
+			error_context_stack = errcallback.previous;
+
+			/*
+			 * Update lastReplayedEndRecPtr after this record has been
+			 * successfully replayed.
+			 */
+			SpinLockAcquire(&XLogRecCtl->info_lck);
+			XLogRecCtl->lastReplayedEndRecPtr = EndRecPtr;
+			XLogRecCtl->lastReplayedTLI = ThisTimeLineID;
+			SpinLockRelease(&XLogRecCtl->info_lck);
+
+			/* Also remember its starting position. */
+			LastReplayedReadRecPtr = ReadRecPtr;
+
+			/*
+			 * If rm_redo called XLogRequestWalReceiverReply, then we wake
+			 * up the receiver so that it notices the updated
+			 * lastReplayedEndRecPtr and sends a reply to the primary.
+			 */
+			if (doRequestWalReceiverReply)
+			{
+				doRequestWalReceiverReply = false;
+				WalRcvForceReply();
+			}
+
+			/* Allow read-only connections if we're consistent now */
+			CheckRecoveryConsistency();
+
+			/* Is this a timeline switch? */
+			if (switchedTLI)
+			{
+				/*
+				 * Before we continue on the new timeline, clean up any
+				 * (possibly bogus) future WAL segments on the old
+				 * timeline.
+				 */
+				RemoveNonParentXlogFiles(EndRecPtr, ThisTimeLineID);
+
+				/*
+				 * Wake up any walsenders to notice that we are on a new
+				 * timeline.
+				 */
+				if (AllowCascadeReplication())
+					WalSndWakeup();
+			}
+
+			/* Exit loop if we reached inclusive recovery target */
+			if (recoveryStopsAfter(xlogreader))
+			{
+				reachedRecoveryTarget = true;
+				break;
+			}
+
+			/* Else, try to fetch the next WAL record */
+			record = ReadRecord(xlogreader, LOG, false);
+		} while (record != NULL);
+
+		/*
+		 * end of main redo apply loop
+		 */
+
+		if (reachedRecoveryTarget)
+		{
+			if (!reachedConsistency)
+				ereport(FATAL,
+						(errmsg("requested recovery stop point is before consistent recovery point")));
+
+			/*
+			 * This is the last point where we can restart recovery with a
+			 * new recovery target, if we shutdown and begin again. After
+			 * this, Resource Managers may choose to do permanent
+			 * corrective actions at end of recovery.
+			 */
+			switch (recoveryTargetAction)
+			{
+				case RECOVERY_TARGET_ACTION_SHUTDOWN:
+
+					/*
+					 * exit with special return code to request shutdown
+					 * of postmaster.  Log messages issued from
+					 * postmaster.
+					 */
+					proc_exit(3);
+
+				case RECOVERY_TARGET_ACTION_PAUSE:
+					SetRecoveryPause(true);
+					recoveryPausesHere(true);
+
+					/* drop into promote */
+
+				case RECOVERY_TARGET_ACTION_PROMOTE:
+					break;
+			}
+		}
+
+		/* Allow resource managers to do any required cleanup. */
+		for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
+		{
+			if (RmgrTable[rmid].rm_cleanup != NULL)
+				RmgrTable[rmid].rm_cleanup();
+		}
+
+		ereport(LOG,
+				(errmsg("redo done at %X/%X system usage: %s",
+						LSN_FORMAT_ARGS(ReadRecPtr),
+						pg_rusage_show(&ru0))));
+		xtime = GetLatestXTime();
+		if (xtime)
+			ereport(LOG,
+					(errmsg("last completed transaction was at log time %s",
+							timestamptz_to_str(xtime))));
+
+		InRedo = false;
+	}
+	else
+	{
+		/* there are no WAL records following the checkpoint */
+		ereport(LOG,
+				(errmsg("redo is not required")));
+
+	}
+
+	/*
+	 * This check is intentionally after the above log messages that
+	 * indicate how far recovery went.
+	 */
+	if (ArchiveRecoveryRequested &&
+		recoveryTarget != RECOVERY_TARGET_UNSET &&
+		!reachedRecoveryTarget)
+		ereport(FATAL,
+				(errmsg("recovery ended before configured recovery target was reached")));
+}
+
+/*
+ * Error context callback for errors occurring during rm_redo().
+ */
+static void
+rm_redo_error_callback(void *arg)
+{
+	XLogReaderState *record = (XLogReaderState *) arg;
+	StringInfoData buf;
+
+	initStringInfo(&buf);
+	xlog_outdesc(&buf, record);
+	xlog_block_info(&buf, record);
+
+	/* translator: %s is a WAL record description */
+	errcontext("WAL redo at %X/%X for %s",
+			   LSN_FORMAT_ARGS(record->ReadRecPtr),
+			   buf.data);
+
+	pfree(buf.data);
+}
+
+/*
+ * Returns a string describing an XLogRecord, consisting of its identity
+ * optionally followed by a colon, a space, and a further description.
+ */
+void
+xlog_outdesc(StringInfo buf, XLogReaderState *record)
+{
+	RmgrId		rmid = XLogRecGetRmid(record);
+	uint8		info = XLogRecGetInfo(record);
+	const char *id;
+
+	appendStringInfoString(buf, RmgrTable[rmid].rm_name);
+	appendStringInfoChar(buf, '/');
+
+	id = RmgrTable[rmid].rm_identify(info);
+	if (id == NULL)
+		appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
+	else
+		appendStringInfo(buf, "%s: ", id);
+
+	RmgrTable[rmid].rm_desc(buf, record);
+}
+
+#ifdef WAL_DEBUG
+
+static void
+xlog_outrec(StringInfo buf, XLogReaderState *record)
+{
+	appendStringInfo(buf, "prev %X/%X; xid %u",
+					 LSN_FORMAT_ARGS(XLogRecGetPrev(record)),
+					 XLogRecGetXid(record));
+
+	appendStringInfo(buf, "; len %u",
+					 XLogRecGetDataLen(record));
+
+	xlog_block_info(buf, record);
+}
+#endif							/* WAL_DEBUG */
+
+/*
+ * Returns a string giving information about all the blocks in an
+ * XLogRecord.
+ */
+static void
+xlog_block_info(StringInfo buf, XLogReaderState *record)
+{
+	int			block_id;
+
+	/* decode block references */
+	for (block_id = 0; block_id <= record->max_block_id; block_id++)
+	{
+		RelFileNode rnode;
+		ForkNumber	forknum;
+		BlockNumber blk;
+
+		if (!XLogRecHasBlockRef(record, block_id))
+			continue;
+
+		XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
+		if (forknum != MAIN_FORKNUM)
+			appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u",
+							 block_id,
+							 rnode.spcNode, rnode.dbNode, rnode.relNode,
+							 forknum,
+							 blk);
+		else
+			appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u",
+							 block_id,
+							 rnode.spcNode, rnode.dbNode, rnode.relNode,
+							 blk);
+		if (XLogRecHasBlockImage(record, block_id))
+			appendStringInfoString(buf, " FPW");
+	}
+}
+
+
+/*
+ * Check that it's OK to switch to new timeline during recovery.
+ *
+ * 'lsn' is the address of the shutdown checkpoint record we're about to
+ * replay. (Currently, timeline can only change at a shutdown checkpoint).
+ */
+static void
+checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
+{
+	/* Check that the record agrees on what the current (old) timeline is */
+	if (prevTLI != ThisTimeLineID)
+		ereport(PANIC,
+				(errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
+						prevTLI, ThisTimeLineID)));
+
+	/*
+	 * The new timeline better be in the list of timelines we expect to see,
+	 * according to the timeline history. It should also not decrease.
+	 */
+	if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
+		ereport(PANIC,
+				(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
+						newTLI, ThisTimeLineID)));
+
+	/*
+	 * If we have not yet reached min recovery point, and we're about to
+	 * switch to a timeline greater than the timeline of the min recovery
+	 * point: trouble. After switching to the new timeline, we could not
+	 * possibly visit the min recovery point on the correct timeline anymore.
+	 * This can happen if there is a newer timeline in the archive that
+	 * branched before the timeline the min recovery point is on, and you
+	 * attempt to do PITR to the new timeline.
+	 */
+	if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
+		lsn < minRecoveryPoint &&
+		newTLI > minRecoveryPointTLI)
+		ereport(PANIC,
+				(errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
+						newTLI,
+						LSN_FORMAT_ARGS(minRecoveryPoint),
+						minRecoveryPointTLI)));
+
+	/* Looks good */
+}
+
+
+/*
+ * Extract timestamp from WAL record.
+ *
+ * If the record contains a timestamp, returns true, and saves the timestamp
+ * in *recordXtime. If the record type has no timestamp, returns false.
+ * Currently, only transaction commit/abort records and restore points contain
+ * timestamps.
+ */
+static bool
+getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
+{
+	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+	uint8		xact_info = info & XLOG_XACT_OPMASK;
+	uint8		rmid = XLogRecGetRmid(record);
+
+	if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
+	{
+		*recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
+		return true;
+	}
+	if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
+							   xact_info == XLOG_XACT_COMMIT_PREPARED))
+	{
+		*recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
+		return true;
+	}
+	if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
+							   xact_info == XLOG_XACT_ABORT_PREPARED))
+	{
+		*recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
+		return true;
+	}
+	return false;
+}
+
+/*
+ * For point-in-time recovery, this function decides whether we want to
+ * stop applying the XLOG before the current record.
+ *
+ * Returns true if we are stopping, false otherwise. If stopping, some
+ * information is saved in recoveryStopXid et al for use in annotating the
+ * new timeline's history file.
+ */
+static bool
+recoveryStopsBefore(XLogReaderState *record)
+{
+	bool		stopsHere = false;
+	uint8		xact_info;
+	bool		isCommit;
+	TimestampTz recordXtime = 0;
+	TransactionId recordXid;
+
+	/*
+	 * Ignore recovery target settings when not in archive recovery (meaning
+	 * we are in crash recovery).
+	 */
+	if (!ArchiveRecoveryRequested)
+		return false;
+
+	/* Check if we should stop as soon as reaching consistency */
+	if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
+	{
+		ereport(LOG,
+				(errmsg("recovery stopping after reaching consistency")));
+
+		recoveryStopAfter = false;
+		recoveryStopXid = InvalidTransactionId;
+		recoveryStopLSN = InvalidXLogRecPtr;
+		recoveryStopTime = 0;
+		recoveryStopName[0] = '\0';
+		return true;
+	}
+
+	/* Check if target LSN has been reached */
+	if (recoveryTarget == RECOVERY_TARGET_LSN &&
+		!recoveryTargetInclusive &&
+		record->ReadRecPtr >= recoveryTargetLSN)
+	{
+		recoveryStopAfter = false;
+		recoveryStopXid = InvalidTransactionId;
+		recoveryStopLSN = record->ReadRecPtr;
+		recoveryStopTime = 0;
+		recoveryStopName[0] = '\0';
+		ereport(LOG,
+				(errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
+						LSN_FORMAT_ARGS(recoveryStopLSN))));
+		return true;
+	}
+
+	/* Otherwise we only consider stopping before COMMIT or ABORT records. */
+	if (XLogRecGetRmid(record) != RM_XACT_ID)
+		return false;
+
+	xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
+
+	if (xact_info == XLOG_XACT_COMMIT)
+	{
+		isCommit = true;
+		recordXid = XLogRecGetXid(record);
+	}
+	else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
+	{
+		xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
+		xl_xact_parsed_commit parsed;
+
+		isCommit = true;
+		ParseCommitRecord(XLogRecGetInfo(record),
+						  xlrec,
+						  &parsed);
+		recordXid = parsed.twophase_xid;
+	}
+	else if (xact_info == XLOG_XACT_ABORT)
+	{
+		isCommit = false;
+		recordXid = XLogRecGetXid(record);
+	}
+	else if (xact_info == XLOG_XACT_ABORT_PREPARED)
+	{
+		xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
+		xl_xact_parsed_abort parsed;
+
+		isCommit = true;
+		ParseAbortRecord(XLogRecGetInfo(record),
+						 xlrec,
+						 &parsed);
+		recordXid = parsed.twophase_xid;
+	}
+	else
+		return false;
+
+	if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
+	{
+		/*
+		 * There can be only one transaction end record with this exact
+		 * transactionid
+		 *
+		 * when testing for an xid, we MUST test for equality only, since
+		 * transactions are numbered in the order they start, not the order
+		 * they complete. A higher numbered xid will complete before you about
+		 * 50% of the time...
+		 */
+		stopsHere = (recordXid == recoveryTargetXid);
+	}
+
+	if (recoveryTarget == RECOVERY_TARGET_TIME &&
+		getRecordTimestamp(record, &recordXtime))
+	{
+		/*
+		 * There can be many transactions that share the same commit time, so
+		 * we stop after the last one, if we are inclusive, or stop at the
+		 * first one if we are exclusive
+		 */
+		if (recoveryTargetInclusive)
+			stopsHere = (recordXtime > recoveryTargetTime);
+		else
+			stopsHere = (recordXtime >= recoveryTargetTime);
+	}
+
+	if (stopsHere)
+	{
+		recoveryStopAfter = false;
+		recoveryStopXid = recordXid;
+		recoveryStopTime = recordXtime;
+		recoveryStopLSN = InvalidXLogRecPtr;
+		recoveryStopName[0] = '\0';
+
+		if (isCommit)
+		{
+			ereport(LOG,
+					(errmsg("recovery stopping before commit of transaction %u, time %s",
+							recoveryStopXid,
+							timestamptz_to_str(recoveryStopTime))));
+		}
+		else
+		{
+			ereport(LOG,
+					(errmsg("recovery stopping before abort of transaction %u, time %s",
+							recoveryStopXid,
+							timestamptz_to_str(recoveryStopTime))));
+		}
+	}
+
+	return stopsHere;
+}
+
+/*
+ * Same as recoveryStopsBefore, but called after applying the record.
+ *
+ * We also track the timestamp of the latest applied COMMIT/ABORT
+ * record in XLogRecCtl->recoveryLastXTime.
+ */
+static bool
+recoveryStopsAfter(XLogReaderState *record)
+{
+	uint8		info;
+	uint8		xact_info;
+	uint8		rmid;
+	TimestampTz recordXtime;
+
+	/*
+	 * Ignore recovery target settings when not in archive recovery (meaning
+	 * we are in crash recovery).
+	 */
+	if (!ArchiveRecoveryRequested)
+		return false;
+
+	info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+	rmid = XLogRecGetRmid(record);
+
+	/*
+	 * There can be many restore points that share the same name; we stop at
+	 * the first one.
+	 */
+	if (recoveryTarget == RECOVERY_TARGET_NAME &&
+		rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
+	{
+		xl_restore_point *recordRestorePointData;
+
+		recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
+
+		if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
+		{
+			recoveryStopAfter = true;
+			recoveryStopXid = InvalidTransactionId;
+			recoveryStopLSN = InvalidXLogRecPtr;
+			(void) getRecordTimestamp(record, &recoveryStopTime);
+			strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
+
+			ereport(LOG,
+					(errmsg("recovery stopping at restore point \"%s\", time %s",
+							recoveryStopName,
+							timestamptz_to_str(recoveryStopTime))));
+			return true;
+		}
+	}
+
+	/* Check if the target LSN has been reached */
+	if (recoveryTarget == RECOVERY_TARGET_LSN &&
+		recoveryTargetInclusive &&
+		record->ReadRecPtr >= recoveryTargetLSN)
+	{
+		recoveryStopAfter = true;
+		recoveryStopXid = InvalidTransactionId;
+		recoveryStopLSN = record->ReadRecPtr;
+		recoveryStopTime = 0;
+		recoveryStopName[0] = '\0';
+		ereport(LOG,
+				(errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
+						LSN_FORMAT_ARGS(recoveryStopLSN))));
+		return true;
+	}
+
+	if (rmid != RM_XACT_ID)
+		return false;
+
+	xact_info = info & XLOG_XACT_OPMASK;
+
+	if (xact_info == XLOG_XACT_COMMIT ||
+		xact_info == XLOG_XACT_COMMIT_PREPARED ||
+		xact_info == XLOG_XACT_ABORT ||
+		xact_info == XLOG_XACT_ABORT_PREPARED)
+	{
+		TransactionId recordXid;
+
+		/* Update the last applied transaction timestamp */
+		if (getRecordTimestamp(record, &recordXtime))
+			SetLatestXTime(recordXtime);
+
+		/* Extract the XID of the committed/aborted transaction */
+		if (xact_info == XLOG_XACT_COMMIT_PREPARED)
+		{
+			xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
+			xl_xact_parsed_commit parsed;
+
+			ParseCommitRecord(XLogRecGetInfo(record),
+							  xlrec,
+							  &parsed);
+			recordXid = parsed.twophase_xid;
+		}
+		else if (xact_info == XLOG_XACT_ABORT_PREPARED)
+		{
+			xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
+			xl_xact_parsed_abort parsed;
+
+			ParseAbortRecord(XLogRecGetInfo(record),
+							 xlrec,
+							 &parsed);
+			recordXid = parsed.twophase_xid;
+		}
+		else
+			recordXid = XLogRecGetXid(record);
+
+		/*
+		 * There can be only one transaction end record with this exact
+		 * transactionid
+		 *
+		 * when testing for an xid, we MUST test for equality only, since
+		 * transactions are numbered in the order they start, not the order
+		 * they complete. A higher numbered xid will complete before you about
+		 * 50% of the time...
+		 */
+		if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
+			recordXid == recoveryTargetXid)
+		{
+			recoveryStopAfter = true;
+			recoveryStopXid = recordXid;
+			recoveryStopTime = recordXtime;
+			recoveryStopLSN = InvalidXLogRecPtr;
+			recoveryStopName[0] = '\0';
+
+			if (xact_info == XLOG_XACT_COMMIT ||
+				xact_info == XLOG_XACT_COMMIT_PREPARED)
+			{
+				ereport(LOG,
+						(errmsg("recovery stopping after commit of transaction %u, time %s",
+								recoveryStopXid,
+								timestamptz_to_str(recoveryStopTime))));
+			}
+			else if (xact_info == XLOG_XACT_ABORT ||
+					 xact_info == XLOG_XACT_ABORT_PREPARED)
+			{
+				ereport(LOG,
+						(errmsg("recovery stopping after abort of transaction %u, time %s",
+								recoveryStopXid,
+								timestamptz_to_str(recoveryStopTime))));
+			}
+			return true;
+		}
+	}
+
+	/* Check if we should stop as soon as reaching consistency */
+	if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
+	{
+		ereport(LOG,
+				(errmsg("recovery stopping after reaching consistency")));
+
+		recoveryStopAfter = true;
+		recoveryStopXid = InvalidTransactionId;
+		recoveryStopTime = 0;
+		recoveryStopLSN = InvalidXLogRecPtr;
+		recoveryStopName[0] = '\0';
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
+ *
+ * endOfRecovery is true if the recovery target is reached and
+ * the paused state starts at the end of recovery because of
+ * recovery_target_action=pause, and false otherwise.
+ */
+static void
+recoveryPausesHere(bool endOfRecovery)
+{
+	/* Don't pause unless users can connect! */
+	if (!LocalHotStandbyActive)
+		return;
+
+	/* Don't pause after standby promotion has been triggered */
+	if (LocalPromoteIsTriggered)
+		return;
+
+	if (endOfRecovery)
+		ereport(LOG,
+				(errmsg("pausing at the end of recovery"),
+				 errhint("Execute pg_wal_replay_resume() to promote.")));
+	else
+		ereport(LOG,
+				(errmsg("recovery has paused"),
+				 errhint("Execute pg_wal_replay_resume() to continue.")));
+
+	/* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
+	while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
+	{
+		HandleStartupProcInterrupts();
+		if (CheckForStandbyTrigger())
+			return;
+
+		/*
+		 * If recovery pause is requested then set it paused.  While we are in
+		 * the loop, user might resume and pause again so set this every time.
+		 */
+		ConfirmRecoveryPaused();
+
+		/*
+		 * We wait on a condition variable that will wake us as soon as the
+		 * pause ends, but we use a timeout so we can check the above exit
+		 * condition periodically too.
+		 */
+		ConditionVariableTimedSleep(&XLogRecCtl->recoveryNotPausedCV, 1000,
+									WAIT_EVENT_RECOVERY_PAUSE);
+	}
+	ConditionVariableCancelSleep();
+}
+
+/*
+ * When recovery_min_apply_delay is set, we wait long enough to make sure
+ * certain record types are applied at least that interval behind the primary.
+ *
+ * Returns true if we waited.
+ *
+ * Note that the delay is calculated between the WAL record log time and
+ * the current time on standby. We would prefer to keep track of when this
+ * standby received each WAL record, which would allow a more consistent
+ * approach and one not affected by time synchronisation issues, but that
+ * is significantly more effort and complexity for little actual gain in
+ * usability.
+ */
+static bool
+recoveryApplyDelay(XLogReaderState *record)
+{
+	uint8		xact_info;
+	TimestampTz xtime;
+	TimestampTz delayUntil;
+	long		msecs;
+
+	/* nothing to do if no delay configured */
+	if (recovery_min_apply_delay <= 0)
+		return false;
+
+	/* no delay is applied on a database not yet consistent */
+	if (!reachedConsistency)
+		return false;
+
+	/* nothing to do if crash recovery is requested */
+	if (!ArchiveRecoveryRequested)
+		return false;
+
+	/*
+	 * Is it a COMMIT record?
+	 *
+	 * We deliberately choose not to delay aborts since they have no effect on
+	 * MVCC. We already allow replay of records that don't have a timestamp,
+	 * so there is already opportunity for issues caused by early conflicts on
+	 * standbys.
+	 */
+	if (XLogRecGetRmid(record) != RM_XACT_ID)
+		return false;
+
+	xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
+
+	if (xact_info != XLOG_XACT_COMMIT &&
+		xact_info != XLOG_XACT_COMMIT_PREPARED)
+		return false;
+
+	if (!getRecordTimestamp(record, &xtime))
+		return false;
+
+	delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
+
+	/*
+	 * Exit without arming the latch if it's already past time to apply this
+	 * record
+	 */
+	msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
+	if (msecs <= 0)
+		return false;
+
+	while (true)
+	{
+		ResetLatch(&XLogRecCtl->recoveryWakeupLatch);
+
+		/* might change the trigger file's location */
+		HandleStartupProcInterrupts();
+
+		if (CheckForStandbyTrigger())
+			break;
+
+		/*
+		 * Wait for difference between GetCurrentTimestamp() and delayUntil
+		 */
+		msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
+												delayUntil);
+
+		if (msecs <= 0)
+			break;
+
+		elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
+
+		(void) WaitLatch(&XLogRecCtl->recoveryWakeupLatch,
+						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+						 msecs,
+						 WAIT_EVENT_RECOVERY_APPLY_DELAY);
+	}
+	return true;
+}
+
+/*
+ * Get the current state of the recovery pause request.
+ */
+RecoveryPauseState
+GetRecoveryPauseState(void)
+{
+	RecoveryPauseState state;
+
+	SpinLockAcquire(&XLogRecCtl->info_lck);
+	state = XLogRecCtl->recoveryPauseState;
+	SpinLockRelease(&XLogRecCtl->info_lck);
+
+	return state;
+}
+
+/*
+ * Set the recovery pause state.
+ *
+ * If recovery pause is requested then sets the recovery pause state to
+ * 'pause requested' if it is not already 'paused'.  Otherwise, sets it
+ * to 'not paused' to resume the recovery.  The recovery pause will be
+ * confirmed by the ConfirmRecoveryPaused.
+ */
+void
+SetRecoveryPause(bool recoveryPause)
+{
+	SpinLockAcquire(&XLogRecCtl->info_lck);
+
+	if (!recoveryPause)
+		XLogRecCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
+	else if (XLogRecCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
+		XLogRecCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
+
+	SpinLockRelease(&XLogRecCtl->info_lck);
+
+	if (!recoveryPause)
+		ConditionVariableBroadcast(&XLogRecCtl->recoveryNotPausedCV);
+}
+
+/*
+ * Confirm the recovery pause by setting the recovery pause state to
+ * RECOVERY_PAUSED.
+ */
+static void
+ConfirmRecoveryPaused(void)
+{
+	/* If recovery pause is requested then set it paused */
+	SpinLockAcquire(&XLogRecCtl->info_lck);
+	if (XLogRecCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED)
+		XLogRecCtl->recoveryPauseState = RECOVERY_PAUSED;
+	SpinLockRelease(&XLogRecCtl->info_lck);
+}
+
+
+/*
+ * Attempt to read the next XLOG record.
+ *
+ * Before first call, the reader needs to be positioned to the first record
+ * by calling XLogBeginRead().
+ *
+ * If no valid record is available, returns NULL, or fails if emode is PANIC.
+ * (emode must be either PANIC, LOG). In standby mode, retries until a valid
+ * record is available.
+ */
+static XLogRecord *
+ReadRecord(XLogReaderState *xlogreader, int emode,
+		   bool fetching_ckpt)
+{
+	XLogRecord *record;
+	XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
+
+	/* Pass through parameters to XLogPageRead */
+	private->fetching_ckpt = fetching_ckpt;
+	private->emode = emode;
+	private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
+
+	/* This is the first attempt to read this page. */
+	lastSourceFailed = false;
+
+	for (;;)
+	{
+		char	   *errormsg;
+		XLogRecPtr	EndRecPtr;
+
+		record = XLogReadRecord(xlogreader, &errormsg);
+		EndRecPtr = xlogreader->EndRecPtr;
+		if (record == NULL)
+		{
+			if (readFile >= 0)
+			{
+				close(readFile);
+				readFile = -1;
+			}
+
+			/*
+			 * We only end up here without a message when XLogPageRead()
+			 * failed - in that case we already logged something. In
+			 * StandbyMode that only happens if we have been triggered, so we
+			 * shouldn't loop anymore in that case.
+			 */
+			if (errormsg)
+				ereport(emode_for_corrupt_record(emode, EndRecPtr),
+						(errmsg_internal("%s", errormsg) /* already translated */ ));
+		}
+
+		/*
+		 * Check page TLI is one of the expected values.
+		 */
+		else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
+		{
+			char		fname[MAXFNAMELEN];
+			XLogSegNo	segno;
+			int32		offset;
+
+			XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
+			offset = XLogSegmentOffset(xlogreader->latestPagePtr,
+									   wal_segment_size);
+			XLogFileName(fname, xlogreader->seg.ws_tli, segno,
+						 wal_segment_size);
+			ereport(emode_for_corrupt_record(emode, EndRecPtr),
+					(errmsg("unexpected timeline ID %u in log segment %s, offset %u",
+							xlogreader->latestPageTLI,
+							fname,
+							offset)));
+			record = NULL;
+		}
+
+		if (record)
+		{
+			/* Great, got a record */
+			return record;
+		}
+		else
+		{
+			/* No valid record available from this source */
+			lastSourceFailed = true;
+
+			/*
+			 * If archive recovery was requested, but we were still doing
+			 * crash recovery, switch to archive recovery and retry using the
+			 * offline archive. We have now replayed all the valid WAL in
+			 * pg_wal, so we are presumably now consistent.
+			 *
+			 * We require that there's at least some valid WAL present in
+			 * pg_wal, however (!fetching_ckpt).  We could recover using the
+			 * WAL from the archive, even if pg_wal is completely empty, but
+			 * we'd have no idea how far we'd have to replay to reach
+			 * consistency.  So err on the safe side and give up.
+			 */
+			if (!InArchiveRecovery && ArchiveRecoveryRequested &&
+				!fetching_ckpt)
+			{
+				ereport(DEBUG1,
+						(errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
+				InArchiveRecovery = true;
+				if (StandbyModeRequested)
+					StandbyMode = true;
+
+				SwitchIntoArchiveRecovery(EndRecPtr);
+				minRecoveryPoint = EndRecPtr;
+				minRecoveryPointTLI = ThisTimeLineID;
+
+				CheckRecoveryConsistency();
+
+				/*
+				 * Before we retry, reset lastSourceFailed and currentSource
+				 * so that we will check the archive next.
+				 */
+				lastSourceFailed = false;
+				currentSource = XLOG_FROM_ANY;
+
+				continue;
+			}
+
+			/* In standby mode, loop back to retry. Otherwise, give up. */
+			if (StandbyMode && !CheckForStandbyTrigger())
+				continue;
+			else
+				return NULL;
+		}
+	}
+}
+
+
+
+/*
+ * Read the XLOG page containing RecPtr into readBuf (if not read already).
+ * Returns number of bytes read, if the page is read successfully, or -1
+ * in case of errors.  When errors occur, they are ereport'ed, but only
+ * if they have not been previously reported.
+ *
+ * This is responsible for restoring files from archive as needed, as well
+ * as for waiting for the requested WAL record to arrive in standby mode.
+ *
+ * 'emode' specifies the log level used for reporting "file not found" or
+ * "end of WAL" situations in archive recovery, or in standby mode when a
+ * trigger file is found. If set to WARNING or below, XLogPageRead() returns
+ * false in those situations, on higher log levels the ereport() won't
+ * return.
+ *
+ * In standby mode, if after a successful return of XLogPageRead() the
+ * caller finds the record it's interested in to be broken, it should
+ * ereport the error with the level determined by
+ * emode_for_corrupt_record(), and then set lastSourceFailed
+ * and call XLogPageRead() again with the same arguments. This lets
+ * XLogPageRead() to try fetching the record from another source, or to
+ * sleep and retry.
+ */
+static int
+XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
+			 XLogRecPtr targetRecPtr, char *readBuf)
+{
+	XLogPageReadPrivate *private =
+	(XLogPageReadPrivate *) xlogreader->private_data;
+	int			emode = private->emode;
+	uint32		targetPageOff;
+	XLogSegNo	targetSegNo PG_USED_FOR_ASSERTS_ONLY;
+	int			r;
+
+	XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
+	targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
+
+	/*
+	 * See if we need to switch to a new segment because the requested record
+	 * is not in the currently open one.
+	 */
+	if (readFile >= 0 &&
+		!XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
+	{
+		/*
+		 * Request a restartpoint if we've replayed too much xlog since the
+		 * last one.
+		 */
+		if (bgwriterLaunched)
+		{
+			if (XLogCheckpointNeeded(readSegNo))
+			{
+				(void) GetRedoRecPtr();
+				if (XLogCheckpointNeeded(readSegNo))
+					RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
+			}
+		}
+
+		close(readFile);
+		readFile = -1;
+		readSource = XLOG_FROM_ANY;
+	}
+
+	XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
+
+retry:
+	/* See if we need to retrieve more data */
+	if (readFile < 0 ||
+		(readSource == XLOG_FROM_STREAM &&
+		 flushedUpto < targetPagePtr + reqLen))
+	{
+		if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
+										 private->randAccess,
+										 private->fetching_ckpt,
+										 targetRecPtr))
+		{
+			if (readFile >= 0)
+				close(readFile);
+			readFile = -1;
+			readLen = 0;
+			readSource = XLOG_FROM_ANY;
+
+			return -1;
+		}
+	}
+
+	/*
+	 * At this point, we have the right segment open and if we're streaming we
+	 * know the requested record is in it.
+	 */
+	Assert(readFile != -1);
+
+	/*
+	 * If the current segment is being streamed from the primary, calculate
+	 * how much of the current page we have received already. We know the
+	 * requested record has been received, but this is for the benefit of
+	 * future calls, to allow quick exit at the top of this function.
+	 */
+	if (readSource == XLOG_FROM_STREAM)
+	{
+		if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
+			readLen = XLOG_BLCKSZ;
+		else
+			readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
+				targetPageOff;
+	}
+	else
+		readLen = XLOG_BLCKSZ;
+
+	/* Read the requested page */
+	readOff = targetPageOff;
+
+	pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
+	r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
+	if (r != XLOG_BLCKSZ)
+	{
+		char		fname[MAXFNAMELEN];
+		int			save_errno = errno;
+
+		pgstat_report_wait_end();
+		XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
+		if (r < 0)
+		{
+			errno = save_errno;
+			ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
+					(errcode_for_file_access(),
+					 errmsg("could not read from log segment %s, offset %u: %m",
+							fname, readOff)));
+		}
+		else
+			ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
+					(errcode(ERRCODE_DATA_CORRUPTED),
+					 errmsg("could not read from log segment %s, offset %u: read %d of %zu",
+							fname, readOff, r, (Size) XLOG_BLCKSZ)));
+		goto next_record_is_invalid;
+	}
+	pgstat_report_wait_end();
+
+	Assert(targetSegNo == readSegNo);
+	Assert(targetPageOff == readOff);
+	Assert(reqLen <= readLen);
+
+	xlogreader->seg.ws_tli = curFileTLI;
+
+	/*
+	 * Check the page header immediately, so that we can retry immediately if
+	 * it's not valid. This may seem unnecessary, because XLogReadRecord()
+	 * validates the page header anyway, and would propagate the failure up to
+	 * ReadRecord(), which would retry. However, there's a corner case with
+	 * continuation records, if a record is split across two pages such that
+	 * we would need to read the two pages from different sources. For
+	 * example, imagine a scenario where a streaming replica is started up,
+	 * and replay reaches a record that's split across two WAL segments. The
+	 * first page is only available locally, in pg_wal, because it's already
+	 * been recycled on the primary. The second page, however, is not present
+	 * in pg_wal, and we should stream it from the primary. There is a
+	 * recycled WAL segment present in pg_wal, with garbage contents, however.
+	 * We would read the first page from the local WAL segment, but when
+	 * reading the second page, we would read the bogus, recycled, WAL
+	 * segment. If we didn't catch that case here, we would never recover,
+	 * because ReadRecord() would retry reading the whole record from the
+	 * beginning.
+	 *
+	 * Of course, this only catches errors in the page header, which is what
+	 * happens in the case of a recycled WAL segment. Other kinds of errors or
+	 * corruption still has the same problem. But this at least fixes the
+	 * common case, which can happen as part of normal operation.
+	 *
+	 * Validating the page header is cheap enough that doing it twice
+	 * shouldn't be a big deal from a performance point of view.
+	 */
+	if (!XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
+	{
+		/* reset any error XLogReaderValidatePageHeader() might have set */
+		xlogreader->errormsg_buf[0] = '\0';
+		goto next_record_is_invalid;
+	}
+
+	return readLen;
+
+next_record_is_invalid:
+	lastSourceFailed = true;
+
+	if (readFile >= 0)
+		close(readFile);
+	readFile = -1;
+	readLen = 0;
+	readSource = XLOG_FROM_ANY;
+
+	/* In standby-mode, keep trying */
+	if (StandbyMode)
+		goto retry;
+	else
+		return -1;
+}
+
+/*
+ * Open the WAL segment containing WAL location 'RecPtr'.
+ *
+ * The segment can be fetched via restore_command, or via walreceiver having
+ * streamed the record, or it can already be present in pg_wal. Checking
+ * pg_wal is mainly for crash recovery, but it will be polled in standby mode
+ * too, in case someone copies a new segment directly to pg_wal. That is not
+ * documented or recommended, though.
+ *
+ * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
+ * prepare to read WAL starting from RedoStartLSN after this.
+ *
+ * 'RecPtr' might not point to the beginning of the record we're interested
+ * in, it might also point to the page or segment header. In that case,
+ * 'tliRecPtr' is the position of the WAL record we're interested in. It is
+ * used to decide which timeline to stream the requested WAL from.
+ *
+ * If the record is not immediately available, the function returns false
+ * if we're not in standby mode. In standby mode, waits for it to become
+ * available.
+ *
+ * When the requested record becomes available, the function opens the file
+ * containing it (if not open already), and returns true. When end of standby
+ * mode is triggered by the user, and there is no more WAL available, returns
+ * false.
+ */
+static bool
+WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
+							bool fetching_ckpt, XLogRecPtr tliRecPtr)
+{
+	static TimestampTz last_fail_time = 0;
+	TimestampTz now;
+	bool		streaming_reply_sent = false;
+
+	/*-------
+	 * Standby mode is implemented by a state machine:
+	 *
+	 * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
+	 *	  pg_wal (XLOG_FROM_PG_WAL)
+	 * 2. Check trigger file
+	 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
+	 * 4. Rescan timelines
+	 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
+	 *
+	 * Failure to read from the current source advances the state machine to
+	 * the next state.
+	 *
+	 * 'currentSource' indicates the current state. There are no currentSource
+	 * values for "check trigger", "rescan timelines", and "sleep" states,
+	 * those actions are taken when reading from the previous source fails, as
+	 * part of advancing to the next state.
+	 *
+	 * If standby mode is turned off while reading WAL from stream, we move
+	 * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
+	 * the files (which would be required at end of recovery, e.g., timeline
+	 * history file) from archive or pg_wal. We don't need to kill WAL receiver
+	 * here because it's already stopped when standby mode is turned off at
+	 * the end of recovery.
+	 *-------
+	 */
+	if (!InArchiveRecovery)
+		currentSource = XLOG_FROM_PG_WAL;
+	else if (currentSource == XLOG_FROM_ANY ||
+			 (!StandbyMode && currentSource == XLOG_FROM_STREAM))
+	{
+		lastSourceFailed = false;
+		currentSource = XLOG_FROM_ARCHIVE;
+	}
+
+	for (;;)
+	{
+		XLogSource	oldSource = currentSource;
+		bool		startWalReceiver = false;
+
+		/*
+		 * First check if we failed to read from the current source, and
+		 * advance the state machine if so. The failure to read might've
+		 * happened outside this function, e.g when a CRC check fails on a
+		 * record, or within this loop.
+		 */
+		if (lastSourceFailed)
+		{
+			switch (currentSource)
+			{
+				case XLOG_FROM_ARCHIVE:
+				case XLOG_FROM_PG_WAL:
+
+					/*
+					 * Check to see if the trigger file exists. Note that we
+					 * do this only after failure, so when you create the
+					 * trigger file, we still finish replaying as much as we
+					 * can from archive and pg_wal before failover.
+					 */
+					if (StandbyMode && CheckForStandbyTrigger())
+					{
+						ShutdownWalRcv();
+						return false;
+					}
+
+					/*
+					 * Not in standby mode, and we've now tried the archive
+					 * and pg_wal.
+					 */
+					if (!StandbyMode)
+						return false;
+
+					/*
+					 * Move to XLOG_FROM_STREAM state, and set to start a
+					 * walreceiver if necessary.
+					 */
+					currentSource = XLOG_FROM_STREAM;
+					startWalReceiver = true;
+					break;
+
+				case XLOG_FROM_STREAM:
+
+					/*
+					 * Failure while streaming. Most likely, we got here
+					 * because streaming replication was terminated, or
+					 * promotion was triggered. But we also get here if we
+					 * find an invalid record in the WAL streamed from the
+					 * primary, in which case something is seriously wrong.
+					 * There's little chance that the problem will just go
+					 * away, but PANIC is not good for availability either,
+					 * especially in hot standby mode. So, we treat that the
+					 * same as disconnection, and retry from archive/pg_wal
+					 * again. The WAL in the archive should be identical to
+					 * what was streamed, so it's unlikely that it helps, but
+					 * one can hope...
+					 */
+
+					/*
+					 * We should be able to move to XLOG_FROM_STREAM only in
+					 * standby mode.
+					 */
+					Assert(StandbyMode);
+
+					/*
+					 * Before we leave XLOG_FROM_STREAM state, make sure that
+					 * walreceiver is not active, so that it won't overwrite
+					 * WAL that we restore from archive.
+					 */
+					if (WalRcvStreaming())
+						ShutdownWalRcv();
+
+					/*
+					 * Before we sleep, re-scan for possible new timelines if
+					 * we were requested to recover to the latest timeline.
+					 */
+					if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
+					{
+						if (rescanLatestTimeLine())
+						{
+							currentSource = XLOG_FROM_ARCHIVE;
+							break;
+						}
+					}
+
+					/*
+					 * XLOG_FROM_STREAM is the last state in our state
+					 * machine, so we've exhausted all the options for
+					 * obtaining the requested WAL. We're going to loop back
+					 * and retry from the archive, but if it hasn't been long
+					 * since last attempt, sleep wal_retrieve_retry_interval
+					 * milliseconds to avoid busy-waiting.
+					 */
+					now = GetCurrentTimestamp();
+					if (!TimestampDifferenceExceeds(last_fail_time, now,
+													wal_retrieve_retry_interval))
+					{
+						long		wait_time;
+
+						wait_time = wal_retrieve_retry_interval -
+							TimestampDifferenceMilliseconds(last_fail_time, now);
+
+						elog(LOG, "waiting for WAL to become available at %X/%X",
+							 LSN_FORMAT_ARGS(RecPtr));
+
+						(void) WaitLatch(&XLogRecCtl->recoveryWakeupLatch,
+										 WL_LATCH_SET | WL_TIMEOUT |
+										 WL_EXIT_ON_PM_DEATH,
+										 wait_time,
+										 WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
+						ResetLatch(&XLogRecCtl->recoveryWakeupLatch);
+						now = GetCurrentTimestamp();
+
+						/* Handle interrupt signals of startup process */
+						HandleStartupProcInterrupts();
+					}
+					last_fail_time = now;
+					currentSource = XLOG_FROM_ARCHIVE;
+					break;
+
+				default:
+					elog(ERROR, "unexpected WAL source %d", currentSource);
+			}
+		}
+		else if (currentSource == XLOG_FROM_PG_WAL)
+		{
+			/*
+			 * We just successfully read a file in pg_wal. We prefer files in
+			 * the archive over ones in pg_wal, so try the next file again
+			 * from the archive first.
+			 */
+			if (InArchiveRecovery)
+				currentSource = XLOG_FROM_ARCHIVE;
+		}
+
+		if (currentSource != oldSource)
+			elog(DEBUG2, "switched WAL source from %s to %s after %s",
+				 xlogSourceNames[oldSource], xlogSourceNames[currentSource],
+				 lastSourceFailed ? "failure" : "success");
+
+		/*
+		 * We've now handled possible failure. Try to read from the chosen
+		 * source.
+		 */
+		lastSourceFailed = false;
+
+		switch (currentSource)
+		{
+			case XLOG_FROM_ARCHIVE:
+			case XLOG_FROM_PG_WAL:
+
+				/*
+				 * WAL receiver must not be running when reading WAL from
+				 * archive or pg_wal.
+				 */
+				Assert(!WalRcvStreaming());
+
+				/* Close any old file we might have open. */
+				if (readFile >= 0)
+				{
+					close(readFile);
+					readFile = -1;
+				}
+				/* Reset curFileTLI if random fetch. */
+				if (randAccess)
+					curFileTLI = 0;
+
+				/*
+				 * Try to restore the file from archive, or read an existing
+				 * file from pg_wal.
+				 */
+				readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
+											  currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
+											  currentSource);
+				if (readFile >= 0)
+					return true;	/* success! */
+
+				/*
+				 * Nope, not found in archive or pg_wal.
+				 */
+				lastSourceFailed = true;
+				break;
+
+			case XLOG_FROM_STREAM:
+				{
+					bool		havedata;
+
+					/*
+					 * We should be able to move to XLOG_FROM_STREAM only in
+					 * standby mode.
+					 */
+					Assert(StandbyMode);
+
+					/*
+					 * First, shutdown walreceiver if its restart has been
+					 * requested -- but no point if we're already slated for
+					 * starting it.
+					 */
+					if (pendingWalRcvRestart && !startWalReceiver)
+					{
+						ShutdownWalRcv();
+
+						/*
+						 * Re-scan for possible new timelines if we were
+						 * requested to recover to the latest timeline.
+						 */
+						if (recoveryTargetTimeLineGoal ==
+							RECOVERY_TARGET_TIMELINE_LATEST)
+							rescanLatestTimeLine();
+
+						startWalReceiver = true;
+					}
+					pendingWalRcvRestart = false;
+
+					/*
+					 * Launch walreceiver if needed.
+					 *
+					 * If fetching_ckpt is true, RecPtr points to the initial
+					 * checkpoint location. In that case, we use RedoStartLSN
+					 * as the streaming start position instead of RecPtr, so
+					 * that when we later jump backwards to start redo at
+					 * RedoStartLSN, we will have the logs streamed already.
+					 */
+					if (startWalReceiver &&
+						PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
+					{
+						XLogRecPtr	ptr;
+						TimeLineID	tli;
+
+						if (fetching_ckpt)
+						{
+							ptr = RedoStartLSN;
+							tli = RedoStartTLI;
+						}
+						else
+						{
+							ptr = RecPtr;
+
+							/*
+							 * Use the record begin position to determine the
+							 * TLI, rather than the position we're reading.
+							 */
+							tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
+
+							if (curFileTLI > 0 && tli < curFileTLI)
+								elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
+									 LSN_FORMAT_ARGS(tliRecPtr),
+									 tli, curFileTLI);
+						}
+						curFileTLI = tli;
+						RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
+											 PrimarySlotName,
+											 wal_receiver_create_temp_slot);
+						flushedUpto = 0;
+					}
+
+					/*
+					 * Check if WAL receiver is active or wait to start up.
+					 */
+					if (!WalRcvStreaming())
+					{
+						lastSourceFailed = true;
+						break;
+					}
+
+					/*
+					 * Walreceiver is active, so see if new data has arrived.
+					 *
+					 * We only advance XLogReceiptTime when we obtain fresh
+					 * WAL from walreceiver and observe that we had already
+					 * processed everything before the most recent "chunk"
+					 * that it flushed to disk.  In steady state where we are
+					 * keeping up with the incoming data, XLogReceiptTime will
+					 * be updated on each cycle. When we are behind,
+					 * XLogReceiptTime will not advance, so the grace time
+					 * allotted to conflicting queries will decrease.
+					 */
+					if (RecPtr < flushedUpto)
+						havedata = true;
+					else
+					{
+						XLogRecPtr	latestChunkStart;
+
+						flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
+						if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
+						{
+							havedata = true;
+							if (latestChunkStart <= RecPtr)
+							{
+								XLogReceiptTime = GetCurrentTimestamp();
+								SetCurrentChunkStartTime(XLogReceiptTime);
+							}
+						}
+						else
+							havedata = false;
+					}
+					if (havedata)
+					{
+						/*
+						 * Great, streamed far enough.  Open the file if it's
+						 * not open already.  Also read the timeline history
+						 * file if we haven't initialized timeline history
+						 * yet; it should be streamed over and present in
+						 * pg_wal by now.  Use XLOG_FROM_STREAM so that source
+						 * info is set correctly and XLogReceiptTime isn't
+						 * changed.
+						 *
+						 * NB: We must set readTimeLineHistory based on
+						 * recoveryTargetTLI, not receiveTLI. Normally they'll
+						 * be the same, but if recovery_target_timeline is
+						 * 'latest' and archiving is configured, then it's
+						 * possible that we managed to retrieve one or more
+						 * new timeline history files from the archive,
+						 * updating recoveryTargetTLI.
+						 */
+						if (readFile < 0)
+						{
+							if (!expectedTLEs)
+								expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
+							readFile = XLogFileRead(readSegNo, PANIC,
+													receiveTLI,
+													XLOG_FROM_STREAM, false);
+							Assert(readFile >= 0);
+						}
+						else
+						{
+							/* just make sure source info is correct... */
+							readSource = XLOG_FROM_STREAM;
+							XLogReceiptSource = XLOG_FROM_STREAM;
+							return true;
+						}
+						break;
+					}
+
+					/*
+					 * Data not here yet. Check for trigger, then wait for
+					 * walreceiver to wake us up when new WAL arrives.
+					 */
+					if (CheckForStandbyTrigger())
+					{
+						/*
+						 * Note that we don't "return false" immediately here.
+						 * After being triggered, we still want to replay all
+						 * the WAL that was already streamed. It's in pg_wal
+						 * now, so we just treat this as a failure, and the
+						 * state machine will move on to replay the streamed
+						 * WAL from pg_wal, and then recheck the trigger and
+						 * exit replay.
+						 */
+						lastSourceFailed = true;
+						break;
+					}
+
+					/*
+					 * Since we have replayed everything we have received so
+					 * far and are about to start waiting for more WAL, let's
+					 * tell the upstream server our replay location now so
+					 * that pg_stat_replication doesn't show stale
+					 * information.
+					 */
+					if (!streaming_reply_sent)
+					{
+						WalRcvForceReply();
+						streaming_reply_sent = true;
+					}
+
+					/*
+					 * Wait for more WAL to arrive. Time out after 5 seconds
+					 * to react to a trigger file promptly and to check if the
+					 * WAL receiver is still active.
+					 */
+					(void) WaitLatch(&XLogRecCtl->recoveryWakeupLatch,
+									 WL_LATCH_SET | WL_TIMEOUT |
+									 WL_EXIT_ON_PM_DEATH,
+									 5000L, WAIT_EVENT_RECOVERY_WAL_STREAM);
+					ResetLatch(&XLogRecCtl->recoveryWakeupLatch);
+					break;
+				}
+
+			default:
+				elog(ERROR, "unexpected WAL source %d", currentSource);
+		}
+
+		/*
+		 * Check for recovery pause here so that we can confirm more quickly
+		 * that a requested pause has actually taken effect.
+		 */
+		if (((volatile XLogRecoveryCtlData *) XLogRecCtl)->recoveryPauseState !=
+			RECOVERY_NOT_PAUSED)
+			recoveryPausesHere(false);
+
+		/*
+		 * This possibly-long loop needs to handle interrupts of startup
+		 * process.
+		 */
+		HandleStartupProcInterrupts();
+	}
+
+	return false;				/* not reached */
+}
+
+
+/*
+ * Determine what log level should be used to report a corrupt WAL record
+ * in the current WAL page, previously read by XLogPageRead().
+ *
+ * 'emode' is the error mode that would be used to report a file-not-found
+ * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
+ * we're retrying the exact same record that we've tried previously, only
+ * complain the first time to keep the noise down.  However, we only do when
+ * reading from pg_wal, because we don't expect any invalid records in archive
+ * or in records streamed from the primary. Files in the archive should be complete,
+ * and we should never hit the end of WAL because we stop and wait for more WAL
+ * to arrive before replaying it.
+ *
+ * NOTE: This function remembers the RecPtr value it was last called with,
+ * to suppress repeated messages about the same record. Only call this when
+ * you are about to ereport(), or you might cause a later message to be
+ * erroneously suppressed.
+ */
+static int
+emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
+{
+	static XLogRecPtr lastComplaint = 0;
+
+	if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
+	{
+		if (RecPtr == lastComplaint)
+			emode = DEBUG1;
+		else
+			lastComplaint = RecPtr;
+	}
+	return emode;
+}
+
+
+/*
+ * Subroutine to try to fetch and validate a prior checkpoint record.
+ *
+ * whichChkpt identifies the checkpoint (merely for reporting purposes).
+ * 1 for "primary", 0 for "other" (backup_label)
+ */
+XLogRecord *
+ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt, bool report)
+{
+	XLogRecord *record;
+	uint8		info;
+
+	Assert(xlogreader != NULL);
+
+	if (!XRecOffIsValid(RecPtr))
+	{
+		if (!report)
+			return NULL;
+
+		switch (whichChkpt)
+		{
+			case 1:
+				ereport(LOG,
+						(errmsg("invalid primary checkpoint link in control file")));
+				break;
+			default:
+				ereport(LOG,
+						(errmsg("invalid checkpoint link in backup_label file")));
+				break;
+		}
+		return NULL;
+	}
+
+	XLogBeginRead(xlogreader, RecPtr);
+	record = ReadRecord(xlogreader, LOG, true);
+
+	if (record == NULL)
+	{
+		if (!report)
+			return NULL;
+
+		switch (whichChkpt)
+		{
+			case 1:
+				ereport(LOG,
+						(errmsg("invalid primary checkpoint record")));
+				break;
+			default:
+				ereport(LOG,
+						(errmsg("invalid checkpoint record")));
+				break;
+		}
+		return NULL;
+	}
+	if (record->xl_rmid != RM_XLOG_ID)
+	{
+		switch (whichChkpt)
+		{
+			case 1:
+				ereport(LOG,
+						(errmsg("invalid resource manager ID in primary checkpoint record")));
+				break;
+			default:
+				ereport(LOG,
+						(errmsg("invalid resource manager ID in checkpoint record")));
+				break;
+		}
+		return NULL;
+	}
+	info = record->xl_info & ~XLR_INFO_MASK;
+	if (info != XLOG_CHECKPOINT_SHUTDOWN &&
+		info != XLOG_CHECKPOINT_ONLINE)
+	{
+		switch (whichChkpt)
+		{
+			case 1:
+				ereport(LOG,
+						(errmsg("invalid xl_info in primary checkpoint record")));
+				break;
+			default:
+				ereport(LOG,
+						(errmsg("invalid xl_info in checkpoint record")));
+				break;
+		}
+		return NULL;
+	}
+	if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
+	{
+		switch (whichChkpt)
+		{
+			case 1:
+				ereport(LOG,
+						(errmsg("invalid length of primary checkpoint record")));
+				break;
+			default:
+				ereport(LOG,
+						(errmsg("invalid length of checkpoint record")));
+				break;
+		}
+		return NULL;
+	}
+	return record;
+}
+
+void
+HandleBackupEndRecord(XLogRecPtr startpoint, XLogRecPtr endLsn, TimeLineID endTLI)
+{
+	if (backupStartPoint == startpoint)
+	{
+		/*
+		 * We have reached the end of base backup, the point where
+		 * pg_stop_backup() was done. The data on disk is now consistent.
+		 * Reset backupStartPoint, and update minRecoveryPoint to make
+		 * sure we don't allow starting up at an earlier point even if
+		 * recovery is stopped and restarted soon after this.
+		 */
+		elog(DEBUG1, "end of backup record reached");
+
+		backupEndPoint = endLsn;
+	}
+	else
+		elog(DEBUG1, "saw end-of-backup record for backup starting at %X/%X, waiting for %X/%X",
+			 LSN_FORMAT_ARGS(startpoint), LSN_FORMAT_ARGS(backupStartPoint));
+}
+
+/*
+ * Checks if recovery has reached a consistent state. When consistency is
+ * reached and we have a valid starting standby snapshot, tell postmaster
+ * that it can start accepting read-only connections.
+ */
+static void
+CheckRecoveryConsistency(void)
+{
+	XLogRecPtr	lastReplayedEndRecPtr;
+	TimeLineID  lastReplayedTLI;
+
+	/*
+	 * During crash recovery, we don't reach a consistent state until we've
+	 * replayed all the WAL.
+	 */
+	if (XLogRecPtrIsInvalid(minRecoveryPoint))
+		return;
+
+	Assert(InArchiveRecovery);
+
+	/*
+	 * assume that we are called in the startup process, and hence don't need
+	 * a lock to read lastReplayedEndRecPtr
+	 */
+	lastReplayedEndRecPtr = XLogRecCtl->lastReplayedEndRecPtr;
+	lastReplayedTLI = XLogRecCtl->lastReplayedTLI;
+
+	/*
+	 * Have we reached the point where our base backup was completed?
+	 */
+	if (!XLogRecPtrIsInvalid(backupEndPoint) &&
+		backupEndPoint <= lastReplayedEndRecPtr)
+	{
+		elog(DEBUG1, "end of backup LSN reached");
+
+		/*
+		 * We have reached the end of base backup, as indicated by pg_control.
+		 * Update pg_control accordingly.
+		 */
+		ReachedEndOfBackup(lastReplayedEndRecPtr, lastReplayedTLI);
+		backupEndRequired = false;
+	}
+
+	/*
+	 * Have we passed our safe starting point? Note that minRecoveryPoint is
+	 * known to be incorrectly set if ControlFile->backupEndRequired, until
+	 * the XLOG_BACKUP_END arrives to advise us of the correct
+	 * minRecoveryPoint. All we know prior to that is that we're not
+	 * consistent yet.
+	 */
+	if (!reachedConsistency && !backupEndRequired &&
+		minRecoveryPoint <= lastReplayedEndRecPtr)
+	{
+		/*
+		 * Check to see if the XLOG sequence contained any unresolved
+		 * references to uninitialized pages.
+		 */
+		XLogCheckInvalidPages();
+
+		reachedConsistency = true;
+		ereport(LOG,
+				(errmsg("consistent recovery state reached at %X/%X",
+						LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
+	}
+
+	/*
+	 * Have we got a valid starting snapshot that will allow queries to be
+	 * run? If so, we can tell postmaster that the database is consistent now,
+	 * enabling connections.
+	 */
+	if (standbyState == STANDBY_SNAPSHOT_READY &&
+		!LocalHotStandbyActive &&
+		reachedConsistency &&
+		IsUnderPostmaster)
+	{
+		SpinLockAcquire(&XLogRecCtl->info_lck);
+		XLogRecCtl->SharedHotStandbyActive = true;
+		SpinLockRelease(&XLogRecCtl->info_lck);
+
+		LocalHotStandbyActive = true;
+
+		SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
+	}
+}
+
+/*
+ * Save timestamp of the next chunk of WAL records to apply.
+ *
+ * We keep this in XLogRecCtl, not a simple static variable, so that it can be
+ * seen by all backends.
+ */
+static void
+SetCurrentChunkStartTime(TimestampTz xtime)
+{
+	SpinLockAcquire(&XLogRecCtl->info_lck);
+	XLogRecCtl->currentChunkStartTime = xtime;
+	SpinLockRelease(&XLogRecCtl->info_lck);
+}
+
+/*
+ * Save timestamp of latest processed commit/abort record.
+ *
+ * We keep this in XLogRecCtl, not a simple static variable, so that it can be
+ * seen by processes other than the startup process.  Note in particular
+ * that CreateRestartPoint is executed in the checkpointer.
+ */
+static void
+SetLatestXTime(TimestampTz xtime)
+{
+	SpinLockAcquire(&XLogRecCtl->info_lck);
+	XLogRecCtl->recoveryLastXTime = xtime;
+	SpinLockRelease(&XLogRecCtl->info_lck);
+}
+
+/*
+ * Fetch timestamp of latest processed commit/abort record.
+ */
+TimestampTz
+GetLatestXTime(void)
+{
+	TimestampTz xtime;
+
+	SpinLockAcquire(&XLogRecCtl->info_lck);
+	xtime = XLogRecCtl->recoveryLastXTime;
+	SpinLockRelease(&XLogRecCtl->info_lck);
+
+	return xtime;
+}
+
+/*
+ * Scan for new timelines that might have appeared in the archive since we
+ * started recovery.
+ *
+ * If there are any, the function changes recovery target TLI to the latest
+ * one and returns 'true'.
+ */
+static bool
+rescanLatestTimeLine(void)
+{
+	List	   *newExpectedTLEs;
+	bool		found;
+	ListCell   *cell;
+	TimeLineID	newtarget;
+	TimeLineID	oldtarget = recoveryTargetTLI;
+	TimeLineHistoryEntry *currentTle = NULL;
+
+	newtarget = findNewestTimeLine(recoveryTargetTLI);
+	if (newtarget == recoveryTargetTLI)
+	{
+		/* No new timelines found */
+		return false;
+	}
+
+	/*
+	 * Determine the list of expected TLIs for the new TLI
+	 */
+
+	newExpectedTLEs = readTimeLineHistory(newtarget);
+
+	/*
+	 * If the current timeline is not part of the history of the new timeline,
+	 * we cannot proceed to it.
+	 */
+	found = false;
+	foreach(cell, newExpectedTLEs)
+	{
+		currentTle = (TimeLineHistoryEntry *) lfirst(cell);
+
+		if (currentTle->tli == recoveryTargetTLI)
+		{
+			found = true;
+			break;
+		}
+	}
+	if (!found)
+	{
+		ereport(LOG,
+				(errmsg("new timeline %u is not a child of database system timeline %u",
+						newtarget,
+						ThisTimeLineID)));
+		return false;
+	}
+
+	/*
+	 * The current timeline was found in the history file, but check that the
+	 * next timeline was forked off from it *after* the current recovery
+	 * location.
+	 */
+	if (currentTle->end < xlogreader->EndRecPtr)
+	{
+		ereport(LOG,
+				(errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
+						newtarget,
+						ThisTimeLineID,
+						LSN_FORMAT_ARGS(xlogreader->EndRecPtr))));
+		return false;
+	}
+
+	/* The new timeline history seems valid. Switch target */
+	recoveryTargetTLI = newtarget;
+	list_free_deep(expectedTLEs);
+	expectedTLEs = newExpectedTLEs;
+
+	/*
+	 * As in StartupXLOG(), try to ensure we have all the history files
+	 * between the old target and new target in pg_wal.
+	 */
+	restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
+
+	ereport(LOG,
+			(errmsg("new target timeline is %u",
+					recoveryTargetTLI)));
+
+	return true;
+}
+
+
+/*
+ * Open a logfile segment for reading (during recovery).
+ *
+ * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
+ * Otherwise, it's assumed to be already available in pg_wal.
+ */
+static int
+XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
+			 XLogSource source, bool notfoundOk)
+{
+	char		xlogfname[MAXFNAMELEN];
+	char		activitymsg[MAXFNAMELEN + 16];
+	char		path[MAXPGPATH];
+	int			fd;
+
+	XLogFileName(xlogfname, tli, segno, wal_segment_size);
+
+	switch (source)
+	{
+		case XLOG_FROM_ARCHIVE:
+			/* Report recovery progress in PS display */
+			snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
+					 xlogfname);
+			set_ps_display(activitymsg);
+
+			if (!RestoreArchivedFile(path, xlogfname,
+									 "RECOVERYXLOG",
+									 wal_segment_size,
+									 InRedo))
+				return -1;
+			break;
+
+		case XLOG_FROM_PG_WAL:
+		case XLOG_FROM_STREAM:
+			XLogFilePath(path, tli, segno, wal_segment_size);
+			break;
+
+		default:
+			elog(ERROR, "invalid XLogFileRead source %d", source);
+	}
+
+	/*
+	 * If the segment was fetched from archival storage, replace the existing
+	 * xlog segment (if any) with the archival version.
+	 */
+	if (source == XLOG_FROM_ARCHIVE)
+	{
+		KeepFileRestoredFromArchive(path, xlogfname);
+
+		/*
+		 * Set path to point at the new file in pg_wal.
+		 */
+		snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
+	}
+
+	fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
+	if (fd >= 0)
+	{
+		/* Success! */
+		curFileTLI = tli;
+
+		/* Report recovery progress in PS display */
+		snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
+				 xlogfname);
+		set_ps_display(activitymsg);
+
+		/* Track source of data in assorted state variables */
+		readSource = source;
+		XLogReceiptSource = source;
+		/* In FROM_STREAM case, caller tracks receipt time, not me */
+		if (source != XLOG_FROM_STREAM)
+			XLogReceiptTime = GetCurrentTimestamp();
+
+		return fd;
+	}
+	if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
+		ereport(PANIC,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", path)));
+	return -1;
+}
+
+/*
+ * Open a logfile segment for reading (during recovery).
+ *
+ * This version searches for the segment with any TLI listed in expectedTLEs.
+ */
+static int
+XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source)
+{
+	char		path[MAXPGPATH];
+	ListCell   *cell;
+	int			fd;
+	List	   *tles;
+
+	/*
+	 * Loop looking for a suitable timeline ID: we might need to read any of
+	 * the timelines listed in expectedTLEs.
+	 *
+	 * We expect curFileTLI on entry to be the TLI of the preceding file in
+	 * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
+	 * to go backwards; this prevents us from picking up the wrong file when a
+	 * parent timeline extends to higher segment numbers than the child we
+	 * want to read.
+	 *
+	 * If we haven't read the timeline history file yet, read it now, so that
+	 * we know which TLIs to scan.  We don't save the list in expectedTLEs,
+	 * however, unless we actually find a valid segment.  That way if there is
+	 * neither a timeline history file nor a WAL segment in the archive, and
+	 * streaming replication is set up, we'll read the timeline history file
+	 * streamed from the primary when we start streaming, instead of
+	 * recovering with a dummy history generated here.
+	 */
+	if (expectedTLEs)
+		tles = expectedTLEs;
+	else
+		tles = readTimeLineHistory(recoveryTargetTLI);
+
+	foreach(cell, tles)
+	{
+		TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
+		TimeLineID	tli = hent->tli;
+
+		if (tli < curFileTLI)
+			break;				/* don't bother looking at too-old TLIs */
+
+		/*
+		 * Skip scanning the timeline ID that the logfile segment to read
+		 * doesn't belong to
+		 */
+		if (hent->begin != InvalidXLogRecPtr)
+		{
+			XLogSegNo	beginseg = 0;
+
+			XLByteToSeg(hent->begin, beginseg, wal_segment_size);
+
+			/*
+			 * The logfile segment that doesn't belong to the timeline is
+			 * older or newer than the segment that the timeline started or
+			 * ended at, respectively. It's sufficient to check only the
+			 * starting segment of the timeline here. Since the timelines are
+			 * scanned in descending order in this loop, any segments newer
+			 * than the ending segment should belong to newer timeline and
+			 * have already been read before. So it's not necessary to check
+			 * the ending segment of the timeline here.
+			 */
+			if (segno < beginseg)
+				continue;
+		}
+
+		if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
+		{
+			fd = XLogFileRead(segno, emode, tli,
+							  XLOG_FROM_ARCHIVE, true);
+			if (fd != -1)
+			{
+				elog(DEBUG1, "got WAL segment from archive");
+				if (!expectedTLEs)
+					expectedTLEs = tles;
+				return fd;
+			}
+		}
+
+		if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
+		{
+			fd = XLogFileRead(segno, emode, tli,
+							  XLOG_FROM_PG_WAL, true);
+			if (fd != -1)
+			{
+				if (!expectedTLEs)
+					expectedTLEs = tles;
+				return fd;
+			}
+		}
+	}
+
+	/* Couldn't find it.  For simplicity, complain about front timeline */
+	XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
+	errno = ENOENT;
+	ereport(emode,
+			(errcode_for_file_access(),
+			 errmsg("could not open file \"%s\": %m", path)));
+	return -1;
+}
+
+
+/*
+ * Checks whether the current buffer page and backup page stored in the
+ * WAL record are consistent or not. Before comparing the two pages, a
+ * masking can be applied to the pages to ignore certain areas like hint bits,
+ * unused space between pd_lower and pd_upper among other things. This
+ * function should be called once WAL replay has been completed for a
+ * given record.
+ */
+static void
+checkXLogConsistency(XLogReaderState *record)
+{
+	RmgrId		rmid = XLogRecGetRmid(record);
+	RelFileNode rnode;
+	ForkNumber	forknum;
+	BlockNumber blkno;
+	int			block_id;
+
+	/* Records with no backup blocks have no need for consistency checks. */
+	if (!XLogRecHasAnyBlockRefs(record))
+		return;
+
+	Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
+
+	for (block_id = 0; block_id <= record->max_block_id; block_id++)
+	{
+		Buffer		buf;
+		Page		page;
+
+		if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
+		{
+			/*
+			 * WAL record doesn't contain a block reference with the given id.
+			 * Do nothing.
+			 */
+			continue;
+		}
+
+		Assert(XLogRecHasBlockImage(record, block_id));
+
+		if (XLogRecBlockImageApply(record, block_id))
+		{
+			/*
+			 * WAL record has already applied the page, so bypass the
+			 * consistency check as that would result in comparing the full
+			 * page stored in the record with itself.
+			 */
+			continue;
+		}
+
+		/*
+		 * Read the contents from the current buffer and store it in a
+		 * temporary page.
+		 */
+		buf = XLogReadBufferExtended(rnode, forknum, blkno,
+									 RBM_NORMAL_NO_LOG);
+		if (!BufferIsValid(buf))
+			continue;
+
+		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+		page = BufferGetPage(buf);
+
+		/*
+		 * Take a copy of the local page where WAL has been applied to have a
+		 * comparison base before masking it...
+		 */
+		memcpy(replay_image_masked, page, BLCKSZ);
+
+		/* No need for this page anymore now that a copy is in. */
+		UnlockReleaseBuffer(buf);
+
+		/*
+		 * If the block LSN is already ahead of this WAL record, we can't
+		 * expect contents to match.  This can happen if recovery is
+		 * restarted.
+		 */
+		if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
+			continue;
+
+		/*
+		 * Read the contents from the backup copy, stored in WAL record and
+		 * store it in a temporary page. There is no need to allocate a new
+		 * page here, a local buffer is fine to hold its contents and a mask
+		 * can be directly applied on it.
+		 */
+		if (!RestoreBlockImage(record, block_id, primary_image_masked))
+			elog(ERROR, "failed to restore block image");
+
+		/*
+		 * If masking function is defined, mask both the primary and replay
+		 * images
+		 */
+		if (RmgrTable[rmid].rm_mask != NULL)
+		{
+			RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
+			RmgrTable[rmid].rm_mask(primary_image_masked, blkno);
+		}
+
+		/* Time to compare the primary and replay images. */
+		if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
+		{
+			elog(FATAL,
+				 "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
+				 rnode.spcNode, rnode.dbNode, rnode.relNode,
+				 forknum, blkno);
+		}
+	}
+}
+
+
+/*
+ * Set flag to signal the walreceiver to restart.  (The startup process calls
+ * this on noticing a relevant configuration change.)
+ */
+void
+StartupRequestWalReceiverRestart(void)
+{
+	if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
+	{
+		ereport(LOG,
+				(errmsg("WAL receiver process shutdown requested")));
+
+		pendingWalRcvRestart = true;
+	}
+}
+
+
+/*
+ * Returns time of receipt of current chunk of XLOG data, as well as
+ * whether it was received from streaming replication or from archives.
+ */
+void
+GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
+{
+	/*
+	 * This must be executed in the startup process, since we don't export the
+	 * relevant state to shared memory.
+	 */
+	Assert(InRecovery);
+
+	*rtime = XLogReceiptTime;
+	*fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
+}
+
+
+/*
+ * Has a standby promotion already been triggered?
+ *
+ * Unlike CheckForStandbyTrigger(), this works in any process
+ * that's connected to shared memory.
+ */
+bool
+PromoteIsTriggered(void)
+{
+	/*
+	 * We check shared state each time only until a standby promotion is
+	 * triggered. We can't trigger a promotion again, so there's no need to
+	 * keep checking after the shared variable has once been seen true.
+	 */
+	if (LocalPromoteIsTriggered)
+		return true;
+
+	SpinLockAcquire(&XLogRecCtl->info_lck);
+	LocalPromoteIsTriggered = XLogRecCtl->SharedPromoteIsTriggered;
+	SpinLockRelease(&XLogRecCtl->info_lck);
+
+	return LocalPromoteIsTriggered;
+}
+
+static void
+SetPromoteIsTriggered(void)
+{
+	SpinLockAcquire(&XLogRecCtl->info_lck);
+	XLogRecCtl->SharedPromoteIsTriggered = true;
+	SpinLockRelease(&XLogRecCtl->info_lck);
+
+	/*
+	 * Mark the recovery pause state as 'not paused' because the paused state
+	 * ends and promotion continues if a promotion is triggered while recovery
+	 * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
+	 * return 'paused' while a promotion is ongoing.
+	 */
+	SetRecoveryPause(false);
+
+	LocalPromoteIsTriggered = true;
+}
+
+/*
+ * Check to see whether the user-specified trigger file exists and whether a
+ * promote request has arrived.  If either condition holds, return true.
+ */
+static bool
+CheckForStandbyTrigger(void)
+{
+	struct stat stat_buf;
+
+	if (LocalPromoteIsTriggered)
+		return true;
+
+	if (IsPromoteSignaled() && CheckPromoteSignal())
+	{
+		ereport(LOG, (errmsg("received promote request")));
+		RemovePromoteSignalFiles();
+		ResetPromoteSignaled();
+		SetPromoteIsTriggered();
+		return true;
+	}
+
+	if (PromoteTriggerFile == NULL || strcmp(PromoteTriggerFile, "") == 0)
+		return false;
+
+	if (stat(PromoteTriggerFile, &stat_buf) == 0)
+	{
+		ereport(LOG,
+				(errmsg("promote trigger file found: %s", PromoteTriggerFile)));
+		unlink(PromoteTriggerFile);
+		SetPromoteIsTriggered();
+		return true;
+	}
+	else if (errno != ENOENT)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not stat promote trigger file \"%s\": %m",
+						PromoteTriggerFile)));
+
+	return false;
+}
+
+/*
+ * Remove the files signaling a standby promotion request.
+ */
+void
+RemovePromoteSignalFiles(void)
+{
+	unlink(PROMOTE_SIGNAL_FILE);
+}
+
+/*
+ * Check to see if a promote request has arrived.
+ */
+bool
+CheckPromoteSignal(void)
+{
+	struct stat stat_buf;
+
+	if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
+		return true;
+
+	return false;
+}
+
+/*
+ * Wake up startup process to replay newly arrived WAL, or to notice that
+ * failover has been requested.
+ */
+void
+WakeupRecovery(void)
+{
+	SetLatch(&XLogRecCtl->recoveryWakeupLatch);
+}
+
+/*
+ * Schedule a walreceiver wakeup in the main recovery loop.
+ */
+void
+XLogRequestWalReceiverReply(void)
+{
+	doRequestWalReceiverReply = true;
+}
+
+static char *
+getRecoveryStopReason(void)
+{
+	char		reason[200];
+
+	/*
+	 * Create a comment for the history file to explain why and where
+	 * timeline changed.
+	 */
+	if (recoveryTarget == RECOVERY_TARGET_XID)
+		snprintf(reason, sizeof(reason),
+				 "%s transaction %u",
+				 recoveryStopAfter ? "after" : "before",
+				 recoveryStopXid);
+	else if (recoveryTarget == RECOVERY_TARGET_TIME)
+		snprintf(reason, sizeof(reason),
+				 "%s %s\n",
+				 recoveryStopAfter ? "after" : "before",
+				 timestamptz_to_str(recoveryStopTime));
+	else if (recoveryTarget == RECOVERY_TARGET_LSN)
+		snprintf(reason, sizeof(reason),
+				 "%s LSN %X/%X\n",
+				 recoveryStopAfter ? "after" : "before",
+				 LSN_FORMAT_ARGS(recoveryStopLSN));
+	else if (recoveryTarget == RECOVERY_TARGET_NAME)
+		snprintf(reason, sizeof(reason),
+				 "at restore point \"%s\"",
+				 recoveryStopName);
+	else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
+		snprintf(reason, sizeof(reason), "reached consistency");
+	else
+		snprintf(reason, sizeof(reason), "no recovery target specified");
+
+	return pstrdup(reason);
+}
+
+/*
+ * Is HotStandby active yet? This is only important in special backends
+ * since normal backends won't ever be able to connect until this returns
+ * true. Postmaster knows this by way of signal, not via shared memory.
+ *
+ * Unlike testing standbyState, this works in any process that's connected to
+ * shared memory.  (And note that standbyState alone doesn't tell the truth
+ * anyway.)
+ */
+bool
+HotStandbyActive(void)
+{
+	/*
+	 * We check shared state each time only until Hot Standby is active. We
+	 * can't de-activate Hot Standby, so there's no need to keep checking
+	 * after the shared variable has once been seen true.
+	 */
+	if (LocalHotStandbyActive)
+		return true;
+	else
+	{
+		/* spinlock is essential on machines with weak memory ordering! */
+		SpinLockAcquire(&XLogRecCtl->info_lck);
+		LocalHotStandbyActive = XLogRecCtl->SharedHotStandbyActive;
+		SpinLockRelease(&XLogRecCtl->info_lck);
+
+		return LocalHotStandbyActive;
+	}
+}
+
+/*
+ * Like HotStandbyActive(), but to be used only in WAL replay code,
+ * where we don't need to ask any other process what the state is.
+ */
+static bool
+HotStandbyActiveInReplay(void)
+{
+	Assert(AmStartupProcess() || !IsPostmasterEnvironment);
+	return LocalHotStandbyActive;
+}
+
+
+/*
+ * Get latest redo apply position.
+ *
+ * Exported to allow WALReceiver to read the pointer directly.
+ */
+XLogRecPtr
+GetXLogReplayRecPtr(TimeLineID *replayTLI)
+{
+	XLogRecPtr	recptr;
+	TimeLineID	tli;
+
+	SpinLockAcquire(&XLogRecCtl->info_lck);
+	recptr = XLogRecCtl->lastReplayedEndRecPtr;
+	tli = XLogRecCtl->lastReplayedTLI;
+	SpinLockRelease(&XLogRecCtl->info_lck);
+
+	if (replayTLI)
+		*replayTLI = tli;
+	return recptr;
+}
+
+
+/*
+ * Get position of last applied, or the record being applied.
+ *
+ * This is different from GetLogReplayRecPtr() in that if a WAL
+ * record is currently being applied, this includes that record.
+ */
+XLogRecPtr
+GetCurrentReplayRecPtr(TimeLineID *replayEndTLI)
+{
+	XLogRecPtr	recptr;
+	TimeLineID	tli;
+
+	SpinLockAcquire(&XLogRecCtl->info_lck);
+	recptr = XLogRecCtl->replayEndRecPtr;
+	tli = XLogRecCtl->replayEndTLI;
+	SpinLockRelease(&XLogRecCtl->info_lck);
+
+	if (replayEndTLI)
+		*replayEndTLI = tli;
+	return recptr;
+}
+
+
+/*
+ * Fetch timestamp of latest processed commit/abort record.
+ * Startup process maintains an accurate local copy in XLogReceiptTime
+ */
+TimestampTz
+GetCurrentChunkReplayStartTime(void)
+{
+	TimestampTz xtime;
+
+	SpinLockAcquire(&XLogRecCtl->info_lck);
+	xtime = XLogRecCtl->currentChunkStartTime;
+	SpinLockRelease(&XLogRecCtl->info_lck);
+
+	return xtime;
+}
+
+
+/*
+ * Note that text field supplied is a parameter name and does not require
+ * translation
+ */
+void
+RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
+{
+	if (currValue < minValue)
+	{
+		if (HotStandbyActiveInReplay())
+		{
+			bool		warned_for_promote = false;
+
+			ereport(WARNING,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("hot standby is not possible because of insufficient parameter settings"),
+					 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
+							   param_name,
+							   currValue,
+							   minValue)));
+
+			SetRecoveryPause(true);
+
+			ereport(LOG,
+					(errmsg("recovery has paused"),
+					 errdetail("If recovery is unpaused, the server will shut down."),
+					 errhint("You can then restart the server after making the necessary configuration changes.")));
+
+			while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
+			{
+				HandleStartupProcInterrupts();
+
+				if (CheckForStandbyTrigger())
+				{
+					if (!warned_for_promote)
+						ereport(WARNING,
+								(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+								 errmsg("promotion is not possible because of insufficient parameter settings"),
+
+						/*
+						 * Repeat the detail from above so it's easy to find
+						 * in the log.
+						 */
+								 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
+										   param_name,
+										   currValue,
+										   minValue),
+								 errhint("Restart the server after making the necessary configuration changes.")));
+					warned_for_promote = true;
+				}
+
+				/*
+				 * If recovery pause is requested then set it paused.  While
+				 * we are in the loop, user might resume and pause again so
+				 * set this every time.
+				 */
+				ConfirmRecoveryPaused();
+
+				/*
+				 * We wait on a condition variable that will wake us as soon
+				 * as the pause ends, but we use a timeout so we can check the
+				 * above conditions periodically too.
+				 */
+				ConditionVariableTimedSleep(&XLogRecCtl->recoveryNotPausedCV, 1000,
+											WAIT_EVENT_RECOVERY_PAUSE);
+			}
+			ConditionVariableCancelSleep();
+		}
+
+		ereport(FATAL,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("recovery aborted because of insufficient parameter settings"),
+		/* Repeat the detail from above so it's easy to find in the log. */
+				 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
+						   param_name,
+						   currValue,
+						   minValue),
+				 errhint("You can restart the server after making the necessary configuration changes.")));
+	}
+}
+
+
+/*
+ * See if there are any recovery signal files and if so, set state for
+ * recovery.
+ *
+ * See if there is a recovery command file (recovery.conf), and if so
+ * throw an ERROR since as of PG12 we no longer recognize that.
+ */
+static void
+readRecoverySignalFile(void)
+{
+	struct stat stat_buf;
+
+	if (IsBootstrapProcessingMode())
+		return;
+
+	/*
+	 * Check for old recovery API file: recovery.conf
+	 */
+	if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
+		ereport(FATAL,
+				(errcode_for_file_access(),
+				 errmsg("using recovery command file \"%s\" is not supported",
+						RECOVERY_COMMAND_FILE)));
+
+	/*
+	 * Remove unused .done file, if present. Ignore if absent.
+	 */
+	unlink(RECOVERY_COMMAND_DONE);
+
+	/*
+	 * Check for recovery signal files and if found, fsync them since they
+	 * represent server state information.  We don't sweat too much about the
+	 * possibility of fsync failure, however.
+	 *
+	 * If present, standby signal file takes precedence. If neither is present
+	 * then we won't enter archive recovery.
+	 */
+	if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
+	{
+		int			fd;
+
+		fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY /*| get_sync_bit(sync_method) */,
+							   S_IRUSR | S_IWUSR);
+		if (fd >= 0)
+		{
+			(void) pg_fsync(fd);
+			close(fd);
+		}
+		standby_signal_file_found = true;
+	}
+	else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
+	{
+		int			fd;
+
+		fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY /*| get_sync_bit(sync_method) */,
+							   S_IRUSR | S_IWUSR);
+		if (fd >= 0)
+		{
+			(void) pg_fsync(fd);
+			close(fd);
+		}
+		recovery_signal_file_found = true;
+	}
+
+	StandbyModeRequested = false;
+	ArchiveRecoveryRequested = false;
+	if (standby_signal_file_found)
+	{
+		StandbyModeRequested = true;
+		ArchiveRecoveryRequested = true;
+	}
+	else if (recovery_signal_file_found)
+	{
+		StandbyModeRequested = false;
+		ArchiveRecoveryRequested = true;
+	}
+	else
+		return;
+
+	/*
+	 * We don't support standby mode in standalone backends; that requires
+	 * other processes such as the WAL receiver to be alive.
+	 */
+	if (StandbyModeRequested && !IsUnderPostmaster)
+		ereport(FATAL,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("standby mode is not supported by single-user servers")));
+}
+
+static void
+validateRecoveryParameters(void)
+{
+	if (!ArchiveRecoveryRequested)
+		return;
+
+	/*
+	 * Check for compulsory parameters
+	 */
+	if (StandbyModeRequested)
+	{
+		if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
+			(recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
+			ereport(WARNING,
+					(errmsg("specified neither primary_conninfo nor restore_command"),
+					 errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
+	}
+	else
+	{
+		if (recoveryRestoreCommand == NULL ||
+			strcmp(recoveryRestoreCommand, "") == 0)
+			ereport(FATAL,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("must specify restore_command when standby mode is not enabled")));
+	}
+
+	/*
+	 * Override any inconsistent requests. Note that this is a change of
+	 * behaviour in 9.5; prior to this we simply ignored a request to pause if
+	 * hot_standby = off, which was surprising behaviour.
+	 */
+	if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
+		!EnableHotStandby)
+		recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
+
+	/*
+	 * Final parsing of recovery_target_time string; see also
+	 * check_recovery_target_time().
+	 */
+	if (recoveryTarget == RECOVERY_TARGET_TIME)
+	{
+		recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
+																	 CStringGetDatum(recovery_target_time_string),
+																	 ObjectIdGetDatum(InvalidOid),
+																	 Int32GetDatum(-1)));
+	}
+
+	/*
+	 * If user specified recovery_target_timeline, validate it or compute the
+	 * "latest" value.  We can't do this until after we've gotten the restore
+	 * command and set InArchiveRecovery, because we need to fetch timeline
+	 * history files from the archive.
+	 */
+	if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
+	{
+		TimeLineID	rtli = recoveryTargetTLIRequested;
+
+		/* Timeline 1 does not have a history file, all else should */
+		if (rtli != 1 && !existsTimeLineHistory(rtli))
+			ereport(FATAL,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("recovery target timeline %u does not exist",
+							rtli)));
+		recoveryTargetTLI = rtli;
+	}
+	else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
+	{
+		/* We start the "latest" search from pg_control's timeline */
+		recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
+	}
+	else
+	{
+		/*
+		 * else we just use the recoveryTargetTLI as already read from
+		 * ControlFile
+		 */
+		Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
+	}
+}
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index d17d660f460..c077f891a0e 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -21,10 +21,12 @@
 
 #include "access/timeline.h"
 #include "access/xlog.h"
+#include "access/xlogrecovery.h"
 #include "access/xlog_internal.h"
 #include "access/xlogutils.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "storage/fd.h"
 #include "storage/smgr.h"
 #include "utils/guc.h"
 #include "utils/hsearch.h"
@@ -34,6 +36,25 @@
 /* GUC variable */
 bool		ignore_invalid_pages = false;
 
+/*
+ * Are we doing recovery from XLOG?
+ *
+ * This is only ever true in the startup process; it should be read as meaning
+ * "this process is replaying WAL records", rather than "the system is in
+ * recovery mode".  It should be examined primarily by functions that need
+ * to act differently when called from a WAL redo function (e.g., to skip WAL
+ * logging).  To check whether the system is in recovery regardless of which
+ * process you're running in, use RecoveryInProgress() but only after shared
+ * memory startup and lock initialization.
+ *
+ * This is updated by xlogrecovery.c and xlog.c, but it lives here because it's
+ * mostly read by WAL redo functions.
+ */
+bool		InRecovery = false;
+
+/* Are we in Hot Standby mode? Only valid in startup process, see xlogutils.h */
+HotStandbyState standbyState = STANDBY_DISABLED;
+
 /*
  * During XLOG replay, we may see XLOG records for incremental updates of
  * pages that no longer exist, because their relation was later dropped or
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 2b159b60ebb..c959c7f462d 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -30,6 +30,7 @@
 #include "access/tableam.h"
 #include "access/xact.h"
 #include "access/xloginsert.h"
+#include "access/xlogrecovery.h"
 #include "access/xlogutils.h"
 #include "catalog/catalog.h"
 #include "catalog/dependency.h"
diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c
index 69ea155d502..27f9bfccf97 100644
--- a/src/backend/commands/tablespace.c
+++ b/src/backend/commands/tablespace.c
@@ -58,6 +58,7 @@
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "access/xloginsert.h"
+#include "access/xlogutils.h"
 #include "catalog/catalog.h"
 #include "catalog/dependency.h"
 #include "catalog/indexing.h"
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index 75a95f3de7a..63868e77aab 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -41,6 +41,7 @@
 
 #include "access/xlog.h"
 #include "access/xlog_internal.h"
+#include "access/xlogrecovery.h"
 #include "libpq/pqsignal.h"
 #include "miscadmin.h"
 #include "pgstat.h"
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 5a050898fec..6e3a4d27526 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -95,6 +95,7 @@
 
 #include "access/transam.h"
 #include "access/xlog.h"
+#include "access/xlogrecovery.h"
 #include "bootstrap/bootstrap.h"
 #include "catalog/pg_control.h"
 #include "common/file_perm.h"
diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c
index 69077bd2075..5d7914bf84c 100644
--- a/src/backend/postmaster/startup.c
+++ b/src/backend/postmaster/startup.c
@@ -20,6 +20,8 @@
 #include "postgres.h"
 
 #include "access/xlog.h"
+#include "access/xlogrecovery.h"
+#include "access/xlogutils.h"
 #include "libpq/pqsignal.h"
 #include "miscadmin.h"
 #include "pgstat.h"
diff --git a/src/backend/replication/logical/logicalfuncs.c b/src/backend/replication/logical/logicalfuncs.c
index 1f38c5b33ea..070f9ad2df3 100644
--- a/src/backend/replication/logical/logicalfuncs.c
+++ b/src/backend/replication/logical/logicalfuncs.c
@@ -19,6 +19,7 @@
 
 #include "access/xact.h"
 #include "access/xlog_internal.h"
+#include "access/xlogrecovery.h"
 #include "access/xlogutils.h"
 #include "catalog/pg_type.h"
 #include "fmgr.h"
diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c
index 31e74d38322..ee4b5675bb5 100644
--- a/src/backend/replication/slotfuncs.c
+++ b/src/backend/replication/slotfuncs.c
@@ -14,6 +14,7 @@
 
 #include "access/htup_details.h"
 #include "access/xlog_internal.h"
+#include "access/xlogrecovery.h"
 #include "access/xlogutils.h"
 #include "funcapi.h"
 #include "miscadmin.h"
diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c
index b94910bfe9a..689ff996dfb 100644
--- a/src/backend/replication/walreceiver.c
+++ b/src/backend/replication/walreceiver.c
@@ -56,6 +56,7 @@
 #include "access/transam.h"
 #include "access/xlog_internal.h"
 #include "access/xlogarchive.h"
+#include "access/xlogrecovery.h"
 #include "catalog/pg_authid.h"
 #include "catalog/pg_type.h"
 #include "common/ip.h"
diff --git a/src/backend/replication/walreceiverfuncs.c b/src/backend/replication/walreceiverfuncs.c
index 6f0acbfdef4..6ee810851f2 100644
--- a/src/backend/replication/walreceiverfuncs.c
+++ b/src/backend/replication/walreceiverfuncs.c
@@ -23,6 +23,7 @@
 #include <signal.h>
 
 #include "access/xlog_internal.h"
+#include "access/xlogrecovery.h"
 #include "pgstat.h"
 #include "postmaster/startup.h"
 #include "replication/walreceiver.h"
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index 32245363561..1effef3dee4 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -55,6 +55,7 @@
 #include "access/xact.h"
 #include "access/xlog_internal.h"
 #include "access/xlogreader.h"
+#include "access/xlogrecovery.h"
 #include "access/xlogutils.h"
 #include "catalog/pg_authid.h"
 #include "catalog/pg_type.h"
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 4b296a22c45..baacbcc9b13 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -34,7 +34,7 @@
 #include <unistd.h>
 
 #include "access/tableam.h"
-#include "access/xlog.h"
+#include "access/xlogutils.h"
 #include "catalog/catalog.h"
 #include "catalog/storage.h"
 #include "executor/instrument.h"
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 3e4ec53a97e..2bf879233c8 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -22,6 +22,7 @@
 #include "access/subtrans.h"
 #include "access/syncscan.h"
 #include "access/twophase.h"
+#include "access/xlogrecovery.h"
 #include "commands/async.h"
 #include "miscadmin.h"
 #include "pgstat.h"
@@ -126,6 +127,7 @@ CreateSharedMemoryAndSemaphores(void)
 		size = add_size(size, PredicateLockShmemSize());
 		size = add_size(size, ProcGlobalShmemSize());
 		size = add_size(size, XLOGShmemSize());
+		size = add_size(size, XLogRecoveryShmemSize());
 		size = add_size(size, CLOGShmemSize());
 		size = add_size(size, CommitTsShmemSize());
 		size = add_size(size, SUBTRANSShmemSize());
@@ -217,6 +219,7 @@ CreateSharedMemoryAndSemaphores(void)
 	 * Set up xlog, clog, and buffers
 	 */
 	XLOGShmemInit();
+	XLogRecoveryShmemInit();
 	CLOGShmemInit();
 	CommitTsShmemInit();
 	SUBTRANSShmemInit();
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 085bd1e4077..e8d02f27789 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -52,7 +52,7 @@
 #include "access/transam.h"
 #include "access/twophase.h"
 #include "access/xact.h"
-#include "access/xlog.h"
+#include "access/xlogutils.h"
 #include "catalog/catalog.h"
 #include "catalog/pg_authid.h"
 #include "commands/dbcommands.h"
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index 553b6e54603..99383b8b172 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -19,8 +19,9 @@
 #include "access/transam.h"
 #include "access/twophase.h"
 #include "access/xact.h"
-#include "access/xlog.h"
 #include "access/xloginsert.h"
+#include "access/xlogrecovery.h"
+#include "access/xlogutils.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/bufmgr.h"
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index 108b4d90238..e52350d6f64 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -37,6 +37,7 @@
 #include "access/twophase_rmgr.h"
 #include "access/xact.h"
 #include "access/xlog.h"
+#include "access/xlogutils.h"
 #include "miscadmin.h"
 #include "pg_trace.h"
 #include "pgstat.h"
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 2575ea1ca0d..8314cc9bbbc 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -38,6 +38,7 @@
 #include "access/transam.h"
 #include "access/twophase.h"
 #include "access/xact.h"
+#include "access/xlogutils.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "postmaster/autovacuum.h"
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 4dc24649df9..4209a22bf51 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -18,6 +18,7 @@
 #include "postgres.h"
 
 #include "access/xlog.h"
+#include "access/xlogutils.h"
 #include "lib/ilist.h"
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c
index bc3ceb27125..02e456077d4 100644
--- a/src/backend/storage/sync/sync.c
+++ b/src/backend/storage/sync/sync.c
@@ -29,6 +29,7 @@
 #include "portability/instr_time.h"
 #include "postmaster/bgwriter.h"
 #include "storage/bufmgr.h"
+#include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/md.h"
 #include "utils/hsearch.h"
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 68b62d523dc..33211366f91 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -41,6 +41,7 @@
 #include "access/twophase.h"
 #include "access/xact.h"
 #include "access/xlog_internal.h"
+#include "access/xlogrecovery.h"
 #include "catalog/namespace.h"
 #include "catalog/pg_authid.h"
 #include "catalog/storage.h"
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 77187c12beb..784fb5ad633 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -11,14 +11,12 @@
 #ifndef XLOG_H
 #define XLOG_H
 
-#include "access/rmgr.h"
 #include "access/xlogdefs.h"
 #include "access/xloginsert.h"
 #include "access/xlogreader.h"
 #include "datatype/timestamp.h"
 #include "lib/stringinfo.h"
 #include "nodes/pg_list.h"
-#include "storage/fd.h"
 
 
 /* Sync methods */
@@ -31,78 +29,10 @@ extern int	sync_method;
 
 extern PGDLLIMPORT TimeLineID ThisTimeLineID;	/* current TLI */
 
-/*
- * Prior to 8.4, all activity during recovery was carried out by the startup
- * process. This local variable continues to be used in many parts of the
- * code to indicate actions taken by RecoveryManagers. Other processes that
- * potentially perform work during recovery should check RecoveryInProgress().
- * See XLogCtl notes in xlog.c.
- */
-extern bool InRecovery;
-
-/*
- * Like InRecovery, standbyState is only valid in the startup process.
- * In all other processes it will have the value STANDBY_DISABLED (so
- * InHotStandby will read as false).
- *
- * In DISABLED state, we're performing crash recovery or hot standby was
- * disabled in postgresql.conf.
- *
- * In INITIALIZED state, we've run InitRecoveryTransactionEnvironment, but
- * we haven't yet processed a RUNNING_XACTS or shutdown-checkpoint WAL record
- * to initialize our primary-transaction tracking system.
- *
- * When the transaction tracking is initialized, we enter the SNAPSHOT_PENDING
- * state. The tracked information might still be incomplete, so we can't allow
- * connections yet, but redo functions must update the in-memory state when
- * appropriate.
- *
- * In SNAPSHOT_READY mode, we have full knowledge of transactions that are
- * (or were) running on the primary at the current WAL location. Snapshots
- * can be taken, and read-only queries can be run.
- */
-typedef enum
-{
-	STANDBY_DISABLED,
-	STANDBY_INITIALIZED,
-	STANDBY_SNAPSHOT_PENDING,
-	STANDBY_SNAPSHOT_READY
-} HotStandbyState;
-
-extern HotStandbyState standbyState;
-
-#define InHotStandby (standbyState >= STANDBY_SNAPSHOT_PENDING)
-
-/*
- * Recovery target type.
- * Only set during a Point in Time recovery, not when in standby mode.
- */
-typedef enum
-{
-	RECOVERY_TARGET_UNSET,
-	RECOVERY_TARGET_XID,
-	RECOVERY_TARGET_TIME,
-	RECOVERY_TARGET_NAME,
-	RECOVERY_TARGET_LSN,
-	RECOVERY_TARGET_IMMEDIATE
-} RecoveryTargetType;
-
-/*
- * Recovery target TimeLine goal
- */
-typedef enum
-{
-	RECOVERY_TARGET_TIMELINE_CONTROLFILE,
-	RECOVERY_TARGET_TIMELINE_LATEST,
-	RECOVERY_TARGET_TIMELINE_NUMERIC
-} RecoveryTargetTimeLineGoal;
-
 extern XLogRecPtr ProcLastRecPtr;
 extern XLogRecPtr XactLastRecEnd;
 extern PGDLLIMPORT XLogRecPtr XactLastCommitEnd;
 
-extern bool reachedConsistency;
-
 /* these variables are GUC parameters related to XLOG */
 extern int	wal_segment_size;
 extern int	min_wal_size_mb;
@@ -122,34 +52,10 @@ extern bool wal_recycle;
 extern bool *wal_consistency_checking;
 extern char *wal_consistency_checking_string;
 extern bool log_checkpoints;
-extern char *recoveryRestoreCommand;
-extern char *recoveryEndCommand;
-extern char *archiveCleanupCommand;
-extern bool recoveryTargetInclusive;
-extern int	recoveryTargetAction;
-extern int	recovery_min_apply_delay;
-extern char *PrimaryConnInfo;
-extern char *PrimarySlotName;
-extern bool wal_receiver_create_temp_slot;
 extern bool track_wal_io_timing;
 
-/* indirectly set via GUC system */
-extern TransactionId recoveryTargetXid;
-extern char *recovery_target_time_string;
-extern const char *recoveryTargetName;
-extern XLogRecPtr recoveryTargetLSN;
-extern RecoveryTargetType recoveryTarget;
-extern char *PromoteTriggerFile;
-extern RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal;
-extern TimeLineID recoveryTargetTLIRequested;
-extern TimeLineID recoveryTargetTLI;
-
 extern int	CheckPointSegments;
 
-/* option set locally in startup process only when signal files exist */
-extern bool StandbyModeRequested;
-extern bool StandbyMode;
-
 /* Archive modes */
 typedef enum ArchiveMode
 {
@@ -175,14 +81,6 @@ typedef enum RecoveryState
 	RECOVERY_STATE_DONE			/* currently in production */
 } RecoveryState;
 
-/* Recovery pause states */
-typedef enum RecoveryPauseState
-{
-	RECOVERY_NOT_PAUSED,		/* pause not requested */
-	RECOVERY_PAUSE_REQUESTED,	/* pause requested, but not yet paused */
-	RECOVERY_PAUSED				/* recovery is paused */
-} RecoveryPauseState;
-
 extern PGDLLIMPORT int wal_level;
 
 /* Is WAL archiving enabled (always or only while server is running normally)? */
@@ -297,7 +195,6 @@ extern void XLogFlush(XLogRecPtr RecPtr);
 extern bool XLogBackgroundFlush(void);
 extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
 extern int	XLogFileInit(XLogSegNo segno, bool *use_existent, bool use_lock);
-extern int	XLogFileOpen(XLogSegNo segno);
 
 extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli);
 extern XLogSegNo XLogGetLastRemovedSegno(void);
@@ -312,19 +209,10 @@ extern void issue_xlog_fsync(int fd, XLogSegNo segno);
 
 extern bool RecoveryInProgress(void);
 extern RecoveryState GetRecoveryState(void);
-extern bool HotStandbyActive(void);
-extern bool HotStandbyActiveInReplay(void);
 extern bool XLogInsertAllowed(void);
-extern void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream);
-extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI);
 extern XLogRecPtr GetXLogInsertRecPtr(void);
 extern XLogRecPtr GetXLogWriteRecPtr(void);
-extern RecoveryPauseState GetRecoveryPauseState(void);
-extern void SetRecoveryPause(bool recoveryPause);
-extern TimestampTz GetLatestXTime(void);
-extern TimestampTz GetCurrentChunkReplayStartTime(void);
 
-extern void UpdateControlFile(void);
 extern uint64 GetSystemIdentifier(void);
 extern char *GetMockAuthenticationNonce(void);
 extern bool DataChecksumsEnabled(void);
@@ -348,15 +236,14 @@ extern XLogRecPtr GetRedoRecPtr(void);
 extern XLogRecPtr GetInsertRecPtr(void);
 extern XLogRecPtr GetFlushRecPtr(void);
 extern XLogRecPtr GetLastImportantRecPtr(void);
-extern void RemovePromoteSignalFiles(void);
 
-extern bool PromoteIsTriggered(void);
-extern bool CheckPromoteSignal(void);
-extern void WakeupRecovery(void);
 extern void SetWalWriterSleeping(bool sleeping);
 
-extern void StartupRequestWalReceiverRestart(void);
-extern void XLogRequestWalReceiverReply(void);
+extern void RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI);
+
+extern bool XLogCheckpointNeeded(XLogSegNo new_segno);
+extern void SwitchIntoArchiveRecovery(XLogRecPtr EndRecPtr);
+extern void ReachedEndOfBackup(XLogRecPtr EndRecPtr, TimeLineID tli);
 
 extern void assign_max_wal_size(int newval, void *extra);
 extern void assign_checkpoint_completion_target(double newval, void *extra);
diff --git a/src/include/access/xlogrecovery.h b/src/include/access/xlogrecovery.h
new file mode 100644
index 00000000000..b044ffbc808
--- /dev/null
+++ b/src/include/access/xlogrecovery.h
@@ -0,0 +1,117 @@
+/*
+ * xlogrecovery.h
+ *
+ * Functions for WAL recovery and standby mode
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/access/xlogrecovery.h
+ */
+#ifndef XLOGRECOVERY_H
+#define XLOGRECOVERY_H
+
+#include "access/xlogreader.h"
+#include "catalog/pg_control.h"
+#include "lib/stringinfo.h"
+#include "utils/timestamp.h"
+
+/*
+ * Recovery target type.
+ * Only set during a Point in Time recovery, not when in standby mode.
+ */
+typedef enum
+{
+	RECOVERY_TARGET_UNSET,
+	RECOVERY_TARGET_XID,
+	RECOVERY_TARGET_TIME,
+	RECOVERY_TARGET_NAME,
+	RECOVERY_TARGET_LSN,
+	RECOVERY_TARGET_IMMEDIATE
+} RecoveryTargetType;
+
+/*
+ * Recovery target TimeLine goal
+ */
+typedef enum
+{
+	RECOVERY_TARGET_TIMELINE_CONTROLFILE,
+	RECOVERY_TARGET_TIMELINE_LATEST,
+	RECOVERY_TARGET_TIMELINE_NUMERIC
+} RecoveryTargetTimeLineGoal;
+
+/* Recovery pause states */
+typedef enum RecoveryPauseState
+{
+	RECOVERY_NOT_PAUSED,		/* pause not requested */
+	RECOVERY_PAUSE_REQUESTED,	/* pause requested, but not yet paused */
+	RECOVERY_PAUSED				/* recovery is paused */
+} RecoveryPauseState;
+
+/* User-settable GUC parameters */
+extern bool recoveryTargetInclusive;
+extern int	recoveryTargetAction;
+extern int	recovery_min_apply_delay;
+extern char *PrimaryConnInfo;
+extern char *PrimarySlotName;
+extern char *recoveryRestoreCommand;
+extern char *recoveryEndCommand;
+extern char *archiveCleanupCommand;
+
+/* indirectly set via GUC system */
+extern TransactionId recoveryTargetXid;
+extern char *recovery_target_time_string;
+extern TimestampTz recoveryTargetTime;
+extern const char *recoveryTargetName;
+extern XLogRecPtr recoveryTargetLSN;
+extern RecoveryTargetType recoveryTarget;
+extern char *PromoteTriggerFile;
+extern bool wal_receiver_create_temp_slot;
+extern RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal;
+extern TimeLineID recoveryTargetTLIRequested;
+extern TimeLineID recoveryTargetTLI;
+
+/* Have we already reached a consistent database state? */
+extern bool reachedConsistency;
+
+/* Are we currently in standby mode? */
+extern bool StandbyMode;
+
+extern Size XLogRecoveryShmemSize(void);
+extern void XLogRecoveryShmemInit(void);
+
+extern void InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdownPtr, bool *haveBackupLabel, bool *haveTblspcMap);
+extern void PerformWalRecovery(void);
+
+extern void EndWalRecovery(XLogRecPtr *LastRec, XLogRecPtr *EndOfLog, TimeLineID *EndOfLogTLI, XLogRecPtr *lastPageBeginPtr, char **lastPage, char **reason,
+						   bool *bgwriterLaunched,
+						   bool *standby_signal_file_found_p,
+						   bool *recovery_signal_file_found_p);
+extern void FreeWalRecovery(void);
+extern void RemovePromoteSignalFiles(void);
+
+extern XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt, bool report);
+
+extern void HandleBackupEndRecord(XLogRecPtr startpoint, XLogRecPtr endLsn, TimeLineID endTLI);
+
+extern bool HotStandbyActive(void);
+extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI);
+extern RecoveryPauseState GetRecoveryPauseState(void);
+extern void SetRecoveryPause(bool recoveryPause);
+extern void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream);
+extern TimestampTz GetLatestXTime(void);
+extern TimestampTz GetCurrentChunkReplayStartTime(void);
+extern XLogRecPtr GetCurrentReplayRecPtr(TimeLineID *replayEndTLI);
+
+extern bool PromoteIsTriggered(void);
+extern bool CheckPromoteSignal(void);
+extern void WakeupRecovery(void);
+
+extern void StartupRequestWalReceiverRestart(void);
+extern void XLogRequestWalReceiverReply(void);
+
+extern void RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue);
+
+extern void xlog_outdesc(StringInfo buf, XLogReaderState *record);
+
+#endif							/* XLOGRECOVERY_H */
diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h
index 9ac602b674d..a5cb3d322c5 100644
--- a/src/include/access/xlogutils.h
+++ b/src/include/access/xlogutils.h
@@ -14,6 +14,48 @@
 #include "access/xlogreader.h"
 #include "storage/bufmgr.h"
 
+/*
+ * Prior to 8.4, all activity during recovery was carried out by the startup
+ * process. This local variable continues to be used in many parts of the
+ * code to indicate actions taken by RecoveryManagers. Other processes that
+ * potentially perform work during recovery should check RecoveryInProgress().
+ * See XLogCtl notes in xlog.c.
+ */
+extern bool InRecovery;
+
+/*
+ * Like InRecovery, standbyState is only valid in the startup process.
+ * In all other processes it will have the value STANDBY_DISABLED (so
+ * InHotStandby will read as false).
+ *
+ * In DISABLED state, we're performing crash recovery or hot standby was
+ * disabled in postgresql.conf.
+ *
+ * In INITIALIZED state, we've run InitRecoveryTransactionEnvironment, but
+ * we haven't yet processed a RUNNING_XACTS or shutdown-checkpoint WAL record
+ * to initialize our primary-transaction tracking system.
+ *
+ * When the transaction tracking is initialized, we enter the SNAPSHOT_PENDING
+ * state. The tracked information might still be incomplete, so we can't allow
+ * connections yet, but redo functions must update the in-memory state when
+ * appropriate.
+ *
+ * In SNAPSHOT_READY mode, we have full knowledge of transactions that are
+ * (or were) running on the primary at the current WAL location. Snapshots
+ * can be taken, and read-only queries can be run.
+ */
+typedef enum
+{
+	STANDBY_DISABLED,
+	STANDBY_INITIALIZED,
+	STANDBY_SNAPSHOT_PENDING,
+	STANDBY_SNAPSHOT_READY
+} HotStandbyState;
+
+extern HotStandbyState standbyState;
+
+#define InHotStandby (standbyState >= STANDBY_SNAPSHOT_PENDING)
+
 
 extern bool XLogHaveInvalidPages(void);
 extern void XLogCheckInvalidPages(void);
-- 
2.30.2

