*** src/backend/access/transam/clog.c
--- src/backend/access/transam/clog.c
***************
*** 475,480 **** ZeroCLOGPage(int pageno, bool writeXlog)
--- 475,483 ----
  /*
   * This must be called ONCE during postmaster or standalone-backend startup,
   * after StartupXLOG has initialized ShmemVariableCache->nextXid.
+  *
+  * We access just a single clog page, so this action is atomic and safe
+  * for use if other processes are active during recovery.
   */
  void
  StartupCLOG(void)
*** src/backend/access/transam/multixact.c
--- src/backend/access/transam/multixact.c
***************
*** 1413,1420 **** ZeroMultiXactMemberPage(int pageno, bool writeXlog)
   * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact.	Note that we
   * may already have replayed WAL data into the SLRU files.
   *
!  * We don't need any locks here, really; the SLRU locks are taken
!  * only because slru.c expects to be called with locks held.
   */
  void
  StartupMultiXact(void)
--- 1413,1423 ----
   * MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact.	Note that we
   * may already have replayed WAL data into the SLRU files.
   *
!  * We want this operation to be atomic to ensure that other processes can 
!  * use MultiXact while we complete recovery. We access one page only from the
!  * offset and members buffers, so once locks are acquired they will not be
!  * dropped and re-acquired by SLRU code. So we take both locks at start, then
!  * hold them all the way to the end.
   */
  void
  StartupMultiXact(void)
***************
*** 1426,1431 **** StartupMultiXact(void)
--- 1429,1435 ----
  
  	/* Clean up offsets state */
  	LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
+ 	LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
  
  	/*
  	 * Initialize our idea of the latest page number.
***************
*** 1452,1461 **** StartupMultiXact(void)
  		MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
  	}
  
- 	LWLockRelease(MultiXactOffsetControlLock);
- 
  	/* And the same for members */
- 	LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE);
  
  	/*
  	 * Initialize our idea of the latest page number.
--- 1456,1462 ----
***************
*** 1483,1488 **** StartupMultiXact(void)
--- 1484,1490 ----
  	}
  
  	LWLockRelease(MultiXactMemberControlLock);
+ 	LWLockRelease(MultiXactOffsetControlLock);
  
  	/*
  	 * Initialize lastTruncationPoint to invalid, ensuring that the first
***************
*** 1542,1549 **** CheckPointMultiXact(void)
  	 * isn't valid (because StartupMultiXact hasn't been called yet) and so
  	 * SimpleLruTruncate would get confused.  It seems best not to risk
  	 * removing any data during recovery anyway, so don't truncate.
  	 */
! 	if (!InRecovery)
  		TruncateMultiXact();
  
  	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
--- 1544,1552 ----
  	 * isn't valid (because StartupMultiXact hasn't been called yet) and so
  	 * SimpleLruTruncate would get confused.  It seems best not to risk
  	 * removing any data during recovery anyway, so don't truncate.
+ 	 * We are executing in the bgwriter, so we must access shared status.
  	 */
! 	if (!IsRecoveryProcessingMode())
  		TruncateMultiXact();
  
  	TRACE_POSTGRESQL_MULTIXACT_CHECKPOINT_DONE(true);
*** src/backend/access/transam/slru.c
--- src/backend/access/transam/slru.c
***************
*** 598,604 **** SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
  	 * commands to set the commit status of transactions whose bits are in
  	 * already-truncated segments of the commit log (see notes in
  	 * SlruPhysicalWritePage).	Hence, if we are InRecovery, allow the case
! 	 * where the file doesn't exist, and return zeroes instead.
  	 */
  	fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
  	if (fd < 0)
--- 598,605 ----
  	 * commands to set the commit status of transactions whose bits are in
  	 * already-truncated segments of the commit log (see notes in
  	 * SlruPhysicalWritePage).	Hence, if we are InRecovery, allow the case
! 	 * where the file doesn't exist, and return zeroes instead. We also
! 	 * return a zeroed page when the seek or the read fails.
  	 */
  	fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
  	if (fd < 0)
***************
*** 619,624 **** SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
--- 620,633 ----
  
  	if (lseek(fd, (off_t) offset, SEEK_SET) < 0)
  	{
+ 		if (InRecovery)
+ 		{
+ 			close(fd);		/* opened above; don't leak it on this early return */
+ 			ereport(LOG,
+ 					(errmsg("could not seek in file \"%s\", reading as zeroes", path)));
+ 			MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+ 			return true;
+ 		}
  		slru_errcause = SLRU_SEEK_FAILED;
  		slru_errno = errno;
  		close(fd);
***************
*** 628,633 **** SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
--- 637,650 ----
  	errno = 0;
  	if (read(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ)
  	{
+ 		if (InRecovery)
+ 		{
+ 			close(fd);		/* opened above; don't leak it on this early return */
+ 			ereport(LOG,
+ 					(errmsg("could not read from file \"%s\", reading as zeroes", path)));
+ 			MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+ 			return true;
+ 		}
  		slru_errcause = SLRU_READ_FAILED;
  		slru_errno = errno;
  		close(fd);
*** src/backend/access/transam/subtrans.c
--- src/backend/access/transam/subtrans.c
***************
*** 223,255 **** ZeroSUBTRANSPage(int pageno)
  /*
   * This must be called ONCE during postmaster or standalone-backend startup,
   * after StartupXLOG has initialized ShmemVariableCache->nextXid.
-  *
-  * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid
-  * if there are none.
   */
  void
  StartupSUBTRANS(TransactionId oldestActiveXID)
  {
! 	int			startPage;
! 	int			endPage;
  
- 	/*
- 	 * Since we don't expect pg_subtrans to be valid across crashes, we
- 	 * initialize the currently-active page(s) to zeroes during startup.
- 	 * Whenever we advance into a new page, ExtendSUBTRANS will likewise zero
- 	 * the new page without regard to whatever was previously on disk.
- 	 */
  	LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
  
! 	startPage = TransactionIdToPage(oldestActiveXID);
! 	endPage = TransactionIdToPage(ShmemVariableCache->nextXid);
! 
! 	while (startPage != endPage)
! 	{
! 		(void) ZeroSUBTRANSPage(startPage);
! 		startPage++;
! 	}
! 	(void) ZeroSUBTRANSPage(startPage);
  
  	LWLockRelease(SubtransControlLock);
  }
--- 223,241 ----
  /*
   * This must be called ONCE during postmaster or standalone-backend startup,
   * after StartupXLOG has initialized ShmemVariableCache->nextXid.
   */
  void
  StartupSUBTRANS(TransactionId oldestActiveXID)
  {
! 	TransactionId xid = ShmemVariableCache->nextXid;
! 	int			pageno = TransactionIdToPage(xid);
  
  	LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
  
! 	/*
! 	 * Initialize our idea of the latest page number.
! 	 */
! 	SubTransCtl->shared->latest_page_number = pageno;
  
  	LWLockRelease(SubtransControlLock);
  }
*** src/backend/access/transam/xact.c
--- src/backend/access/transam/xact.c
***************
*** 40,45 ****
--- 40,46 ----
  #include "storage/fd.h"
  #include "storage/lmgr.h"
  #include "storage/procarray.h"
+ #include "storage/sinval.h"
  #include "storage/sinvaladt.h"
  #include "storage/smgr.h"
  #include "utils/combocid.h"
*** src/backend/access/transam/xlog.c
--- src/backend/access/transam/xlog.c
***************
*** 114,120 **** CheckpointStatsData CheckpointStats;
  
  /*
   * ThisTimeLineID will be same in all backends --- it identifies current
!  * WAL timeline for the database system.
   */
  TimeLineID	ThisTimeLineID = 0;
  
--- 114,121 ----
  
  /*
   * ThisTimeLineID will be same in all backends --- it identifies current
!  * WAL timeline for the database system. Zero is always a bug, so we 
!  * start with that to allow us to spot any errors.
   */
  TimeLineID	ThisTimeLineID = 0;
  
***************
*** 122,128 **** TimeLineID	ThisTimeLineID = 0;
  bool		InRecovery = false;
  
  /* Are we recovering using offline XLOG archives? */
! static bool InArchiveRecovery = false;
  
  /* Was the last xlog file restored from archive, or local? */
  static bool restoredFromArchive = false;
--- 123,136 ----
  bool		InRecovery = false;
  
  /* Are we recovering using offline XLOG archives? */
! bool 		InArchiveRecovery = false;
! 
! /* Local copy of shared RecoveryProcessingMode state */
! static bool LocalRecoveryProcessingMode = true;
! static bool knownProcessingMode = false;
! 
! /* is the database proven consistent yet? */
! bool	reachedSafeStartPoint = false;
  
  /* Was the last xlog file restored from archive, or local? */
  static bool restoredFromArchive = false;
***************
*** 241,250 **** static XLogRecPtr RedoRecPtr;
   * ControlFileLock: must be held to read/update control file or create
   * new log file.
   *
!  * CheckpointLock: must be held to do a checkpoint (ensures only one
!  * checkpointer at a time; currently, with all checkpoints done by the
!  * bgwriter, this is just pro forma).
   *
   *----------
   */
  
--- 249,278 ----
   * ControlFileLock: must be held to read/update control file or create
   * new log file.
   *
!  * CheckpointLock: must be held to do a checkpoint or restartpoint, ensuring
!  * we get just one of those at any time. In 8.4+ recovery, both startup and
!  * bgwriter processes may take restartpoints, so this locking must be strict 
!  * to ensure there are no mistakes.
   *
+  * In 8.4 we progress through a number of states at startup. Initially, the
+  * postmaster is in PM_STARTUP state and spawns the Startup process. We then
+  * progress until the database is in a consistent state, then if we are in
+  * InArchiveRecovery we go into PM_RECOVERY state. The bgwriter then starts
+  * up and takes over responsibility for performing restartpoints. We then
+  * progress until the end of recovery when we enter PM_RUN state upon
+  * termination of the Startup process. In summary:
+  * 
+  * PM_STARTUP state:	Startup process performs restartpoints
+  * PM_RECOVERY state:	bgwriter process performs restartpoints
+  * PM_RUN state: 		bgwriter process performs checkpoints
+  *
+  * These transitions are fairly delicate, with many things that need to
+  * happen at the same time in order to change state successfully throughout
+  * the system. Changing PM_STARTUP to PM_RECOVERY only occurs when we can
+  * prove the databases are in a consistent state. Changing from PM_RECOVERY
+  * to PM_RUN happens whenever recovery ends, which could be forced upon us
+  * externally or it can occur because of damage or termination of the WAL
+  * sequence.
   *----------
   */
  
***************
*** 312,317 **** typedef struct XLogCtlData
--- 340,372 ----
  	int			XLogCacheBlck;	/* highest allocated xlog buffer index */
  	TimeLineID	ThisTimeLineID;
  
+ 	/*
+ 	 * SharedRecoveryProcessingMode shows whether the postmaster is in a
+ 	 * postmaster state earlier than PM_RUN, or not. This is a globally
+ 	 * accessible state to allow for the EXEC_BACKEND case.
+ 	 *
+ 	 * We also retain a local state variable InRecovery. InRecovery=true
+ 	 * means the code is being executed by Startup process and therefore
+ 	 * always during Recovery Processing Mode. This allows us to identify
+ 	 * code executed *during* Recovery Processing Mode but not necessarily
+ 	 * by Startup process itself.
+ 	 *
+ 	 * This is only written to by the startup process, so no need for locking.
+ 	 */
+ 	bool		SharedRecoveryProcessingMode;
+ 
+ 	/*
+ 	 * recovery target control information
+ 	 *
+ 	 * Protected by info_lck
+ 	 */
+ 	TransactionId	recoveryTargetXid;
+ 	TimestampTz		recoveryTargetTime;
+ 	int				recoveryTargetAdvance;
+ 
+ 	TimestampTz 	recoveryLastXTime;
+ 	TransactionId 	recoveryLastXid;
+ 
  	slock_t		info_lck;		/* locks shared variables shown above */
  } XLogCtlData;
  
***************
*** 398,405 **** static void XLogArchiveCleanup(const char *xlog);
--- 453,462 ----
  static void readRecoveryCommandFile(void);
  static void exitArchiveRecovery(TimeLineID endTLI,
  					uint32 endLogId, uint32 endLogSeg);
+ static void exitRecovery(void);
  static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
  static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
+ static XLogRecPtr GetRedoLocationForCheckpoint(void);
  
  static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
  				XLogRecPtr *lsn, BkpBlock *bkpb);
***************
*** 482,487 **** XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
--- 539,552 ----
  	bool		updrqst;
  	bool		doPageWrites;
  	bool		isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
+ 	bool		isRecoveryEnd = (rmid == RM_XLOG_ID && 
+ 									(info == XLOG_RECOVERY_END ||
+ 									 info == XLOG_CHECKPOINT_ONLINE));
+ 
+ 	/* cross-check on whether we should be here or not */
+ 	if (IsRecoveryProcessingMode() && !isRecoveryEnd)
+ 		elog(FATAL, "cannot make new WAL entries during recovery "
+ 					"(RMgrId = %d info = %d)", rmid, info);
  
  	/* info's high bits are reserved for use by me */
  	if (info & XLR_INFO_MASK)
***************
*** 1728,1735 **** XLogFlush(XLogRecPtr record)
  	XLogRecPtr	WriteRqstPtr;
  	XLogwrtRqst WriteRqst;
  
! 	/* Disabled during REDO */
! 	if (InRedo)
  		return;
  
  	/* Quick exit if already known flushed */
--- 1793,1799 ----
  	XLogRecPtr	WriteRqstPtr;
  	XLogwrtRqst WriteRqst;
  
! 	if (IsRecoveryProcessingMode())
  		return;
  
  	/* Quick exit if already known flushed */
***************
*** 1817,1826 **** XLogFlush(XLogRecPtr record)
  	 * the bad page is encountered again during recovery then we would be
  	 * unable to restart the database at all!  (This scenario has actually
  	 * happened in the field several times with 7.1 releases. Note that we
! 	 * cannot get here while InRedo is true, but if the bad page is brought in
! 	 * and marked dirty during recovery then CreateCheckPoint will try to
! 	 * flush it at the end of recovery.)
  	 *
  	 * The current approach is to ERROR under normal conditions, but only
  	 * WARNING during recovery, so that the system can be brought up even if
  	 * there's a corrupt LSN.  Note that for calls from xact.c, the ERROR will
--- 1881,1891 ----
  	 * the bad page is encountered again during recovery then we would be
  	 * unable to restart the database at all!  (This scenario has actually
  	 * happened in the field several times with 7.1 releases. Note that we
! 	 * cannot get here while IsRecoveryProcessingMode(), but if the bad page is
! 	 * brought in and marked dirty during recovery, the next checkpoint after
! 	 * recovery will try to flush it.)
  	 *
+ 	 * XXX obsolete comment
  	 * The current approach is to ERROR under normal conditions, but only
  	 * WARNING during recovery, so that the system can be brought up even if
  	 * there's a corrupt LSN.  Note that for calls from xact.c, the ERROR will
***************
*** 1829,1835 **** XLogFlush(XLogRecPtr record)
  	 * and so we will not force a restart for a bad LSN on a data page.
  	 */
  	if (XLByteLT(LogwrtResult.Flush, record))
! 		elog(InRecovery ? WARNING : ERROR,
  		"xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
  			 record.xlogid, record.xrecoff,
  			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
--- 1894,1900 ----
  	 * and so we will not force a restart for a bad LSN on a data page.
  	 */
  	if (XLByteLT(LogwrtResult.Flush, record))
! 		elog(ERROR,
  		"xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
  			 record.xlogid, record.xrecoff,
  			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
***************
*** 2102,2108 **** XLogFileInit(uint32 log, uint32 seg,
  		unlink(tmppath);
  	}
  
! 	elog(DEBUG2, "done creating and filling new WAL file");
  
  	/* Set flag to tell caller there was no existent file */
  	*use_existent = false;
--- 2167,2174 ----
  		unlink(tmppath);
  	}
  
! 	XLogFileName(tmppath, ThisTimeLineID, log, seg);
! 	elog(DEBUG2, "done creating and filling new WAL file %s", tmppath);
  
  	/* Set flag to tell caller there was no existent file */
  	*use_existent = false;
***************
*** 2408,2413 **** XLogFileRead(uint32 log, uint32 seg, int emode)
--- 2474,2501 ----
  					 xlogfname);
  			set_ps_display(activitymsg, false);
  
+ 			/* 
+ 			 * Calculate and write out a new safeStartPoint. This defines
+ 			 * the latest LSN that might appear on-disk while we apply
+ 			 * the WAL records in this file. If we crash during recovery
+ 			 * we must reach this point again before we can prove
+ 			 * database consistency. Not a restartpoint! Restart points
+ 			 * define where we should start recovery from, if we crash.
+ 			 */
+ 			if (InArchiveRecovery)
+ 			{
+ 				uint32 nextLog = log;
+ 				uint32 nextSeg = seg;
+ 
+ 				NextLogSeg(nextLog, nextSeg);
+ 
+ 				LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ 				ControlFile->minSafeStartPoint.xlogid = nextLog;
+ 				ControlFile->minSafeStartPoint.xrecoff = nextSeg * XLogSegSize;
+ 				UpdateControlFile();
+ 				LWLockRelease(ControlFileLock);
+ 			}
+ 
  			return fd;
  		}
  		if (errno != ENOENT)	/* unexpected failure? */
***************
*** 4733,4754 **** exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
  	unlink(recoveryPath);		/* ignore any error */
  
  	/*
! 	 * Rename the config file out of the way, so that we don't accidentally
! 	 * re-enter archive recovery mode in a subsequent crash.
  	 */
- 	unlink(RECOVERY_COMMAND_DONE);
- 	if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
- 		ereport(FATAL,
- 				(errcode_for_file_access(),
- 				 errmsg("could not rename file \"%s\" to \"%s\": %m",
- 						RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
  
  	ereport(LOG,
  			(errmsg("archive recovery complete")));
  }
  
  /*
!  * For point-in-time recovery, this function decides whether we want to
   * stop applying the XLOG at or after the current record.
   *
   * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
--- 4821,4840 ----
  	unlink(recoveryPath);		/* ignore any error */
  
  	/*
! 	 * As of 8.4 we no longer rename the recovery.conf file out of the
! 	 * way until after we have performed a full checkpoint. This ensures
! 	 * that any crash between now and the end of the checkpoint does not
! 	 * attempt to restart from a WAL file that is no longer available to us.
! 	 * As soon as we remove recovery.conf we lose our restore_command and
! 	 * cannot reaccess WAL files from the archive.
  	 */
  
  	ereport(LOG,
  			(errmsg("archive recovery complete")));
  }
  
  /*
!  * For archive recovery, this function decides whether we want to
   * stop applying the XLOG at or after the current record.
   *
   * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
***************
*** 4876,4881 **** StartupXLOG(void)
--- 4962,4968 ----
  	CheckPoint	checkPoint;
  	bool		wasShutdown;
  	bool		reachedStopPoint = false;
+ 	bool		performedRecovery = false;
  	bool		haveBackupLabel = false;
  	XLogRecPtr	RecPtr,
  				LastRec,
***************
*** 4888,4893 **** StartupXLOG(void)
--- 4975,4982 ----
  	uint32		freespace;
  	TransactionId oldestActiveXID;
  
+ 	XLogCtl->SharedRecoveryProcessingMode = true;
+ 
  	/*
  	 * Read control file and check XLOG status looks valid.
  	 *
***************
*** 5108,5116 **** StartupXLOG(void)
--- 5197,5211 ----
  		if (minRecoveryLoc.xlogid != 0 || minRecoveryLoc.xrecoff != 0)
  			ControlFile->minRecoveryPoint = minRecoveryLoc;
  		ControlFile->time = (pg_time_t) time(NULL);
+ 		/* No need to hold ControlFileLock yet, we aren't up far enough */
  		UpdateControlFile();
  
  		/*
+ 		 * Reset pgstat data, because it may be invalid after recovery.
+ 		 */
+ 		pgstat_reset_all();
+ 
+ 		/*
  		 * If there was a backup label file, it's done its job and the info
  		 * has now been propagated into pg_control.  We must get rid of the
  		 * label file so that if we crash during recovery, we'll pick up at
***************
*** 5220,5225 **** StartupXLOG(void)
--- 5315,5348 ----
  
  				LastRec = ReadRecPtr;
  
+ 				/*
+ 				 * Can we signal Postmaster to enter consistent recovery mode?
+ 				 *
+ 				 * There are two points in the log that we must pass. The first
+ 				 * is minRecoveryPoint, which is the LSN at the time the
+ 				 * base backup was taken that we are about to rollforward from.
+ 				 * If recovery has ever crashed or was stopped there is also
+ 				 * another point also: minSafeStartPoint, which we know the
+ 				 * latest LSN that recovery could have reached prior to crash.
+ 				 *
+ 				 * We must also have assembled sufficient information about
+ 				 * transaction state to allow valid snapshots to be taken.
+ 				 */
+ 				if (!reachedSafeStartPoint &&
+ 					 XLByteLE(ControlFile->minSafeStartPoint, EndRecPtr) && 
+ 					 XLByteLE(ControlFile->minRecoveryPoint, EndRecPtr))
+ 				{
+ 					reachedSafeStartPoint = true;
+ 					if (InArchiveRecovery)
+ 					{
+ 						ereport(LOG,
+ 							(errmsg("database has now reached consistent state at %X/%X",
+ 								EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+ 						if (IsUnderPostmaster)
+ 							SendPostmasterSignal(PMSIGNAL_RECOVERY_START);
+ 					}
+ 				}
+ 
  				record = ReadRecord(NULL, LOG);
  			} while (record != NULL && recoveryContinue);
  
***************
*** 5241,5246 **** StartupXLOG(void)
--- 5364,5370 ----
  			/* there are no WAL records following the checkpoint */
  			ereport(LOG,
  					(errmsg("redo is not required")));
+ 			reachedSafeStartPoint = true;
  		}
  	}
  
***************
*** 5254,5269 **** StartupXLOG(void)
  
  	/*
  	 * Complain if we did not roll forward far enough to render the backup
! 	 * dump consistent.
  	 */
! 	if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
  	{
  		if (reachedStopPoint)	/* stopped because of stop request */
  			ereport(FATAL,
  					(errmsg("requested recovery stop point is before end time of backup dump")));
  		else	/* ran off end of WAL */
  			ereport(FATAL,
! 					(errmsg("WAL ends before end time of backup dump")));
  	}
  
  	/*
--- 5378,5393 ----
  
  	/*
  	 * Complain if we did not roll forward far enough to render the backup
! 	 * dump consistent and start safely.
  	 */
! 	if (InArchiveRecovery && !reachedSafeStartPoint)
  	{
  		if (reachedStopPoint)	/* stopped because of stop request */
  			ereport(FATAL,
  					(errmsg("requested recovery stop point is before end time of backup dump")));
  		else	/* ran off end of WAL */
  			ereport(FATAL,
! 					(errmsg("end of WAL reached before end time of backup dump")));
  	}
  
  	/*
***************
*** 5378,5416 **** StartupXLOG(void)
  		XLogCheckInvalidPages();
  
  		/*
! 		 * Reset pgstat data, because it may be invalid after recovery.
  		 */
! 		pgstat_reset_all();
  
! 		/*
! 		 * Perform a checkpoint to update all our recovery activity to disk.
! 		 *
! 		 * Note that we write a shutdown checkpoint rather than an on-line
! 		 * one. This is not particularly critical, but since we may be
! 		 * assigning a new TLI, using a shutdown checkpoint allows us to have
! 		 * the rule that TLI only changes in shutdown checkpoints, which
! 		 * allows some extra error checking in xlog_redo.
! 		 */
! 		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
  	}
  
- 	/*
- 	 * Preallocate additional log files, if wanted.
- 	 */
- 	PreallocXlogFiles(EndOfLog);
- 
- 	/*
- 	 * Okay, we're officially UP.
- 	 */
- 	InRecovery = false;
- 
- 	ControlFile->state = DB_IN_PRODUCTION;
- 	ControlFile->time = (pg_time_t) time(NULL);
- 	UpdateControlFile();
- 
- 	/* start the archive_timeout timer running */
- 	XLogCtl->Write.lastSegSwitchTime = ControlFile->time;
- 
  	/* initialize shared-memory copy of latest checkpoint XID/epoch */
  	XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
  	XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;
--- 5502,5515 ----
  		XLogCheckInvalidPages();
  
  		/*
! 		 * Finally exit recovery and mark that in WAL. Pre-8.4 we wrote
! 		 * a shutdown checkpoint here, but we ask bgwriter to do that now.
  		 */
! 		exitRecovery();
  
! 		performedRecovery = true;
  	}
  
  	/* initialize shared-memory copy of latest checkpoint XID/epoch */
  	XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
  	XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;
***************
*** 5444,5449 **** StartupXLOG(void)
--- 5543,5641 ----
  		readRecordBuf = NULL;
  		readRecordBufSize = 0;
  	}
+ 
+ 	/*
+ 	 * Prior to 8.4 we wrote a Shutdown Checkpoint at the end of recovery.
+ 	 * This could add minutes to the startup time, so we want bgwriter
+ 	 * to perform it. This then frees the Startup process to complete so we can
+ 	 * allow transactions and WAL inserts. We still write a checkpoint, but
+ 	 * it will be an online checkpoint. Online checkpoints have a redo
+ 	 * location that can be prior to the actual checkpoint record. So we want
+ 	 * to derive that redo location *before* we let anybody else write WAL,
+ 	 * otherwise we might miss some WAL records if we crash.
+ 	 */
+ 	if (performedRecovery)
+ 	{
+ 		XLogRecPtr	redo;
+ 
+ 		/* 
+ 		 * We must grab the pointer before anybody writes WAL 
+ 		 */
+ 		redo = GetRedoLocationForCheckpoint();
+ 
+ 		/* 
+ 		 * Set up information for the bgwriter, but if it is not active
+ 		 * for whatever reason, perform the checkpoint ourselves.
+ 		 */
+ 		if (SetRedoLocationForArchiveCheckpoint(redo))
+ 		{
+ 			/*
+ 			 * Okay, we can come up now. Allow others to write WAL.
+ 			 */
+ 			XLogCtl->SharedRecoveryProcessingMode = false;
+ 
+ 			/*
+ 			 * Now request checkpoint from bgwriter.
+ 			 */
+ 			RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE);
+ 		}
+ 		else
+ 		{
+ 			/*
+ 			 * Startup process performs the checkpoint, but defers
+ 			 * the change in processing mode until afterwards.
+ 			 */
+ 			CreateCheckPoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE);
+ 		}
+ 	}
+ 	else
+ 	{
+ 		/*
+ 		 * No recovery, so let's just get on with it.
+ 		 */
+ 		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ 		ControlFile->state = DB_IN_PRODUCTION;
+ 		ControlFile->time = (pg_time_t) time(NULL);
+ 		UpdateControlFile();
+ 		LWLockRelease(ControlFileLock);
+ 	}
+ 
+ 	/*
+ 	 * Okay, we can come up now. Allow others to write WAL.
+ 	 */
+ 	XLogCtl->SharedRecoveryProcessingMode = false;
+ 
+ 	/* start the archive_timeout timer running */
+ 	XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
+ }
+ 
+ /*
+  * IsRecoveryProcessingMode()
+  *
+  * Fast test for whether we're still in recovery or not. We test the shared
+  * state each time only until we leave recovery mode. After that we never
+  * look again, relying upon the settings of our local state variables. This
+  * is designed to avoid the need for a separate initialisation step.
+  */
+ bool
+ IsRecoveryProcessingMode(void)
+ {
+ 	/* recovery already seen to have ended: rely on the cached local state */
+ 	if (knownProcessingMode && !LocalRecoveryProcessingMode)
+ 		return false;
+ 	{
+ 		/* use volatile pointer to prevent code rearrangement */
+ 		volatile XLogCtlData *xlogctl = XLogCtl;
+ 
+ 		/* XLogCtl not set: report false without caching anything */
+ 		if (xlogctl == NULL)
+ 			return false;
+ 		LocalRecoveryProcessingMode = xlogctl->SharedRecoveryProcessingMode;
+ 	}
+ 
+ 	knownProcessingMode = true;
+ 
+ 	return LocalRecoveryProcessingMode;
  }
  
  /*
***************
*** 5701,5720 **** ShutdownXLOG(int code, Datum arg)
  static void
  LogCheckpointStart(int flags)
  {
! 	elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
! 		 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
! 		 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
! 		 (flags & CHECKPOINT_FORCE) ? " force" : "",
! 		 (flags & CHECKPOINT_WAIT) ? " wait" : "",
! 		 (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
! 		 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
  }
  
  /*
   * Log end of a checkpoint.
   */
  static void
! LogCheckpointEnd(void)
  {
  	long		write_secs,
  				sync_secs,
--- 5893,5916 ----
  static void
  LogCheckpointStart(int flags)
  {
! 	if (flags & CHECKPOINT_RESTARTPOINT)
! 		elog(LOG, "restartpoint starting:%s",
! 			(flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "");
! 	else
! 		elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
! 			 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
! 			 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
! 			 (flags & CHECKPOINT_FORCE) ? " force" : "",
! 			 (flags & CHECKPOINT_WAIT) ? " wait" : "",
! 			 (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
! 			 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
  }
  
  /*
   * Log end of a checkpoint.
   */
  static void
! LogCheckpointEnd(int flags)
  {
  	long		write_secs,
  				sync_secs,
***************
*** 5737,5753 **** LogCheckpointEnd(void)
  						CheckpointStats.ckpt_sync_end_t,
  						&sync_secs, &sync_usecs);
  
! 	elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
! 		 "%d transaction log file(s) added, %d removed, %d recycled; "
! 		 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
! 		 CheckpointStats.ckpt_bufs_written,
! 		 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
! 		 CheckpointStats.ckpt_segs_added,
! 		 CheckpointStats.ckpt_segs_removed,
! 		 CheckpointStats.ckpt_segs_recycled,
! 		 write_secs, write_usecs / 1000,
! 		 sync_secs, sync_usecs / 1000,
! 		 total_secs, total_usecs / 1000);
  }
  
  /*
--- 5933,5958 ----
  						CheckpointStats.ckpt_sync_end_t,
  						&sync_secs, &sync_usecs);
  
! 	if (flags & CHECKPOINT_RESTARTPOINT)
! 		elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
! 			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
! 			 CheckpointStats.ckpt_bufs_written,
! 			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
! 			 write_secs, write_usecs / 1000,
! 			 sync_secs, sync_usecs / 1000,
! 			 total_secs, total_usecs / 1000);
! 	else
! 		elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
! 			 "%d transaction log file(s) added, %d removed, %d recycled; "
! 			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
! 			 CheckpointStats.ckpt_bufs_written,
! 			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
! 			 CheckpointStats.ckpt_segs_added,
! 			 CheckpointStats.ckpt_segs_removed,
! 			 CheckpointStats.ckpt_segs_recycled,
! 			 write_secs, write_usecs / 1000,
! 			 sync_secs, sync_usecs / 1000,
! 			 total_secs, total_usecs / 1000);
  }
  
  /*
***************
*** 5772,5788 **** CreateCheckPoint(int flags)
  	XLogRecPtr	recptr;
  	XLogCtlInsert *Insert = &XLogCtl->Insert;
  	XLogRecData rdata;
- 	uint32		freespace;
  	uint32		_logId;
  	uint32		_logSeg;
  	TransactionId *inCommitXids;
  	int			nInCommit;
  
  	/*
  	 * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
! 	 * (This is just pro forma, since in the present system structure there is
! 	 * only one process that is allowed to issue checkpoints at any given
! 	 * time.)
  	 */
  	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
  
--- 5977,5992 ----
  	XLogRecPtr	recptr;
  	XLogCtlInsert *Insert = &XLogCtl->Insert;
  	XLogRecData rdata;
  	uint32		_logId;
  	uint32		_logSeg;
  	TransactionId *inCommitXids;
  	int			nInCommit;
+ 	bool		leavingArchiveRecovery = false;
  
  	/*
  	 * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
! 	 * That shouldn't be happening, but checkpoints are an important aspect
! 	 * of our resilience, so we take no chances.
  	 */
  	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
  
***************
*** 5797,5811 **** CreateCheckPoint(int flags)
--- 6001,6024 ----
  	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
  
  	/*
+ 	 * Find out if this is the first checkpoint after archive recovery.
+ 	 */
+ 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+ 	leavingArchiveRecovery = (ControlFile->state == DB_IN_ARCHIVE_RECOVERY);
+ 	LWLockRelease(ControlFileLock);
+ 
+ 	/*
  	 * Use a critical section to force system panic if we have trouble.
  	 */
  	START_CRIT_SECTION();
  
  	if (shutdown)
  	{
+ 		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
  		ControlFile->state = DB_SHUTDOWNING;
  		ControlFile->time = (pg_time_t) time(NULL);
  		UpdateControlFile();
+ 		LWLockRelease(ControlFileLock);
  	}
  
  	/*
***************
*** 5861,5901 **** CreateCheckPoint(int flags)
  		}
  	}
  
! 	/*
! 	 * Compute new REDO record ptr = location of next XLOG record.
! 	 *
! 	 * NB: this is NOT necessarily where the checkpoint record itself will be,
! 	 * since other backends may insert more XLOG records while we're off doing
! 	 * the buffer flush work.  Those XLOG records are logically after the
! 	 * checkpoint, even though physically before it.  Got that?
! 	 */
! 	freespace = INSERT_FREESPACE(Insert);
! 	if (freespace < SizeOfXLogRecord)
! 	{
! 		(void) AdvanceXLInsertBuffer(false);
! 		/* OK to ignore update return flag, since we will do flush anyway */
! 		freespace = INSERT_FREESPACE(Insert);
! 	}
! 	INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);
! 
! 	/*
! 	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
! 	 * must be done while holding the insert lock AND the info_lck.
! 	 *
! 	 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
! 	 * pointing past where it really needs to point.  This is okay; the only
! 	 * consequence is that XLogInsert might back up whole buffers that it
! 	 * didn't really need to.  We can't postpone advancing RedoRecPtr because
! 	 * XLogInserts that happen while we are dumping buffers must assume that
! 	 * their buffer changes are not included in the checkpoint.
! 	 */
  	{
! 		/* use volatile pointer to prevent code rearrangement */
! 		volatile XLogCtlData *xlogctl = XLogCtl;
! 
! 		SpinLockAcquire(&xlogctl->info_lck);
! 		RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
! 		SpinLockRelease(&xlogctl->info_lck);
  	}
  
  	/*
--- 6074,6092 ----
  		}
  	}
  
! 	if (leavingArchiveRecovery)
! 		checkPoint.redo = GetRedoLocationForArchiveCheckpoint();
! 	else
  	{
! 		/*
! 		 * Compute new REDO record ptr = location of next XLOG record.
! 		 *
! 		 * NB: this is NOT necessarily where the checkpoint record itself will be,
! 		 * since other backends may insert more XLOG records while we're off doing
! 		 * the buffer flush work.  Those XLOG records are logically after the
! 		 * checkpoint, even though physically before it.  Got that?
! 		 */
! 		checkPoint.redo = GetRedoLocationForCheckpoint();
  	}
  
  	/*
***************
*** 6013,6023 **** CreateCheckPoint(int flags)
  	XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
  
  	/*
! 	 * Update the control file.
  	 */
  	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
  	if (shutdown)
  		ControlFile->state = DB_SHUTDOWNED;
  	ControlFile->prevCheckPoint = ControlFile->checkPoint;
  	ControlFile->checkPoint = ProcLastRecPtr;
  	ControlFile->checkPointCopy = checkPoint;
--- 6204,6221 ----
  	XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
  
  	/*
! 	 * Update the control file. In 8.4, this routine becomes the primary
! 	 * point for recording changes of state in the control file at the
! 	 * end of recovery. Postmaster state already shows us being in
! 	 * normal running mode, but it is only after this point that a crash
! 	 * will no longer make us re-perform the recovery.  Note that this
! 	 * is executed by bgwriter after the death of the Startup process.
  	 */
  	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
  	if (shutdown)
  		ControlFile->state = DB_SHUTDOWNED;
+ 	else
+ 		ControlFile->state = DB_IN_PRODUCTION;
  	ControlFile->prevCheckPoint = ControlFile->checkPoint;
  	ControlFile->checkPoint = ProcLastRecPtr;
  	ControlFile->checkPointCopy = checkPoint;
***************
*** 6025,6030 **** CreateCheckPoint(int flags)
--- 6223,6243 ----
  	UpdateControlFile();
  	LWLockRelease(ControlFileLock);
  
+ 	if (leavingArchiveRecovery)
+ 	{
+ 		/*
+ 		 * Rename the config file out of the way, so that we don't accidentally
+ 		 * re-enter archive recovery mode in a subsequent crash. Prior to
+ 		 * 8.4 this step was performed at end of exitArchiveRecovery().
+ 		 */
+ 		unlink(RECOVERY_COMMAND_DONE);
+ 		if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
+ 			ereport(ERROR,
+ 				    (errcode_for_file_access(),
+ 					 errmsg("could not rename file \"%s\" to \"%s\": %m",
+ 								RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
+ 	}
+ 
  	/* Update shared-memory copy of checkpoint XID/epoch */
  	{
  		/* use volatile pointer to prevent code rearrangement */
***************
*** 6068,6082 **** CreateCheckPoint(int flags)
  	 * Truncate pg_subtrans if possible.  We can throw away all data before
  	 * the oldest XMIN of any running transaction.	No future transaction will
  	 * attempt to reference any pg_subtrans entry older than that (see Asserts
! 	 * in subtrans.c).	During recovery, though, we mustn't do this because
! 	 * StartupSUBTRANS hasn't been called yet.
  	 */
! 	if (!InRecovery)
  		TruncateSUBTRANS(GetOldestXmin(true, false));
  
  	/* All real work is done, but log before releasing lock. */
  	if (log_checkpoints)
! 		LogCheckpointEnd();
  
          TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
                                  NBuffers, CheckpointStats.ckpt_segs_added,
--- 6281,6294 ----
  	 * Truncate pg_subtrans if possible.  We can throw away all data before
  	 * the oldest XMIN of any running transaction.	No future transaction will
  	 * attempt to reference any pg_subtrans entry older than that (see Asserts
! 	 * in subtrans.c).	
  	 */
! 	if (!shutdown)
  		TruncateSUBTRANS(GetOldestXmin(true, false));
  
  	/* All real work is done, but log before releasing lock. */
  	if (log_checkpoints)
! 		LogCheckpointEnd(flags);
  
          TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
                                  NBuffers, CheckpointStats.ckpt_segs_added,
***************
*** 6085,6090 **** CreateCheckPoint(int flags)
--- 6297,6347 ----
  
  	LWLockRelease(CheckpointLock);
  }
+  
+ /* 
+  * GetRedoLocationForCheckpoint()
+  *
+  * When !IsRecoveryProcessingMode() this must be called while holding
+  * WALInsertLock.
+  */
+ static XLogRecPtr
+ GetRedoLocationForCheckpoint()
+ {
+ 	XLogCtlInsert  *Insert = &XLogCtl->Insert;
+ 	uint32                  freespace;
+ 	XLogRecPtr              redo;
+ 
+ 	freespace = INSERT_FREESPACE(Insert);
+ 	if (freespace < SizeOfXLogRecord)
+ 	{
+ 	        (void) AdvanceXLInsertBuffer(false);
+ 	        /* OK to ignore update return flag, since we will do flush anyway */
+ 	        freespace = INSERT_FREESPACE(Insert);
+ 	}
+ 	INSERT_RECPTR(redo, Insert, Insert->curridx);
+ 
+ 	/*
+ 	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
+ 	 * must be done while holding the insert lock AND the info_lck.
+ 	 *
+ 	 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
+ 	 * pointing past where it really needs to point.  This is okay; the only
+ 	 * consequence is that XLogInsert might back up whole buffers that it
+ 	 * didn't really need to.  We can't postpone advancing RedoRecPtr because
+ 	 * XLogInserts that happen while we are dumping buffers must assume that
+ 	 * their buffer changes are not included in the checkpoint.
+ 	 */
+ 	{
+ 	        /* use volatile pointer to prevent code rearrangement */
+ 	        volatile XLogCtlData *xlogctl = XLogCtl;
+ 
+         SpinLockAcquire(&xlogctl->info_lck);
+         RedoRecPtr = xlogctl->Insert.RedoRecPtr = redo;
+         SpinLockRelease(&xlogctl->info_lck);
+ 	}
+ 
+ 	return redo;
+ }
  
  /*
   * Flush all data in shared memory to disk, and fsync
***************
*** 6150,6180 **** RecoveryRestartPoint(const CheckPoint *checkPoint)
  			}
  	}
  
  	/*
! 	 * OK, force data out to disk
  	 */
! 	CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE);
  
  	/*
! 	 * Update pg_control so that any subsequent crash will restart from this
! 	 * checkpoint.	Note: ReadRecPtr gives the XLOG address of the checkpoint
! 	 * record itself.
  	 */
- 	ControlFile->prevCheckPoint = ControlFile->checkPoint;
- 	ControlFile->checkPoint = ReadRecPtr;
- 	ControlFile->checkPointCopy = *checkPoint;
- 	ControlFile->time = (pg_time_t) time(NULL);
- 	UpdateControlFile();
  
  	ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
! 			(errmsg("recovery restart point at %X/%X",
! 					checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
! 	if (recoveryLastXTime)
! 		ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
! 				(errmsg("last completed transaction was at log time %s",
! 						timestamptz_to_str(recoveryLastXTime))));
! }
  
  /*
   * Write a NEXTOID log record
   */
--- 6407,6477 ----
  			}
  	}
  
+ 	RequestRestartPoint(ReadRecPtr, checkPoint, reachedSafeStartPoint);
+ }
+ 
+ /*
+  * As of 8.4, restartpoints are always created by the bgwriter
+  * once we have reachedSafeStartPoint. We use bgwriter's shared memory
+  * area wherever we call it from, to keep better code structure.
+  */
+ void
+ CreateRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, int flags)
+ {
+ 	if (recoveryLogRestartpoints || log_checkpoints)
+ 	{
+   		/*
+ 		 * Prepare to accumulate statistics.
+   		 */
+ 
+ 		MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
+ 		CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
+ 
+ 		LogCheckpointStart(CHECKPOINT_RESTARTPOINT | flags);
+ 	}
+   
+   	/*
+ 	 * Acquire CheckpointLock to ensure only one restartpoint happens at a time.
+ 	 * We rely on this lock to ensure that the startup process doesn't exit
+ 	 * Recovery while we are half way through a restartpoint.
+   	 */
+ 	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+ 
+ 	CheckPointGuts(restartPoint->redo, CHECKPOINT_RESTARTPOINT | flags);
+ 
  	/*
! 	 * Update pg_control, using current time
  	 */
! 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
!   	ControlFile->prevCheckPoint = ControlFile->checkPoint;
! 	ControlFile->checkPoint = ReadPtr;
! 	ControlFile->checkPointCopy = *restartPoint;
!   	ControlFile->time = (pg_time_t) time(NULL);
!   	UpdateControlFile();
! 	LWLockRelease(ControlFileLock);
  
  	/*
! 	 * Currently, there is no need to truncate pg_subtrans during recovery.
! 	 * If we did do that, we would need to have called StartupSUBTRANS()
! 	 * already, and then TruncateSUBTRANS() would go here.
  	 */
  
+ 	/* All real work is done, but log before releasing lock. */
+ 	if (recoveryLogRestartpoints || log_checkpoints)
+ 		LogCheckpointEnd(CHECKPOINT_RESTARTPOINT);
+   
  	ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
!   			(errmsg("recovery restart point at %X/%X",
! 					restartPoint->redo.xlogid, restartPoint->redo.xrecoff)));
  
+   	if (recoveryLastXTime)
+   		ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
+ 			(errmsg("last completed transaction was at log time %s",
+ 					timestamptz_to_str(recoveryLastXTime))));
+ 
+ 	LWLockRelease(CheckpointLock);
+ }
+   
  /*
   * Write a NEXTOID log record
   */
***************
*** 6237,6243 **** RequestXLogSwitch(void)
  }
  
  /*
!  * XLOG resource manager's routines
   */
  void
  xlog_redo(XLogRecPtr lsn, XLogRecord *record)
--- 6534,6596 ----
  }
  
  /*
!  * exitRecovery()
!  *
!  * Exit recovery state and write a XLOG_RECOVERY_END record. This is the
!  * only record type that can record a change of timelineID. We assume
!  * caller has already set ThisTimeLineID, if appropriate.
!  */
! static void
! exitRecovery(void)
! {
! 	XLogRecData rdata;
! 
! 	rdata.buffer = InvalidBuffer;
! 	rdata.data = (char *) (&ThisTimeLineID);
! 	rdata.len = sizeof(TimeLineID);
! 	rdata.next = NULL;
! 
! 	/*
! 	 * If a restartpoint is in progress, we will not be able to successfully
! 	 * acquire CheckpointLock. If the restartpoint is still running, send
! 	 * a second signal to nudge bgwriter to go faster so we can avoid delay.
! 	 * Then wait for lock, so we know the restartpoint has completed. We do
! 	 * this because we don't want to interrupt the restartpoint half way
! 	 * through, which might leave us in a mess, and we want to be robust.
! 	 * We're going to checkpoint soon anyway, so it's not wasted effort.
! 	 */
! 	if (LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE))
! 		LWLockRelease(CheckpointLock);
! 	else
! 	{
! 		RequestRestartPointCompletion();
! 		ereport(DEBUG1,
! 			(errmsg("startup process waiting for restartpoint to complete")));
! 		LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
! 		LWLockRelease(CheckpointLock);
! 	}	
! 
! 	/*
! 	 * This is the only type of WAL message that can be inserted during
! 	 * recovery. This ensures that we don't allow others to get access
! 	 * until after we have changed state.
! 	 */
! 	(void) XLogInsert(RM_XLOG_ID, XLOG_RECOVERY_END, &rdata);
! 
! 	/*
! 	 * We don't XLogFlush() here, because otherwise we'd end up zeroing the
! 	 * WAL file ourselves. So just let bgwriter's forthcoming checkpoint do
! 	 * that for us.
! 	 */
! 
! 	InRecovery = false;
! }
! 
! /*
!  * XLOG resource manager's routines.
!  *
!  * Definitions of message info are in include/catalog/pg_control.h,
!  * though not all messages relate to control file processing.
   */
  void
  xlog_redo(XLogRecPtr lsn, XLogRecord *record)
***************
*** 6271,6293 **** xlog_redo(XLogRecPtr lsn, XLogRecord *record)
  		ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
  		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
  
! 		/*
! 		 * TLI may change in a shutdown checkpoint, but it shouldn't decrease
  		 */
- 		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
- 		{
- 			if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
- 				!list_member_int(expectedTLIs,
- 								 (int) checkPoint.ThisTimeLineID))
- 				ereport(PANIC,
- 						(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
- 								checkPoint.ThisTimeLineID, ThisTimeLineID)));
- 			/* Following WAL records should be run with new TLI */
- 			ThisTimeLineID = checkPoint.ThisTimeLineID;
- 		}
  
  		RecoveryRestartPoint(&checkPoint);
  	}
  	else if (info == XLOG_CHECKPOINT_ONLINE)
  	{
  		CheckPoint	checkPoint;
--- 6624,6663 ----
  		ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
  		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
  
!   		/*
! 		 * TLI no longer changes at shutdown checkpoint, since as of 8.4,
! 		 * shutdown checkpoints only occur at shutdown. Much less confusing.
  		 */
  
  		RecoveryRestartPoint(&checkPoint);
  	}
+ 	else if (info == XLOG_RECOVERY_END)
+ 	{
+ 		TimeLineID	tli;
+ 
+ 		memcpy(&tli, XLogRecGetData(record), sizeof(TimeLineID));
+ 
+ 		/*
+ 		 * TLI may change when recovery ends, but it shouldn't decrease.
+ 		 *
+ 		 * This is the only WAL record that can tell us to change timelineID
+ 		 * while we process WAL records. 
+ 		 *
+ 		 * We can *choose* to stop recovery at any point, generating a
+ 		 * new timelineID which is recorded using this record type.
+ 		 */
+ 		if (tli != ThisTimeLineID)
+   		{
+ 			if (tli < ThisTimeLineID ||
+   				!list_member_int(expectedTLIs,
+ 								 (int) tli))
+   				ereport(PANIC,
+ 						(errmsg("unexpected timeline ID %u (after %u) at recovery end record",
+ 								tli, ThisTimeLineID)));
+   			/* Following WAL records should be run with new TLI */
+ 			ThisTimeLineID = tli;
+   		}
+   	}
  	else if (info == XLOG_CHECKPOINT_ONLINE)
  	{
  		CheckPoint	checkPoint;
***************
*** 6309,6315 **** xlog_redo(XLogRecPtr lsn, XLogRecord *record)
  		ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
  		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
  
! 		/* TLI should not change in an on-line checkpoint */
  		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
  			ereport(PANIC,
  					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
--- 6679,6685 ----
  		ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
  		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
  
! 		/* TLI must not change at a checkpoint */
  		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
  			ereport(PANIC,
  					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
***************
*** 6545,6550 **** pg_start_backup(PG_FUNCTION_ARGS)
--- 6915,6926 ----
  				 errhint("archive_command must be defined before "
  						 "online backups can be made safely.")));
  
+ 	if (IsRecoveryProcessingMode())
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ 				 errmsg("recovery is in progress"),
+ 				 errhint("WAL control functions cannot be executed during recovery.")));
+ 
  	backupidstr = text_to_cstring(backupid);
  
  	/*
***************
*** 6710,6715 **** pg_stop_backup(PG_FUNCTION_ARGS)
--- 7086,7097 ----
  				 errmsg("WAL archiving is not active"),
  				 errhint("archive_mode must be enabled at server start.")));
  
+ 	if (IsRecoveryProcessingMode())
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ 				 errmsg("recovery is in progress"),
+ 				 errhint("WAL control functions cannot be executed during recovery.")));
+ 
  	/*
  	 * OK to clear forcePageWrites
  	 */
***************
*** 6865,6870 **** pg_switch_xlog(PG_FUNCTION_ARGS)
--- 7247,7258 ----
  				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
  			 (errmsg("must be superuser to switch transaction log files"))));
  
+ 	if (IsRecoveryProcessingMode())
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ 				 errmsg("recovery is in progress"),
+ 				 errhint("WAL control functions cannot be executed during recovery.")));
+ 
  	switchpoint = RequestXLogSwitch();
  
  	/*
***************
*** 6887,6892 **** pg_current_xlog_location(PG_FUNCTION_ARGS)
--- 7275,7286 ----
  {
  	char		location[MAXFNAMELEN];
  
+ 	if (IsRecoveryProcessingMode())
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ 				 errmsg("recovery is in progress"),
+ 				 errhint("WAL control functions cannot be executed during recovery.")));
+ 
  	/* Make sure we have an up-to-date local LogwrtResult */
  	{
  		/* use volatile pointer to prevent code rearrangement */
***************
*** 6914,6919 **** pg_current_xlog_insert_location(PG_FUNCTION_ARGS)
--- 7308,7319 ----
  	XLogRecPtr	current_recptr;
  	char		location[MAXFNAMELEN];
  
+ 	if (IsRecoveryProcessingMode())
+ 		ereport(ERROR,
+ 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ 				 errmsg("recovery is in progress"),
+ 				 errhint("WAL control functions cannot be executed during recovery.")));
+ 
  	/*
  	 * Get the current end-of-WAL position ... shared lock is sufficient
  	 */
*** src/backend/commands/dbcommands.c
--- src/backend/commands/dbcommands.c
***************
*** 1976,1981 **** dbase_redo(XLogRecPtr lsn, XLogRecord *record)
--- 1976,1986 ----
  		 * We don't need to copy subdirectories
  		 */
  		copydir(src_path, dst_path, false);
+ 
+ 		/*
+ 		 * Flat files are updated immediately following transaction commit.
+ 	 	 * Nothing to do here.
+ 		 */
  	}
  	else if (info == XLOG_DBASE_DROP)
  	{
***************
*** 1998,2003 **** dbase_redo(XLogRecPtr lsn, XLogRecord *record)
--- 2003,2012 ----
  			ereport(WARNING,
  					(errmsg("some useless files may be left behind in old database directory \"%s\"",
  							dst_path)));
+ 		/*
+ 		 * Flat files are updated immediately following transaction commit.
+ 	 	 * Nothing to do here.
+ 		 */
  	}
  	else
  		elog(PANIC, "dbase_redo: unknown op code %u", info);
*** src/backend/postmaster/bgwriter.c
--- src/backend/postmaster/bgwriter.c
***************
*** 49,54 ****
--- 49,55 ----
  #include <unistd.h>
  
  #include "access/xlog_internal.h"
+ #include "catalog/pg_control.h"
  #include "libpq/pqsignal.h"
  #include "miscadmin.h"
  #include "pgstat.h"
***************
*** 129,134 **** typedef struct
--- 130,142 ----
  
  	int			ckpt_flags;		/* checkpoint flags, as defined in xlog.h */
  
+ 	/* 
+ 	 * When the Startup process wants bgwriter to perform a restartpoint, it 
+ 	 * sets these fields so that we can update the control file afterwards.
+ 	 */
+ 	XLogRecPtr	ReadPtr;		/* Requested log pointer */
+ 	CheckPoint  restartPoint;	/* restartPoint data for ControlFile */
+ 
  	uint32		num_backend_writes;		/* counts non-bgwriter buffer writes */
  
  	int			num_requests;	/* current # of requests */
***************
*** 165,171 **** static bool ckpt_active = false;
  
  /* these values are valid when ckpt_active is true: */
  static pg_time_t ckpt_start_time;
! static XLogRecPtr ckpt_start_recptr;
  static double ckpt_cached_elapsed;
  
  static pg_time_t last_checkpoint_time;
--- 173,179 ----
  
  /* these values are valid when ckpt_active is true: */
  static pg_time_t ckpt_start_time;
! static XLogRecPtr ckpt_start_recptr;	/* not used if IsRecoveryProcessingMode */
  static double ckpt_cached_elapsed;
  
  static pg_time_t last_checkpoint_time;
***************
*** 197,202 **** BackgroundWriterMain(void)
--- 205,211 ----
  {
  	sigjmp_buf	local_sigjmp_buf;
  	MemoryContext bgwriter_context;
+ 	bool		BgWriterRecoveryMode;
  
  	BgWriterShmem->bgwriter_pid = MyProcPid;
  	am_bg_writer = true;
***************
*** 355,370 **** BackgroundWriterMain(void)
  	 */
  	PG_SETMASK(&UnBlockSig);
  
  	/*
  	 * Loop forever
  	 */
  	for (;;)
  	{
- 		bool		do_checkpoint = false;
- 		int			flags = 0;
- 		pg_time_t	now;
- 		int			elapsed_secs;
- 
  		/*
  		 * Emergency bailout if postmaster has died.  This is to avoid the
  		 * necessity for manual cleanup of all postmaster children.
--- 364,380 ----
  	 */
  	PG_SETMASK(&UnBlockSig);
  
+ 	BgWriterRecoveryMode = IsRecoveryProcessingMode();
+ 
+ 	if (BgWriterRecoveryMode)
+ 		elog(DEBUG1, "bgwriter starting during recovery, pid = %u", 
+ 			BgWriterShmem->bgwriter_pid);
+ 
  	/*
  	 * Loop forever
  	 */
  	for (;;)
  	{
  		/*
  		 * Emergency bailout if postmaster has died.  This is to avoid the
  		 * necessity for manual cleanup of all postmaster children.
***************
*** 372,499 **** BackgroundWriterMain(void)
  		if (!PostmasterIsAlive(true))
  			exit(1);
  
- 		/*
- 		 * Process any requests or signals received recently.
- 		 */
- 		AbsorbFsyncRequests();
- 
  		if (got_SIGHUP)
  		{
  			got_SIGHUP = false;
  			ProcessConfigFile(PGC_SIGHUP);
  		}
- 		if (checkpoint_requested)
- 		{
- 			checkpoint_requested = false;
- 			do_checkpoint = true;
- 			BgWriterStats.m_requested_checkpoints++;
- 		}
- 		if (shutdown_requested)
- 		{
- 			/*
- 			 * From here on, elog(ERROR) should end with exit(1), not send
- 			 * control back to the sigsetjmp block above
- 			 */
- 			ExitOnAnyError = true;
- 			/* Close down the database */
- 			ShutdownXLOG(0, 0);
- 			/* Normal exit from the bgwriter is here */
- 			proc_exit(0);		/* done */
- 		}
  
! 		/*
! 		 * Force a checkpoint if too much time has elapsed since the last one.
! 		 * Note that we count a timed checkpoint in stats only when this
! 		 * occurs without an external request, but we set the CAUSE_TIME flag
! 		 * bit even if there is also an external request.
! 		 */
! 		now = (pg_time_t) time(NULL);
! 		elapsed_secs = now - last_checkpoint_time;
! 		if (elapsed_secs >= CheckPointTimeout)
  		{
! 			if (!do_checkpoint)
! 				BgWriterStats.m_timed_checkpoints++;
! 			do_checkpoint = true;
! 			flags |= CHECKPOINT_CAUSE_TIME;
! 		}
  
! 		/*
! 		 * Do a checkpoint if requested, otherwise do one cycle of
! 		 * dirty-buffer writing.
! 		 */
! 		if (do_checkpoint)
! 		{
! 			/* use volatile pointer to prevent code rearrangement */
! 			volatile BgWriterShmemStruct *bgs = BgWriterShmem;
  
! 			/*
! 			 * Atomically fetch the request flags to figure out what kind of a
! 			 * checkpoint we should perform, and increase the started-counter
! 			 * to acknowledge that we've started a new checkpoint.
! 			 */
! 			SpinLockAcquire(&bgs->ckpt_lck);
! 			flags |= bgs->ckpt_flags;
! 			bgs->ckpt_flags = 0;
! 			bgs->ckpt_started++;
! 			SpinLockRelease(&bgs->ckpt_lck);
  
! 			/*
! 			 * We will warn if (a) too soon since last checkpoint (whatever
! 			 * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
! 			 * since the last checkpoint start.  Note in particular that this
! 			 * implementation will not generate warnings caused by
! 			 * CheckPointTimeout < CheckPointWarning.
! 			 */
! 			if ((flags & CHECKPOINT_CAUSE_XLOG) &&
! 				elapsed_secs < CheckPointWarning)
! 				ereport(LOG,
! 						(errmsg("checkpoints are occurring too frequently (%d seconds apart)",
! 								elapsed_secs),
! 						 errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
  
  			/*
! 			 * Initialize bgwriter-private variables used during checkpoint.
  			 */
! 			ckpt_active = true;
! 			ckpt_start_recptr = GetInsertRecPtr();
! 			ckpt_start_time = now;
! 			ckpt_cached_elapsed = 0;
  
! 			/*
! 			 * Do the checkpoint.
! 			 */
! 			CreateCheckPoint(flags);
  
  			/*
! 			 * After any checkpoint, close all smgr files.	This is so we
! 			 * won't hang onto smgr references to deleted files indefinitely.
  			 */
! 			smgrcloseall();
  
  			/*
! 			 * Indicate checkpoint completion to any waiting backends.
  			 */
! 			SpinLockAcquire(&bgs->ckpt_lck);
! 			bgs->ckpt_done = bgs->ckpt_started;
! 			SpinLockRelease(&bgs->ckpt_lck);
  
! 			ckpt_active = false;
  
! 			/*
! 			 * Note we record the checkpoint start time not end time as
! 			 * last_checkpoint_time.  This is so that time-driven checkpoints
! 			 * happen at a predictable spacing.
! 			 */
! 			last_checkpoint_time = now;
  		}
- 		else
- 			BgBufferSync();
- 
- 		/* Check for archive_timeout and switch xlog files if necessary. */
- 		CheckArchiveTimeout();
- 
- 		/* Nap for the configured time. */
- 		BgWriterNap();
  	}
  }
  
--- 382,595 ----
  		if (!PostmasterIsAlive(true))
  			exit(1);
  
  		if (got_SIGHUP)
  		{
  			got_SIGHUP = false;
  			ProcessConfigFile(PGC_SIGHUP);
  		}
  
! 		if (BgWriterRecoveryMode)
  		{
! 			if (shutdown_requested)
! 			{
! 				/*
! 				 * From here on, elog(ERROR) should end with exit(1), not send
! 				 * control back to the sigsetjmp block above
! 				 */
! 				ExitOnAnyError = true;
! 				/* Normal exit from the bgwriter is here */
! 				proc_exit(0);		/* done */
! 			}
  
! 			if (!IsRecoveryProcessingMode())
! 			{
! 				elog(DEBUG2, "bgwriter changing from recovery to normal mode");
! 	  
! 				InitXLOGAccess();
! 				BgWriterRecoveryMode = false;
! 
! 				/*
! 				 * Start time-driven events from now
! 				 */
! 				last_checkpoint_time = last_xlog_switch_time = (pg_time_t) time(NULL);
! 
! 				/* 
! 				 * Notice that we do *not* act on a checkpoint_requested
! 				 * state at this point. We have changed mode, so we wish to
! 				 * perform a checkpoint not a restartpoint.
! 				 */
! 				continue;
! 			}
  
! 			if (checkpoint_requested)
! 			{
! 				XLogRecPtr		ReadPtr;
! 				CheckPoint		restartPoint;
! 
! 				checkpoint_requested = false;
! 
! 				/*
! 				 * Initialize bgwriter-private variables used during checkpoint.
! 				 */
! 				ckpt_active = true;
! 				ckpt_start_time = (pg_time_t) time(NULL);
! 				ckpt_cached_elapsed = 0;
! 
! 				/*
! 				 * Get the requested values from shared memory that the 
! 				 * Startup process has put there for us.
! 				 */
! 				SpinLockAcquire(&BgWriterShmem->ckpt_lck);
! 				ReadPtr = BgWriterShmem->ReadPtr;
! 				memcpy(&restartPoint, &BgWriterShmem->restartPoint, sizeof(CheckPoint));
! 				SpinLockRelease(&BgWriterShmem->ckpt_lck);
! 
! 				/* Use smoothed writes, until interrupted if ever */
! 				CreateRestartPoint(ReadPtr, &restartPoint, 0);
! 
! 				/*
! 				 * After any checkpoint, close all smgr files.	This is so we
! 				 * won't hang onto smgr references to deleted files indefinitely.
! 				 */
! 				smgrcloseall();
! 
! 				ckpt_active = false;
! 				checkpoint_requested = false;
! 			}
! 			else
! 			{
! 				/* Clean buffers dirtied by recovery */
! 				BgBufferSync();
  
! 				/* Nap for the configured time. */
! 				BgWriterNap();
! 			}
! 		}
! 		else	/* Normal processing */
! 		{
! 			bool		do_checkpoint = false;
! 			int			flags = 0;
! 			pg_time_t	now;
! 			int			elapsed_secs;
  
  			/*
! 			 * Process any requests or signals received recently.
  			 */
! 			AbsorbFsyncRequests();
  
! 			if (checkpoint_requested)
! 			{
! 				checkpoint_requested = false;
! 				do_checkpoint = true;
! 				BgWriterStats.m_requested_checkpoints++;
! 			}
! 			if (shutdown_requested)
! 			{
! 				/*
! 				 * From here on, elog(ERROR) should end with exit(1), not send
! 				 * control back to the sigsetjmp block above
! 				 */
! 				ExitOnAnyError = true;
! 				/* Close down the database */
! 				ShutdownXLOG(0, 0);
! 				/* Normal exit from the bgwriter is here */
! 				proc_exit(0);		/* done */
! 			}
  
  			/*
! 			 * Force a checkpoint if too much time has elapsed since the last one.
! 			 * Note that we count a timed checkpoint in stats only when this
! 			 * occurs without an external request, but we set the CAUSE_TIME flag
! 			 * bit even if there is also an external request.
  			 */
! 			now = (pg_time_t) time(NULL);
! 			elapsed_secs = now - last_checkpoint_time;
! 			if (elapsed_secs >= CheckPointTimeout)
! 			{
! 				if (!do_checkpoint)
! 					BgWriterStats.m_timed_checkpoints++;
! 				do_checkpoint = true;
! 				flags |= CHECKPOINT_CAUSE_TIME;
! 			}
  
  			/*
! 			 * Do a checkpoint if requested, otherwise do one cycle of
! 			 * dirty-buffer writing.
  			 */
! 			if (do_checkpoint)
! 			{
! 				/* use volatile pointer to prevent code rearrangement */
! 				volatile BgWriterShmemStruct *bgs = BgWriterShmem;
! 
! 				/*
! 				 * Atomically fetch the request flags to figure out what kind of a
! 				 * checkpoint we should perform, and increase the started-counter
! 				 * to acknowledge that we've started a new checkpoint.
! 				 */
! 				SpinLockAcquire(&bgs->ckpt_lck);
! 				flags |= bgs->ckpt_flags;
! 				bgs->ckpt_flags = 0;
! 				bgs->ckpt_started++;
! 				SpinLockRelease(&bgs->ckpt_lck);
! 
! 				/*
! 				 * We will warn if (a) too soon since last checkpoint (whatever
! 				 * caused it) and (b) somebody set the CHECKPOINT_CAUSE_XLOG flag
! 				 * since the last checkpoint start.  Note in particular that this
! 				 * implementation will not generate warnings caused by
! 				 * CheckPointTimeout < CheckPointWarning.
! 				 */
! 				if ((flags & CHECKPOINT_CAUSE_XLOG) &&
! 					elapsed_secs < CheckPointWarning)
! 					ereport(LOG,
! 							(errmsg("checkpoints are occurring too frequently (%d seconds apart)",
! 									elapsed_secs),
! 							 errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
! 
! 				/*
! 				 * Initialize bgwriter-private variables used during checkpoint.
! 				 */
! 				ckpt_active = true;
! 				ckpt_start_recptr = GetInsertRecPtr();
! 				ckpt_start_time = now;
! 				ckpt_cached_elapsed = 0;
! 
! 				/*
! 				 * Do the checkpoint.
! 				 */
! 				CreateCheckPoint(flags);
! 
! 				/*
! 				 * After any checkpoint, close all smgr files.	This is so we
! 				 * won't hang onto smgr references to deleted files indefinitely.
! 				 */
! 				smgrcloseall();
! 
! 				/*
! 				 * Indicate checkpoint completion to any waiting backends.
! 				 */
! 				SpinLockAcquire(&bgs->ckpt_lck);
! 				bgs->ckpt_done = bgs->ckpt_started;
! 				SpinLockRelease(&bgs->ckpt_lck);
! 
! 				ckpt_active = false;
! 
! 				/*
! 				 * Note we record the checkpoint start time not end time as
! 				 * last_checkpoint_time.  This is so that time-driven checkpoints
! 				 * happen at a predictable spacing.
! 				 */
! 				last_checkpoint_time = now;
! 			}
! 			else
! 				BgBufferSync();
  
! 			/* Check for archive_timeout and switch xlog files if necessary. */
! 			CheckArchiveTimeout();
  
! 			/* Nap for the configured time. */
! 			BgWriterNap();
  		}
  	}
  }
  
***************
*** 586,592 **** BgWriterNap(void)
  		(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
  			break;
  		pg_usleep(1000000L);
! 		AbsorbFsyncRequests();
  		udelay -= 1000000L;
  	}
  
--- 682,689 ----
  		(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
  			break;
  		pg_usleep(1000000L);
! 		if (!IsRecoveryProcessingMode())
! 			AbsorbFsyncRequests();
  		udelay -= 1000000L;
  	}
  
***************
*** 640,645 **** CheckpointWriteDelay(int flags, double progress)
--- 737,755 ----
  	if (!am_bg_writer)
  		return;
  
+ 	/* Perform minimal duties during recovery and skip wait if requested */
+ 	if (IsRecoveryProcessingMode())
+ 	{
+ 		BgBufferSync();
+ 
+ 		if (!shutdown_requested &&
+ 			!checkpoint_requested &&
+ 			IsCheckpointOnSchedule(progress))
+ 			BgWriterNap();
+ 
+ 		return;
+ 	}
+ 
  	/*
  	 * Perform the usual bgwriter duties and take a nap, unless we're behind
  	 * schedule, in which case we just try to catch up as quickly as possible.
***************
*** 714,729 **** IsCheckpointOnSchedule(double progress)
  	 * However, it's good enough for our purposes, we're only calculating an
  	 * estimate anyway.
  	 */
! 	recptr = GetInsertRecPtr();
! 	elapsed_xlogs =
! 		(((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
! 		 ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
! 		CheckPointSegments;
! 
! 	if (progress < elapsed_xlogs)
  	{
! 		ckpt_cached_elapsed = elapsed_xlogs;
! 		return false;
  	}
  
  	/*
--- 824,842 ----
  	 * However, it's good enough for our purposes, we're only calculating an
  	 * estimate anyway.
  	 */
! 	if (!IsRecoveryProcessingMode())
  	{
! 		recptr = GetInsertRecPtr();
! 		elapsed_xlogs =
! 			(((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
! 			 ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
! 			CheckPointSegments;
! 
! 		if (progress < elapsed_xlogs)
! 		{
! 			ckpt_cached_elapsed = elapsed_xlogs;
! 			return false;
! 		}
  	}
  
  	/*
***************
*** 989,994 **** RequestCheckpoint(int flags)
--- 1102,1180 ----
  }
  
  /*
+  * Request a restartpoint: perform it here, or hand it to the bgwriter.
+  * Always runs in Startup process (see xlog.c)
+  */
+ void
+ RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter)
+ {
+ 	/* Should we just do it ourselves? */
+ 	if (!IsPostmasterEnvironment || !sendToBGWriter)
+ 	{
+ 		CreateRestartPoint(ReadPtr, restartPoint, CHECKPOINT_IMMEDIATE);
+ 		return;
+ 	}
+ 
+ 	/*
+ 	 * Push requested values into shared memory, then signal to request
+ 	 * restartpoint.  Skip the signal when no bgwriter pid is registered:
+ 	 * kill() with pid 0 would signal our own process group instead.
+ 	 */
+ 	SpinLockAcquire(&BgWriterShmem->ckpt_lck);
+ 	BgWriterShmem->ReadPtr = ReadPtr;
+ 	memcpy(&BgWriterShmem->restartPoint, restartPoint, sizeof(CheckPoint));
+ 	SpinLockRelease(&BgWriterShmem->ckpt_lck);
+ 
+ 	if (BgWriterShmem->bgwriter_pid == 0)
+ 		elog(LOG, "could not request restartpoint because bgwriter not running");
+ 	else if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0)
+ 		elog(LOG, "could not signal for restartpoint: %m");
+ }
+ 
+ /*
+  * Sends another SIGINT to bgwriter (if its pid is registered), which causes
+  * it to avoid smoothed writes and continue processing as if it had been called
+  * with CHECKPOINT_IMMEDIATE. This is used at the end of recovery.
+  */
+ void
+ RequestRestartPointCompletion(void)
+ {
+ 	if (BgWriterShmem->bgwriter_pid != 0 &&
+ 		kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0)
+ 		elog(LOG, "could not signal for restartpoint immediate: %m");
+ }
+ 
+ XLogRecPtr
+ GetRedoLocationForArchiveCheckpoint(void)
+ {
+ 	XLogRecPtr	redo;
+ 
+ 	/* Copy the shared ReadPtr while holding ckpt_lck, then return the copy. */
+ 	SpinLockAcquire(&BgWriterShmem->ckpt_lck);
+ 	redo = BgWriterShmem->ReadPtr;
+ 	SpinLockRelease(&BgWriterShmem->ckpt_lck);
+ 	return redo;
+ }
+ 
+ /*
+  * Store the redo location for a checkpoint at the end of recovery (always).
+  * Returns true if bgwriter can perform checkpoint, or false if no bgwriter
+  * pid is registered or IsPostmasterEnvironment is false.
+  */
+ bool
+ SetRedoLocationForArchiveCheckpoint(XLogRecPtr redo)
+ {
+ 	SpinLockAcquire(&BgWriterShmem->ckpt_lck);
+ 	BgWriterShmem->ReadPtr = redo;
+ 	SpinLockRelease(&BgWriterShmem->ckpt_lck);
+ 
+ 	if (BgWriterShmem->bgwriter_pid == 0 || !IsPostmasterEnvironment)
+ 		return false;
+ 
+ 	return true;
+ }
+ 
+ /*
   * ForwardFsyncRequest
   *		Forward a file-fsync request from a backend to the bgwriter
   *
*** src/backend/postmaster/postmaster.c
--- src/backend/postmaster/postmaster.c
***************
*** 230,237 **** static bool FatalError = false; /* T if recovering from backend crash */
   * We use a simple state machine to control startup, shutdown, and
   * crash recovery (which is rather like shutdown followed by startup).
   *
!  * Normal child backends can only be launched when we are in PM_RUN state.
!  * (We also allow it in PM_WAIT_BACKUP state, but only for superusers.)
   * In other states we handle connection requests by launching "dead_end"
   * child processes, which will simply send the client an error message and
   * quit.  (We track these in the BackendList so that we can know when they
--- 230,239 ----
   * We use a simple state machine to control startup, shutdown, and
   * crash recovery (which is rather like shutdown followed by startup).
   *
!  * Normal child backends can only be launched when we are in PM_RUN or
!  * PM_RECOVERY state. Any transaction started in PM_RECOVERY state will
!  * be read-only for the whole of its life.  (We also allow launch of normal
!  * child backends in PM_WAIT_BACKUP state, but only for superusers.)
   * In other states we handle connection requests by launching "dead_end"
   * child processes, which will simply send the client an error message and
   * quit.  (We track these in the BackendList so that we can know when they
***************
*** 254,259 **** typedef enum
--- 256,266 ----
  {
  	PM_INIT,					/* postmaster starting */
  	PM_STARTUP,					/* waiting for startup subprocess */
+ 	PM_RECOVERY,				/* consistent recovery mode; state only
+ 								 * entered for archive and streaming recovery,
+ 								 * and only after the point where
+ 								 * all data is in a consistent state.
+ 								 */
  	PM_RUN,						/* normal "database is alive" state */
  	PM_WAIT_BACKUP,				/* waiting for online backup mode to end */
  	PM_WAIT_BACKENDS,			/* waiting for live backends to exit */
***************
*** 1302,1308 **** ServerLoop(void)
  		 * state that prevents it, start one.  It doesn't matter if this
  		 * fails, we'll just try again later.
  		 */
! 		if (BgWriterPID == 0 && pmState == PM_RUN)
  			BgWriterPID = StartBackgroundWriter();
  
  		/*
--- 1309,1315 ----
  		 * state that prevents it, start one.  It doesn't matter if this
  		 * fails, we'll just try again later.
  		 */
! 		if (BgWriterPID == 0 && (pmState == PM_RUN || pmState == PM_RECOVERY))
  			BgWriterPID = StartBackgroundWriter();
  
  		/*
***************
*** 1651,1661 **** retry1:
  					(errcode(ERRCODE_CANNOT_CONNECT_NOW),
  					 errmsg("the database system is shutting down")));
  			break;
- 		case CAC_RECOVERY:
- 			ereport(FATAL,
- 					(errcode(ERRCODE_CANNOT_CONNECT_NOW),
- 					 errmsg("the database system is in recovery mode")));
- 			break;
  		case CAC_TOOMANY:
  			ereport(FATAL,
  					(errcode(ERRCODE_TOO_MANY_CONNECTIONS),
--- 1658,1663 ----
***************
*** 1664,1669 **** retry1:
--- 1666,1672 ----
  		case CAC_WAITBACKUP:
  			/* OK for now, will check in InitPostgres */
  			break;
+ 		case CAC_RECOVERY:
  		case CAC_OK:
  			break;
  	}
***************
*** 1982,1991 **** pmdie(SIGNAL_ARGS)
  			ereport(LOG,
  					(errmsg("received smart shutdown request")));
  
! 			if (pmState == PM_RUN)
  			{
  				/* autovacuum workers are told to shut down immediately */
! 				SignalAutovacWorkers(SIGTERM);
  				/* and the autovac launcher too */
  				if (AutoVacPID != 0)
  					signal_child(AutoVacPID, SIGTERM);
--- 1985,1995 ----
  			ereport(LOG,
  					(errmsg("received smart shutdown request")));
  
! 			if (pmState == PM_RUN || pmState == PM_RECOVERY)
  			{
  				/* autovacuum workers are told to shut down immediately */
! 				if (pmState == PM_RUN)
! 					SignalAutovacWorkers(SIGTERM);
  				/* and the autovac launcher too */
  				if (AutoVacPID != 0)
  					signal_child(AutoVacPID, SIGTERM);
***************
*** 2019,2025 **** pmdie(SIGNAL_ARGS)
  
  			if (StartupPID != 0)
  				signal_child(StartupPID, SIGTERM);
! 			if (pmState == PM_RUN || pmState == PM_WAIT_BACKUP)
  			{
  				ereport(LOG,
  						(errmsg("aborting any active transactions")));
--- 2023,2029 ----
  
  			if (StartupPID != 0)
  				signal_child(StartupPID, SIGTERM);
! 			if (pmState == PM_RUN || pmState == PM_RECOVERY || pmState == PM_WAIT_BACKUP)
  			{
  				ereport(LOG,
  						(errmsg("aborting any active transactions")));
***************
*** 2115,2122 **** reaper(SIGNAL_ARGS)
  		 */
  		if (pid == StartupPID)
  		{
  			StartupPID = 0;
! 			Assert(pmState == PM_STARTUP);
  
  			/* FATAL exit of startup is treated as catastrophic */
  			if (!EXIT_STATUS_0(exitstatus))
--- 2119,2129 ----
  		 */
  		if (pid == StartupPID)
  		{
+ 			bool	leavingRecovery = (pmState == PM_RECOVERY);
+ 
  			StartupPID = 0;
! 			Assert(pmState == PM_STARTUP || pmState == PM_RECOVERY ||
! 				   pmState == PM_WAIT_BACKUP || pmState == PM_WAIT_BACKENDS);
  
  			/* FATAL exit of startup is treated as catastrophic */
  			if (!EXIT_STATUS_0(exitstatus))
***************
*** 2124,2130 **** reaper(SIGNAL_ARGS)
  				LogChildExit(LOG, _("startup process"),
  							 pid, exitstatus);
  				ereport(LOG,
! 				(errmsg("aborting startup due to startup process failure")));
  				ExitPostmaster(1);
  			}
  
--- 2131,2137 ----
  				LogChildExit(LOG, _("startup process"),
  							 pid, exitstatus);
  				ereport(LOG,
! 						(errmsg("aborting startup due to startup process failure")));
  				ExitPostmaster(1);
  			}
  
***************
*** 2157,2166 **** reaper(SIGNAL_ARGS)
  			load_role();
  
  			/*
! 			 * Crank up the background writer.	It doesn't matter if this
! 			 * fails, we'll just try again later.
  			 */
! 			Assert(BgWriterPID == 0);
  			BgWriterPID = StartBackgroundWriter();
  
  			/*
--- 2164,2173 ----
  			load_role();
  
  			/*
! 			 * Check whether we need to start background writer, if not
! 			 * already running.
  			 */
! 			if (BgWriterPID == 0)
  			BgWriterPID = StartBackgroundWriter();
  
  			/*
***************
*** 2177,2184 **** reaper(SIGNAL_ARGS)
  				PgStatPID = pgstat_start();
  
  			/* at this point we are really open for business */
! 			ereport(LOG,
! 				 (errmsg("database system is ready to accept connections")));
  
  			continue;
  		}
--- 2184,2195 ----
  				PgStatPID = pgstat_start();
  
  			/* at this point we are really open for business */
! 			if (leavingRecovery)
! 				ereport(LOG,
! 					 (errmsg("database can now be accessed with read and write transactions")));
! 			else
! 				ereport(LOG,
! 					 (errmsg("database system is ready to accept connections")));
  
  			continue;
  		}
***************
*** 2898,2904 **** BackendStartup(Port *port)
  	bn->pid = pid;
  	bn->cancel_key = MyCancelKey;
  	bn->is_autovacuum = false;
! 	bn->dead_end = (port->canAcceptConnections != CAC_OK &&
  					port->canAcceptConnections != CAC_WAITBACKUP);
  	DLAddHead(BackendList, DLNewElem(bn));
  #ifdef EXEC_BACKEND
--- 2909,2916 ----
  	bn->pid = pid;
  	bn->cancel_key = MyCancelKey;
  	bn->is_autovacuum = false;
! 	bn->dead_end = (!(port->canAcceptConnections == CAC_RECOVERY || 
! 					  port->canAcceptConnections == CAC_OK) &&
  					port->canAcceptConnections != CAC_WAITBACKUP);
  	DLAddHead(BackendList, DLNewElem(bn));
  #ifdef EXEC_BACKEND
***************
*** 3845,3850 **** sigusr1_handler(SIGNAL_ARGS)
--- 3857,3910 ----
  
  	PG_SETMASK(&BlockSig);
  
+ 	if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_START))
+ 	{
+ 		Assert(pmState == PM_STARTUP);
+ 
+ 		/*
+ 		 * Go to shutdown mode if a shutdown request was pending.
+ 		 */
+ 		if (Shutdown > NoShutdown)
+ 		{
+ 			pmState = PM_WAIT_BACKENDS;
+ 			/* PostmasterStateMachine logic does the rest */
+ 		}
+ 		else
+ 		{
+ 			/*
+ 			 * Startup process has entered recovery
+ 			 */
+ 			pmState = PM_RECOVERY;
+ 
+ 			/*
+ 			 * Load the flat authorization file into postmaster's cache. The
+ 			 * startup process won't have recomputed this from the database
+ 			 * yet, so it may change following recovery. We'll reload it
+ 			 * after the startup process ends.
+ 			 */
+ 			load_role();
+ 
+ 			/*
+ 			 * Crank up the background writer.	It doesn't matter if this
+ 			 * fails, we'll just try again later.
+ 			 */
+ 			Assert(BgWriterPID == 0);
+ 			BgWriterPID = StartBackgroundWriter();
+ 
+ 			/*
+ 			 * Likewise, start other special children as needed.
+ 			 */
+ 			Assert(PgStatPID == 0);
+ 			PgStatPID = pgstat_start();
+ 
+ 			/* We can now accept read-only connections */
+ 			ereport(LOG,
+ 				 (errmsg("database system is ready to accept connections")));
+ 			ereport(LOG,
+ 				 (errmsg("database can now be accessed with read only transactions")));
+ 		}
+ 	}
+ 
  	if (CheckPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE))
  	{
  		/*
*** src/backend/utils/init/flatfiles.c
--- src/backend/utils/init/flatfiles.c
***************
*** 678,686 **** write_auth_file(Relation rel_authid, Relation rel_authmem)
  /*
   * This routine is called once during database startup, after completing
   * WAL replay if needed.  Its purpose is to sync the flat files with the
!  * current state of the database tables.  This is particularly important
!  * during PITR operation, since the flat files will come from the
!  * base backup which may be far out of sync with the current state.
   *
   * In theory we could skip rebuilding the flat files if no WAL replay
   * occurred, but it seems best to just do it always.  We have to
--- 678,687 ----
  /*
   * This routine is called once during database startup, after completing
   * WAL replay if needed.  Its purpose is to sync the flat files with the
!  * current state of the database tables.  
!  *
!  * In 8.4 we also run this during xact_redo_commit() if the transaction
!  * wrote a new database or auth flat file. 
   *
   * In theory we could skip rebuilding the flat files if no WAL replay
   * occurred, but it seems best to just do it always.  We have to
***************
*** 716,723 **** BuildFlatFiles(bool database_only)
  	/*
  	 * We don't have any hope of running a real relcache, but we can use the
  	 * same fake-relcache facility that WAL replay uses.
- 	 *
- 	 * No locking is needed because no one else is alive yet.
  	 */
  	rel_db = CreateFakeRelcacheEntry(rnode);
  	write_database_file(rel_db, true);
--- 717,722 ----
***************
*** 832,845 **** AtEOXact_UpdateFlatFiles(bool isCommit)
  	/* Okay to write the files */
  	if (database_file_update_subid != InvalidSubTransactionId)
  	{
! 		database_file_update_subid = InvalidSubTransactionId;
  		write_database_file(drel, false);
  		heap_close(drel, NoLock);
  	}
  
  	if (auth_file_update_subid != InvalidSubTransactionId)
  	{
! 		auth_file_update_subid = InvalidSubTransactionId;
  		write_auth_file(arel, mrel);
  		heap_close(arel, NoLock);
  		heap_close(mrel, NoLock);
--- 831,844 ----
  	/* Okay to write the files */
  	if (database_file_update_subid != InvalidSubTransactionId)
  	{
! 		/* reset database_file_update_subid later during commit */
  		write_database_file(drel, false);
  		heap_close(drel, NoLock);
  	}
  
  	if (auth_file_update_subid != InvalidSubTransactionId)
  	{
! 		/* reset auth_file_update_subid later during commit */
  		write_auth_file(arel, mrel);
  		heap_close(arel, NoLock);
  		heap_close(mrel, NoLock);
*** src/include/access/xact.h
--- src/include/access/xact.h
***************
*** 17,22 ****
--- 17,23 ----
  #include "access/xlog.h"
  #include "nodes/pg_list.h"
  #include "storage/relfilenode.h"
+ #include "utils/snapshot.h"
  #include "utils/timestamp.h"
  
  
***************
*** 84,111 **** typedef void (*SubXactCallback) (SubXactEvent event, SubTransactionId mySubid,
  #define XLOG_XACT_ABORT				0x20
  #define XLOG_XACT_COMMIT_PREPARED	0x30
  #define XLOG_XACT_ABORT_PREPARED	0x40
  
  typedef struct xl_xact_commit
  {
! 	TimestampTz xact_time;		/* time of commit */
! 	int			nrels;			/* number of RelFileNodes */
! 	int			nsubxacts;		/* number of subtransaction XIDs */
! 	/* Array of RelFileNode(s) to drop at commit */
! 	RelFileNode	xnodes[1];		/* VARIABLE LENGTH ARRAY */
! 	/* ARRAY OF COMMITTED SUBTRANSACTION XIDs FOLLOWS */
  } xl_xact_commit;
  
  #define MinSizeOfXactCommit offsetof(xl_xact_commit, xnodes)
  
  typedef struct xl_xact_abort
  {
  	TimestampTz xact_time;		/* time of abort */
  	int			nrels;			/* number of RelFileNodes */
  	int			nsubxacts;		/* number of subtransaction XIDs */
  	/* Array of RelFileNode(s) to drop at abort */
  	RelFileNode	xnodes[1];		/* VARIABLE LENGTH ARRAY */
  	/* ARRAY OF ABORTED SUBTRANSACTION XIDs FOLLOWS */
  } xl_xact_abort;
  
  #define MinSizeOfXactAbort offsetof(xl_xact_abort, xnodes)
  
--- 85,162 ----
  #define XLOG_XACT_ABORT				0x20
  #define XLOG_XACT_COMMIT_PREPARED	0x30
  #define XLOG_XACT_ABORT_PREPARED	0x40
+ #define XLOG_XACT_ASSIGNMENT		0x50
+ #define XLOG_XACT_RUNNING_XACTS		0x60
+ /* 0x70 can also be used, if required */
+ 
+ typedef struct xl_xact_assignment
+ {
+ 	TransactionId	xassign;	/* assigned xid */
+ 	TransactionId	xparent;	/* assigned xid's parent, if any */
+ 	bool			isSubXact;	/* is a subtransaction */
+ 	int				slotId;		/* slotId in procarray */
+ } xl_xact_assignment;
+ 
+ /* 
+  * xl_xact_running_xacts is in utils/snapshot.h so it can be passed
+  * around to the same places as snapshots. Not snapmgr.h
+  */
  
  typedef struct xl_xact_commit
  {
! 	TimestampTz xact_time;		/* time of commit */
! 	int			slotId;			/* slotId in procarray */
! 	uint32		xinfo;			/* info flags (XACT_COMPLETION_*) */
! 	int			nrels;			/* number of RelFileNodes */
! 	int			nsubxacts;		/* number of subtransaction XIDs */
! 	int			nmsgs;			/* number of shared inval msgs */
! 	/* Array of RelFileNode(s) to drop at commit */
! 	RelFileNode	xnodes[1];		/* VARIABLE LENGTH ARRAY */
! 	/* ARRAY OF COMMITTED SUBTRANSACTION XIDs FOLLOWS */
! 	/* ARRAY OF SHARED INVALIDATION MESSAGES FOLLOWS */
  } xl_xact_commit;
  
  #define MinSizeOfXactCommit offsetof(xl_xact_commit, xnodes)
+ #define OffsetSharedInvalInXactCommit() \
+ ( \
+ 	MinSizeOfXactCommit +  \
+ 	(xlrec->nsubxacts * sizeof(TransactionId)) + \
+ 	(xlrec->nrels * sizeof(RelFileNode)) \
+ )
+ 
+ /*
+  * These flags are set in the xinfo fields of transaction
+  * completion WAL records. They indicate a number of actions
+  * that need to occur when emulating transaction completion.
+  * They are named XactCompletion... to differentiate them from
+  * EOXact... routines which run at the end of the original
+  * transaction completion.
+  */
+ #define XACT_COMPLETION_UNMARKED_SUBXIDS		0x01
+ 
+ /* These next states only occur on commit record types */
+ #define XACT_COMPLETION_UPDATE_DB_FILE			0x02
+ #define XACT_COMPLETION_UPDATE_AUTH_FILE		0x04
+ #define XACT_COMPLETION_UPDATE_RELCACHE_FILE	0x08
+ 
+ /* Access macros for above flags */
+ #define XactCompletionHasUnMarkedSubxids(xlrec)		((xlrec)->xinfo & XACT_COMPLETION_UNMARKED_SUBXIDS)
+ #define XactCompletionUpdateDBFile(xlrec) 			((xlrec)->xinfo & XACT_COMPLETION_UPDATE_DB_FILE)
+ #define XactCompletionUpdateAuthFile(xlrec) 		((xlrec)->xinfo & XACT_COMPLETION_UPDATE_AUTH_FILE)
+ #define XactCompletionRelcacheInitFileInval(xlrec)	((xlrec)->xinfo & XACT_COMPLETION_UPDATE_RELCACHE_FILE)
  
  typedef struct xl_xact_abort
  {
  	TimestampTz xact_time;		/* time of abort */
+ 	int			slotId;			/* slotId in procarray */
+ 	uint32		xinfo;			/* info flags (XACT_COMPLETION_*) */
  	int			nrels;			/* number of RelFileNodes */
  	int			nsubxacts;		/* number of subtransaction XIDs */
  	/* Array of RelFileNode(s) to drop at abort */
  	RelFileNode	xnodes[1];		/* VARIABLE LENGTH ARRAY */
  	/* ARRAY OF ABORTED SUBTRANSACTION XIDs FOLLOWS */
  } xl_xact_abort;
+ /* Note the intentional lack of an invalidation message array c.f. commit */
  
  #define MinSizeOfXactAbort offsetof(xl_xact_abort, xnodes)
  
***************
*** 185,190 **** extern TransactionId RecordTransactionCommit(void);
--- 236,252 ----
  
  extern int	xactGetCommittedChildren(TransactionId **ptr);
  
+ extern void LogCurrentRunningXacts(void);
+ extern bool IsRunningXactDataValid(void);
+ extern void GetStandbyInfoForTransaction(RmgrId rmid, uint8 info,
+ 							XLogRecData *rdata,
+ 							TransactionId *xid2, 
+ 							uint16 *info2);
+ 
+ extern void InitRecoveryTransactionEnvironment(void);
+ extern void XactResolveRecoveryConflicts(TransactionId latestRemovedXid, Oid recDatabaseOid);
+ extern void RecordKnownAssignedTransactionIds(XLogRecPtr lsn, XLogRecord *record);
+ 
  extern void xact_redo(XLogRecPtr lsn, XLogRecord *record);
  extern void xact_desc(StringInfo buf, uint8 xl_info, char *rec);
  
*** src/include/access/xlog.h
--- src/include/access/xlog.h
***************
*** 133,139 **** typedef struct XLogRecData
  } XLogRecData;
  
  extern TimeLineID ThisTimeLineID;		/* current TLI */
! extern bool InRecovery;
  extern XLogRecPtr XactLastRecEnd;
  
  /* these variables are GUC parameters related to XLOG */
--- 133,147 ----
  } XLogRecData;
  
  extern TimeLineID ThisTimeLineID;		/* current TLI */
! /*
!  * Prior to 8.4, all activity during recovery was carried out by the Startup
!  * process. This local variable continues to be used in many parts of the
!  * code to indicate actions taken by RecoveryManagers. Other processes that
!  * potentially perform work during recovery should check
!  * IsRecoveryProcessingMode(), see XLogCtl notes in xlog.c
!  */
! extern bool InRecovery;	
! extern bool InArchiveRecovery;
  extern XLogRecPtr XactLastRecEnd;
  
  /* these variables are GUC parameters related to XLOG */
***************
*** 166,171 **** extern bool XLOG_DEBUG;
--- 174,180 ----
  /* These indicate the cause of a checkpoint request */
  #define CHECKPOINT_CAUSE_XLOG	0x0010	/* XLOG consumption */
  #define CHECKPOINT_CAUSE_TIME	0x0020	/* Elapsed time */
+ #define CHECKPOINT_RESTARTPOINT	0x0040	/* Restartpoint during recovery */
  
  /* Checkpoint statistics */
  typedef struct CheckpointStatsData
***************
*** 197,202 **** extern void XLogSetAsyncCommitLSN(XLogRecPtr record);
--- 206,214 ----
  extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
  extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
  
+ extern bool IsRecoveryProcessingMode(void);
+ 
+ 
  extern void UpdateControlFile(void);
  extern Size XLOGShmemSize(void);
  extern void XLOGShmemInit(void);
*** src/include/access/xlog_internal.h
--- src/include/access/xlog_internal.h
***************
*** 17,22 ****
--- 17,23 ----
  #define XLOG_INTERNAL_H
  
  #include "access/xlog.h"
+ #include "catalog/pg_control.h"
  #include "fmgr.h"
  #include "pgtime.h"
  #include "storage/block.h"
***************
*** 245,250 **** extern const RmgrData RmgrTable[];
--- 246,254 ----
  extern pg_time_t GetLastSegSwitchTime(void);
  extern XLogRecPtr RequestXLogSwitch(void);
  
+ extern void CreateRestartPoint(const XLogRecPtr ReadPtr, 
+ 				const CheckPoint *restartPoint, int flags);
+ 
  /*
   * These aren't in xlog.h because I'd rather not include fmgr.h there.
   */
*** src/include/catalog/pg_control.h
--- src/include/catalog/pg_control.h
***************
*** 46,51 **** typedef struct CheckPoint
--- 46,52 ----
  #define XLOG_NOOP						0x20
  #define XLOG_NEXTOID					0x30
  #define XLOG_SWITCH						0x40
+ #define XLOG_RECOVERY_END				0x50
  
  
  /* System status indicator */
***************
*** 102,107 **** typedef struct ControlFileData
--- 103,109 ----
  	CheckPoint	checkPointCopy; /* copy of last check point record */
  
  	XLogRecPtr	minRecoveryPoint;		/* must replay xlog to here */
+ 	XLogRecPtr	minSafeStartPoint;		/* safe point after recovery crashes */
  
  	/*
  	 * This data is used to check for hardware-architecture compatibility of
*** src/include/postmaster/bgwriter.h
--- src/include/postmaster/bgwriter.h
***************
*** 12,17 ****
--- 12,18 ----
  #ifndef _BGWRITER_H
  #define _BGWRITER_H
  
+ #include "catalog/pg_control.h"
  #include "storage/block.h"
  #include "storage/relfilenode.h"
  
***************
*** 25,30 **** extern double CheckPointCompletionTarget;
--- 26,36 ----
  extern void BackgroundWriterMain(void);
  
  extern void RequestCheckpoint(int flags);
+ extern void RequestRestartPoint(const XLogRecPtr ReadPtr, const CheckPoint *restartPoint, bool sendToBGWriter);
+ extern void RequestRestartPointCompletion(void);
+ extern XLogRecPtr GetRedoLocationForArchiveCheckpoint(void);
+ extern bool SetRedoLocationForArchiveCheckpoint(XLogRecPtr redo);
+ 
  extern void CheckpointWriteDelay(int flags, double progress);
  
  extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
*** src/include/storage/pmsignal.h
--- src/include/storage/pmsignal.h
***************
*** 22,27 ****
--- 22,28 ----
   */
  typedef enum
  {
+ 	PMSIGNAL_RECOVERY_START,	/* move to PM_RECOVERY state */
  	PMSIGNAL_PASSWORD_CHANGE,	/* pg_auth file has changed */
  	PMSIGNAL_WAKEN_ARCHIVER,	/* send a NOTIFY signal to xlog archiver */
  	PMSIGNAL_ROTATE_LOGFILE,	/* send SIGUSR1 to syslogger to rotate logfile */