diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index bd6035d..30fea49 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -119,12 +119,26 @@ CheckpointStatsData CheckpointStats;
  */
 TimeLineID	ThisTimeLineID = 0;
 
-/* Are we doing recovery from XLOG? */
+/*
+ * Are we doing recovery from XLOG? 
+ *
+ * This is only ever true in the startup process, when it's replaying WAL.
+ * It's used in functions that need to act differently when called from a
+ * redo function (e.g. skip WAL logging).  To check whether the system is in
+ * recovery regardless of what process you're running in, use
+ * IsRecoveryProcessingMode().
+ */
 bool		InRecovery = false;
 
 /* Are we recovering using offline XLOG archives? */
 static bool InArchiveRecovery = false;
 
+/*
+ * Local copy of shared RecoveryProcessingMode variable. True actually
+ * means "not known, need to check the shared state"
+ */
+static bool LocalRecoveryProcessingMode = true;
+
 /* Was the last xlog file restored from archive, or local? */
 static bool restoredFromArchive = false;
 
@@ -133,16 +147,22 @@ static char *recoveryRestoreCommand = NULL;
 static bool recoveryTarget = false;
 static bool recoveryTargetExact = false;
 static bool recoveryTargetInclusive = true;
-static bool recoveryLogRestartpoints = false;
 static TransactionId recoveryTargetXid;
 static TimestampTz recoveryTargetTime;
 static TimestampTz recoveryLastXTime = 0;
+/*
+ * log_restartpoints lives in shared memory (XLogCtl->recoveryLogRestartpoints)
+ * because bgwriter needs to read it when it performs restartpoints.
+ */
 
 /* if recoveryStopsHere returns true, it saves actual stop xid/time here */
 static TransactionId recoveryStopXid;
 static TimestampTz recoveryStopTime;
 static bool recoveryStopAfter;
 
+/* is the database in consistent state yet? */
+static bool	reachedSafeStartPoint = false;
+
 /*
  * During normal operation, the only timeline we care about is ThisTimeLineID.
  * During recovery, however, things are more complicated.  To simplify life
@@ -313,6 +333,25 @@ typedef struct XLogCtlData
 	int			XLogCacheBlck;	/* highest allocated xlog buffer index */
 	TimeLineID	ThisTimeLineID;
 
+	/*
+	 * SharedRecoveryProcessingMode indicates if we're still in crash or
+	 * archive recovery. It's checked by IsRecoveryProcessingMode()
+	 */
+	bool		SharedRecoveryProcessingMode;
+
+	/*
+	 * During recovery, we keep a copy of the latest checkpoint record
+	 * here. It's used by the background writer when it wants to create
+	 * a restartpoint.
+	 *
+	 * is info_lck spinlock a bit too light-weight to protect this?
+	 */
+	XLogRecPtr	lastCheckPointRecPtr;
+	CheckPoint	lastCheckPoint;
+
+	/* Should restartpoints be logged? Taken from recovery.conf */
+	bool		recoveryLogRestartpoints;
+
 	slock_t		info_lck;		/* locks shared variables shown above */
 } XLogCtlData;
 
@@ -399,6 +438,7 @@ static void XLogArchiveCleanup(const char *xlog);
 static void readRecoveryCommandFile(void);
 static void exitArchiveRecovery(TimeLineID endTLI,
 					uint32 endLogId, uint32 endLogSeg);
+static void exitRecovery(void);
 static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 
@@ -483,6 +523,11 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 	bool		updrqst;
 	bool		doPageWrites;
 	bool		isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
+	bool		isRecoveryEnd = (rmid == RM_XLOG_ID && info == XLOG_RECOVERY_END);
+
+	/* cross-check on whether we should be here or not */
+	if (IsRecoveryProcessingMode() && !isRecoveryEnd)
+		elog(FATAL, "cannot make new WAL entries during recovery");
 
 	/* info's high bits are reserved for use by me */
 	if (info & XLR_INFO_MASK)
@@ -1730,7 +1775,7 @@ XLogFlush(XLogRecPtr record)
 	XLogwrtRqst WriteRqst;
 
 	/* Disabled during REDO */
-	if (InRedo)
+	if (IsRecoveryProcessingMode())
 		return;
 
 	/* Quick exit if already known flushed */
@@ -1818,9 +1863,9 @@ XLogFlush(XLogRecPtr record)
 	 * the bad page is encountered again during recovery then we would be
 	 * unable to restart the database at all!  (This scenario has actually
 	 * happened in the field several times with 7.1 releases. Note that we
-	 * cannot get here while InRedo is true, but if the bad page is brought in
-	 * and marked dirty during recovery then CreateCheckPoint will try to
-	 * flush it at the end of recovery.)
+	 * cannot get here while IsRecoveryProcessingMode() is true, but if the bad
+	 * page is brought in and marked dirty during recovery, then a checkpoint
+	 * performed at the end of recovery will try to flush it.)
 	 *
 	 * The current approach is to ERROR under normal conditions, but only
 	 * WARNING during recovery, so that the system can be brought up even if
@@ -1830,7 +1875,7 @@ XLogFlush(XLogRecPtr record)
 	 * and so we will not force a restart for a bad LSN on a data page.
 	 */
 	if (XLByteLT(LogwrtResult.Flush, record))
-		elog(InRecovery ? WARNING : ERROR,
+		elog(ERROR,
 		"xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
 			 record.xlogid, record.xrecoff,
 			 LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
@@ -2103,7 +2148,8 @@ XLogFileInit(uint32 log, uint32 seg,
 		unlink(tmppath);
 	}
 
-	elog(DEBUG2, "done creating and filling new WAL file");
+	XLogFileName(tmppath, ThisTimeLineID, log, seg);
+	elog(DEBUG2, "done creating and filling new WAL file %s", tmppath);
 
 	/* Set flag to tell caller there was no existent file */
 	*use_existent = false;
@@ -2409,6 +2455,33 @@ XLogFileRead(uint32 log, uint32 seg, int emode)
 					 xlogfname);
 			set_ps_display(activitymsg, false);
 
+			/* 
+			 * Calculate and write out a new safeStartPoint. This defines
+			 * the latest LSN that might appear on-disk while we apply
+			 * the WAL records in this file. If we crash during recovery
+			 * we must reach this point again before we can prove
+			 * database consistency. Not a restartpoint! Restart points
+			 * define where we should start recovery from, if we crash.
+			 */
+			if (InArchiveRecovery)
+			{
+				XLogRecPtr	nextSegRecPtr;
+				uint32		nextLog = log;
+				uint32		nextSeg = seg;
+
+				NextLogSeg(nextLog, nextSeg);
+				nextSegRecPtr.xlogid = nextLog;
+				nextSegRecPtr.xrecoff = nextSeg * XLogSegSize;
+
+				LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+				if (XLByteLT(ControlFile->minSafeStartPoint, nextSegRecPtr))
+				{
+					ControlFile->minSafeStartPoint = nextSegRecPtr;
+					UpdateControlFile();
+				}
+				LWLockRelease(ControlFileLock);
+			}
+
 			return fd;
 		}
 		if (errno != ENOENT)	/* unexpected failure? */
@@ -4592,13 +4665,13 @@ readRecoveryCommandFile(void)
 			/*
 			 * does nothing if a recovery_target is not also set
 			 */
-			if (!parse_bool(tok2, &recoveryLogRestartpoints))
-				  ereport(ERROR,
-							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-					  errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
+			if (!parse_bool(tok2, &XLogCtl->recoveryLogRestartpoints))
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+							errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
 			ereport(LOG,
-					(errmsg("log_restartpoints = %s", tok2)));
-		}
+				(errmsg("log_restartpoints = %s", tok2)));
+ 		}
 		else
 			ereport(FATAL,
 					(errmsg("unrecognized recovery parameter \"%s\"",
@@ -4734,7 +4807,10 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
 
 	/*
 	 * Rename the config file out of the way, so that we don't accidentally
-	 * re-enter archive recovery mode in a subsequent crash.
+	 * re-enter archive recovery mode in a subsequent crash. We have already
+	 * restored all the WAL segments we need from the archive, and we trust
+	 * that they are not going to go away even if we crash. (XXX: should
+	 * we fsync() them all to ensure that?)
 	 */
 	unlink(RECOVERY_COMMAND_DONE);
 	if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
@@ -4876,6 +4952,7 @@ StartupXLOG(void)
 	CheckPoint	checkPoint;
 	bool		wasShutdown;
 	bool		reachedStopPoint = false;
+	bool		performedRecovery = false;
 	bool		haveBackupLabel = false;
 	XLogRecPtr	RecPtr,
 				LastRec,
@@ -4888,6 +4965,8 @@ StartupXLOG(void)
 	uint32		freespace;
 	TransactionId oldestActiveXID;
 
+	XLogCtl->SharedRecoveryProcessingMode = true;
+
 	/*
 	 * Read control file and check XLOG status looks valid.
 	 *
@@ -5108,9 +5187,15 @@ StartupXLOG(void)
 		if (minRecoveryLoc.xlogid != 0 || minRecoveryLoc.xrecoff != 0)
 			ControlFile->minRecoveryPoint = minRecoveryLoc;
 		ControlFile->time = (pg_time_t) time(NULL);
+		/* No need to hold ControlFileLock yet, we aren't up far enough */
 		UpdateControlFile();
 
 		/*
+		 * Reset pgstat data, because it may be invalid after recovery.
+		 */
+		pgstat_reset_all();
+
+		/*
 		 * If there was a backup label file, it's done its job and the info
 		 * has now been propagated into pg_control.  We must get rid of the
 		 * label file so that if we crash during recovery, we'll pick up at
@@ -5155,6 +5240,7 @@ StartupXLOG(void)
 			bool		recoveryContinue = true;
 			bool		recoveryApply = true;
 			ErrorContextCallback errcontext;
+			XLogRecPtr	minSafeStartPoint;
 
 			InRedo = true;
 			ereport(LOG,
@@ -5162,6 +5248,12 @@ StartupXLOG(void)
 							ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
 
 			/*
+			 * Take a local copy of minSafeStartPoint at the beginning of
+			 * recovery, because it's updated as we go.
+			 */
+			minSafeStartPoint = ControlFile->minSafeStartPoint;
+
+			/*
 			 * main redo apply loop
 			 */
 			do
@@ -5217,6 +5309,32 @@ StartupXLOG(void)
 
 				LastRec = ReadRecPtr;
 
+				/*
+				 * Have we reached our safe starting point? If so, we can
+				 * signal postmaster to enter consistent recovery mode.
+				 *
+				 * There are two points in the log we must pass. The first is
+				 * the minRecoveryPoint, which is the LSN at the time the
+				 * base backup was taken that we are about to roll forward from.
+				 * If recovery has ever crashed or was stopped, there is
+				 * another point also: minSafeStartPoint, which is the
+				 * latest LSN that recovery could have reached prior to crash.
+				 */
+				if (!reachedSafeStartPoint && 
+					 XLByteLE(minSafeStartPoint, EndRecPtr) && 
+					 XLByteLE(ControlFile->minRecoveryPoint, EndRecPtr))
+				{
+					reachedSafeStartPoint = true;
+					if (InArchiveRecovery)
+					{
+						ereport(LOG,
+							(errmsg("consistent recovery state reached at %X/%X",
+								EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+						if (IsUnderPostmaster)
+							SendPostmasterSignal(PMSIGNAL_RECOVERY_START);
+					}
+				}
+
 				record = ReadRecord(NULL, LOG);
 			} while (record != NULL && recoveryContinue);
 
@@ -5238,6 +5356,7 @@ StartupXLOG(void)
 			/* there are no WAL records following the checkpoint */
 			ereport(LOG,
 					(errmsg("redo is not required")));
+			reachedSafeStartPoint = true;
 		}
 	}
 
@@ -5251,9 +5370,9 @@ StartupXLOG(void)
 
 	/*
 	 * Complain if we did not roll forward far enough to render the backup
-	 * dump consistent.
+	 * dump consistent and start safely.
 	 */
-	if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
+	if (InRecovery && !reachedSafeStartPoint)
 	{
 		if (reachedStopPoint)	/* stopped because of stop request */
 			ereport(FATAL,
@@ -5375,39 +5494,14 @@ StartupXLOG(void)
 		XLogCheckInvalidPages();
 
 		/*
-		 * Reset pgstat data, because it may be invalid after recovery.
+		 * Finally exit recovery and mark that in WAL. Pre-8.4 we wrote
+		 * a shutdown checkpoint here, but we ask bgwriter to do that now.
 		 */
-		pgstat_reset_all();
+		exitRecovery();
 
-		/*
-		 * Perform a checkpoint to update all our recovery activity to disk.
-		 *
-		 * Note that we write a shutdown checkpoint rather than an on-line
-		 * one. This is not particularly critical, but since we may be
-		 * assigning a new TLI, using a shutdown checkpoint allows us to have
-		 * the rule that TLI only changes in shutdown checkpoints, which
-		 * allows some extra error checking in xlog_redo.
-		 */
-		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
+		performedRecovery = true;
 	}
 
-	/*
-	 * Preallocate additional log files, if wanted.
-	 */
-	PreallocXlogFiles(EndOfLog);
-
-	/*
-	 * Okay, we're officially UP.
-	 */
-	InRecovery = false;
-
-	ControlFile->state = DB_IN_PRODUCTION;
-	ControlFile->time = (pg_time_t) time(NULL);
-	UpdateControlFile();
-
-	/* start the archive_timeout timer running */
-	XLogCtl->Write.lastSegSwitchTime = ControlFile->time;
-
 	/* initialize shared-memory copy of latest checkpoint XID/epoch */
 	XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
 	XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;
@@ -5441,6 +5535,67 @@ StartupXLOG(void)
 		readRecordBuf = NULL;
 		readRecordBufSize = 0;
 	}
+
+	/*
+	 * If we had to replay any WAL records, request a checkpoint. This isn't
+	 * strictly necessary: if we crash now, the recovery will simply restart
+	 * from the same point where it started this time around (or from the
+	 * last restartpoint). The control file is left in DB_IN_*_RECOVERY
+	 * state; the first checkpoint will change that to DB_IN_PRODUCTION.
+	 */
+	if (performedRecovery)
+	{
+		/*
+		 * Okay, we can come up now. Allow others to write WAL.
+		 */
+		XLogCtl->SharedRecoveryProcessingMode = false;
+
+		RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE |
+						  CHECKPOINT_STARTUP);
+	}
+	else
+	{
+		/*
+		 * No recovery, so let's just get on with it. 
+		 */
+		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+		ControlFile->state = DB_IN_PRODUCTION;
+		ControlFile->time = (pg_time_t) time(NULL);
+		UpdateControlFile();
+		LWLockRelease(ControlFileLock);
+
+		/*
+		 * Okay, we're officially UP.
+		 */
+		XLogCtl->SharedRecoveryProcessingMode = false;
+	}
+
+	/* start the archive_timeout timer running */
+	XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
+
+}
+
+/*
+ * IsRecoveryProcessingMode()
+ *
+ * Cheap check for whether the system is still in recovery.  The shared flag
+ * is re-read on every call until it is first seen false; from then on the
+ * answer is cached locally.  The local flag starts out true, meaning "not
+ * known yet", so no separate initialisation step is needed.
+ */
+bool
+IsRecoveryProcessingMode(void)
+{
+	/* use volatile pointer to prevent code rearrangement */
+	volatile XLogCtlData *xlogctl = XLogCtl;
+
+	/* Recovery already known to be over: answer from the local cache. */
+	if (!LocalRecoveryProcessingMode)
+		return false;
+
+	/* Otherwise refresh the local copy from shared memory and report it. */
+	LocalRecoveryProcessingMode = xlogctl->SharedRecoveryProcessingMode;
+	return LocalRecoveryProcessingMode;
 }
 
 /*
@@ -5696,22 +5851,27 @@ ShutdownXLOG(int code, Datum arg)
  * Log start of a checkpoint.
  */
 static void
-LogCheckpointStart(int flags)
+LogCheckpointStart(int flags, bool restartpoint)
 {
-	elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
-		 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
-		 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
-		 (flags & CHECKPOINT_FORCE) ? " force" : "",
-		 (flags & CHECKPOINT_WAIT) ? " wait" : "",
-		 (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
-		 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
+	if (restartpoint)
+		elog(LOG, "restartpoint starting:%s",
+			 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "");
+	else
+		elog(LOG, "checkpoint starting:%s%s%s%s%s%s%s",
+			 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
+			 (flags & CHECKPOINT_STARTUP) ? " startup" : "",
+			 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
+			 (flags & CHECKPOINT_FORCE) ? " force" : "",
+			 (flags & CHECKPOINT_WAIT) ? " wait" : "",
+			 (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
+			 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
 }
 
 /*
  * Log end of a checkpoint.
  */
 static void
-LogCheckpointEnd(void)
+LogCheckpointEnd(int flags, bool restartpoint)
 {
 	long		write_secs,
 				sync_secs,
@@ -5734,17 +5894,26 @@ LogCheckpointEnd(void)
 						CheckpointStats.ckpt_sync_end_t,
 						&sync_secs, &sync_usecs);
 
-	elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
-		 "%d transaction log file(s) added, %d removed, %d recycled; "
-		 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
-		 CheckpointStats.ckpt_bufs_written,
-		 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
-		 CheckpointStats.ckpt_segs_added,
-		 CheckpointStats.ckpt_segs_removed,
-		 CheckpointStats.ckpt_segs_recycled,
-		 write_secs, write_usecs / 1000,
-		 sync_secs, sync_usecs / 1000,
-		 total_secs, total_usecs / 1000);
+	if (restartpoint)
+		elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
+			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+			 CheckpointStats.ckpt_bufs_written,
+			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
+			 write_secs, write_usecs / 1000,
+			 sync_secs, sync_usecs / 1000,
+			 total_secs, total_usecs / 1000);
+	else
+		elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
+			 "%d transaction log file(s) added, %d removed, %d recycled; "
+			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+			 CheckpointStats.ckpt_bufs_written,
+			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
+			 CheckpointStats.ckpt_segs_added,
+			 CheckpointStats.ckpt_segs_removed,
+			 CheckpointStats.ckpt_segs_recycled,
+			 write_secs, write_usecs / 1000,
+			 sync_secs, sync_usecs / 1000,
+			 total_secs, total_usecs / 1000);
 }
 
 /*
@@ -5800,9 +5969,11 @@ CreateCheckPoint(int flags)
 
 	if (shutdown)
 	{
+		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 		ControlFile->state = DB_SHUTDOWNING;
 		ControlFile->time = (pg_time_t) time(NULL);
 		UpdateControlFile();
+		LWLockRelease(ControlFileLock);
 	}
 
 	/*
@@ -5906,7 +6077,7 @@ CreateCheckPoint(int flags)
 	 * to log anything if we decided to skip the checkpoint.
 	 */
 	if (log_checkpoints)
-		LogCheckpointStart(flags);
+		LogCheckpointStart(flags, false);
 
 	TRACE_POSTGRESQL_CHECKPOINT_START(flags);
 
@@ -6010,11 +6181,18 @@ CreateCheckPoint(int flags)
 	XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
 
 	/*
-	 * Update the control file.
+	 * Update the control file. In 8.4, this routine becomes the primary
+	 * point for recording changes of state in the control file at the 
+	 * end of recovery. Postmaster state already shows us being in 
+	 * normal running mode, but it is only after this point that we
+	 * are completely free of reperforming a recovery if we crash.  Note
+	 * that this is executed by bgwriter after the death of Startup process.
 	 */
 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 	if (shutdown)
 		ControlFile->state = DB_SHUTDOWNED;
+	else
+		ControlFile->state = DB_IN_PRODUCTION;
 	ControlFile->prevCheckPoint = ControlFile->checkPoint;
 	ControlFile->checkPoint = ProcLastRecPtr;
 	ControlFile->checkPointCopy = checkPoint;
@@ -6068,12 +6246,11 @@ CreateCheckPoint(int flags)
 	 * in subtrans.c).	During recovery, though, we mustn't do this because
 	 * StartupSUBTRANS hasn't been called yet.
 	 */
-	if (!InRecovery)
-		TruncateSUBTRANS(GetOldestXmin(true, false));
+	TruncateSUBTRANS(GetOldestXmin(true, false));
 
 	/* All real work is done, but log before releasing lock. */
 	if (log_checkpoints)
-		LogCheckpointEnd();
+		LogCheckpointEnd(flags, false);
 
         TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
                                 NBuffers, CheckpointStats.ckpt_segs_added,
@@ -6101,32 +6278,16 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 }
 
 /*
- * Set a recovery restart point if appropriate
- *
- * This is similar to CreateCheckPoint, but is used during WAL recovery
- * to establish a point from which recovery can roll forward without
- * replaying the entire recovery log.  This function is called each time
- * a checkpoint record is read from XLOG; it must determine whether a
- * restartpoint is needed or not.
+ * Store checkpoint record in shared memory, so that it can be used as a 
+ * restartpoint. This function is called each time a checkpoint record is
+ * read from XLOG.
  */
 static void
 RecoveryRestartPoint(const CheckPoint *checkPoint)
 {
-	int			elapsed_secs;
 	int			rmid;
-
-	/*
-	 * Do nothing if the elapsed time since the last restartpoint is less than
-	 * half of checkpoint_timeout.	(We use a value less than
-	 * checkpoint_timeout so that variations in the timing of checkpoints on
-	 * the master, or speed of transmission of WAL segments to a slave, won't
-	 * make the slave skip a restartpoint once it's synced with the master.)
-	 * Checking true elapsed time keeps us from doing restartpoints too often
-	 * while rapidly scanning large amounts of WAL.
-	 */
-	elapsed_secs = (pg_time_t) time(NULL) - ControlFile->time;
-	if (elapsed_secs < CheckPointTimeout / 2)
-		return;
+	/* use volatile pointer to prevent code rearrangement */
+	volatile XLogCtlData *xlogctl = XLogCtl;
 
 	/*
 	 * Is it safe to checkpoint?  We must ask each of the resource managers
@@ -6148,28 +6309,111 @@ RecoveryRestartPoint(const CheckPoint *checkPoint)
 	}
 
 	/*
-	 * OK, force data out to disk
+	 * Copy the checkpoint record to shared memory, so that bgwriter can
+	 * use it the next time it wants to perform a restartpoint.
+	 */
+	SpinLockAcquire(&xlogctl->info_lck);
+	XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
+	memcpy(&XLogCtl->lastCheckPoint, checkPoint, sizeof(CheckPoint));
+	SpinLockRelease(&xlogctl->info_lck);
+
+	/*
+	 * XXX: Should we try to perform restartpoints if we're not in consistent
+	 * recovery? The bgwriter isn't doing it for us in that case.
+	 */
+}
+
+/*
+ * CreateRestartPoint: like CreateCheckPoint, but used during WAL recovery to
+ * establish a point from which recovery can roll forward without replaying
+ * the entire recovery log, using the checkpoint record saved in XLogCtl.
+ */
+void
+CreateRestartPoint(int flags)
+{
+	XLogRecPtr lastCheckPointRecPtr;
+	CheckPoint lastCheckPoint;
+	/* use volatile pointer to prevent code rearrangement */
+	volatile XLogCtlData *xlogctl = XLogCtl;
+
+	/* Get a local copy of the last checkpoint record. */
+	SpinLockAcquire(&xlogctl->info_lck);
+	lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
+	memcpy(&lastCheckPoint, &XLogCtl->lastCheckPoint, sizeof(CheckPoint));
+	SpinLockRelease(&xlogctl->info_lck);
+
+	/*
+	 * If the last checkpoint record we've replayed is already our last
+	 * restartpoint, we're done.
 	 */
-	CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE);
+	if (XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo))
+	{
+		ereport(DEBUG2,
+				(errmsg("skipping restartpoint, already performed at %X/%X",
+						lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
+		return;
+	}
 
 	/*
-	 * Update pg_control so that any subsequent crash will restart from this
-	 * checkpoint.	Note: ReadRecPtr gives the XLOG address of the checkpoint
-	 * record itself.
+	 * Acquire CheckpointLock to ensure only one restartpoint happens at a time.
+	 * We rely on this lock to ensure that the startup process doesn't exit
+	 * Recovery while we are half way through a restartpoint. XXX ?
 	 */
+	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+
+	/* Check that we're still in recovery mode. */
+	if (!IsRecoveryProcessingMode())
+	{
+		ereport(DEBUG2,
+				(errmsg("skipping restartpoint, recovery has already ended")));
+		LWLockRelease(CheckpointLock);
+		return;
+	}
+
+	if (XLogCtl->recoveryLogRestartpoints)
+	{
+		/*
+		 * Prepare to accumulate statistics.
+		 */
+		MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
+		CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
+
+		LogCheckpointStart(flags, true);
+	}
+
+	CheckPointGuts(lastCheckPoint.redo, flags);
+
+	/*
+	 * Update pg_control to point at this checkpoint, using current time
+	 */
+	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
 	ControlFile->prevCheckPoint = ControlFile->checkPoint;
-	ControlFile->checkPoint = ReadRecPtr;
-	ControlFile->checkPointCopy = *checkPoint;
+	ControlFile->checkPoint = lastCheckPointRecPtr;
+	ControlFile->checkPointCopy = lastCheckPoint;
 	ControlFile->time = (pg_time_t) time(NULL);
 	UpdateControlFile();
+	LWLockRelease(ControlFileLock);
 
-	ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
+	/*
+	 * Currently, there is no need to truncate pg_subtrans during recovery.
+	 * If we did that, we would need to have called StartupSUBTRANS()
+	 * already, and then TruncateSUBTRANS() would go here.
+	 */
+
+	/* All real work is done, but log before releasing lock. */
+	if (XLogCtl->recoveryLogRestartpoints)
+		LogCheckpointEnd(flags, true);
+
+	ereport((XLogCtl->recoveryLogRestartpoints ? LOG : DEBUG2),
 			(errmsg("recovery restart point at %X/%X",
-					checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
+					lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
+	/* NOTE(review): recoveryLastXTime is process-local; is it set in bgwriter? */
 	if (recoveryLastXTime)
-		ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
-				(errmsg("last completed transaction was at log time %s",
-						timestamptz_to_str(recoveryLastXTime))));
+		ereport((XLogCtl->recoveryLogRestartpoints ? LOG : DEBUG2),
+			(errmsg("last completed transaction was at log time %s",
+					timestamptz_to_str(recoveryLastXTime))));
+
+	LWLockRelease(CheckpointLock);
+}
 
 /*
@@ -6234,7 +6478,43 @@ RequestXLogSwitch(void)
 }
 
 /*
+ * exitRecovery()
+ *
+ * Leave recovery state: write an XLOG_RECOVERY_END record whose payload is
+ * ThisTimeLineID (the only record type that can record a timelineID change)
+ * and clear InRecovery. Caller must already have set ThisTimeLineID, if needed.
+ */
+static void
+exitRecovery(void)
+{
+	XLogRecData rdata;
+
+	rdata.buffer = InvalidBuffer;
+	rdata.data = (char *) (&ThisTimeLineID);
+	rdata.len = sizeof(TimeLineID);
+	rdata.next = NULL;
+
+	/*
+	 * XLOG_RECOVERY_END is the only record XLogInsert() accepts while
+	 * IsRecoveryProcessingMode() is true, so nobody else can write WAL
+	 * until after we have changed state.
+	 */
+	(void) XLogInsert(RM_XLOG_ID, XLOG_RECOVERY_END, &rdata);
+
+	/*
+	 * We don't XLogFlush() here; otherwise we'd end up zeroing the WAL
+	 * file ourselves. Just let bgwriter's forthcoming checkpoint do
+	 * that for us.
+	 */
+
+	InRecovery = false;
+}
+
+/*
  * XLOG resource manager's routines
+ *
+ * Definitions of message info are in include/catalog/pg_control.h,
+ * though not all messages relate to control file processing.
  */
 void
 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
@@ -6272,21 +6552,38 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
 		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
 
 		/*
-		 * TLI may change in a shutdown checkpoint, but it shouldn't decrease
+		 * TLI no longer changes at shutdown checkpoint, since as of 8.4,
+		 * shutdown checkpoints only occur at shutdown. Much less confusing.
 		 */
-		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
+
+		RecoveryRestartPoint(&checkPoint);
+	}
+	else if (info == XLOG_RECOVERY_END)
+	{
+		TimeLineID	tli;
+
+		memcpy(&tli, XLogRecGetData(record), sizeof(TimeLineID));
+
+		/*
+		 * TLI may change when recovery ends, but it shouldn't decrease.
+		 *
+		 * This is the only WAL record that can tell us to change timelineID
+		 * while we process WAL records. 
+		 *
+		 * We can *choose* to stop recovery at any point, generating a
+		 * new timelineID which is recorded using this record type.
+		 */
+		if (tli != ThisTimeLineID)
 		{
-			if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
+			if (tli < ThisTimeLineID ||
 				!list_member_int(expectedTLIs,
-								 (int) checkPoint.ThisTimeLineID))
+								 (int) tli))
 				ereport(PANIC,
-						(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
-								checkPoint.ThisTimeLineID, ThisTimeLineID)));
+						(errmsg("unexpected timeline ID %u (after %u) at recovery end record",
+								tli, ThisTimeLineID)));
 			/* Following WAL records should be run with new TLI */
-			ThisTimeLineID = checkPoint.ThisTimeLineID;
+			ThisTimeLineID = tli;
 		}
-
-		RecoveryRestartPoint(&checkPoint);
 	}
 	else if (info == XLOG_CHECKPOINT_ONLINE)
 	{
@@ -6309,7 +6606,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
 		ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
 		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
 
-		/* TLI should not change in an on-line checkpoint */
+		/* TLI must not change at a checkpoint */
 		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
 			ereport(PANIC,
 					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 6a0cd4e..428a440 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -49,6 +49,7 @@
 #include <unistd.h>
 
 #include "access/xlog_internal.h"
+#include "catalog/pg_control.h"
 #include "libpq/pqsignal.h"
 #include "miscadmin.h"
 #include "pgstat.h"
@@ -197,6 +198,9 @@ BackgroundWriterMain(void)
 {
 	sigjmp_buf	local_sigjmp_buf;
 	MemoryContext bgwriter_context;
+	bool		BgWriterRecoveryMode;
+	/* use volatile pointer to prevent code rearrangement */
+	volatile BgWriterShmemStruct *bgs = BgWriterShmem;
 
 	BgWriterShmem->bgwriter_pid = MyProcPid;
 	am_bg_writer = true;
@@ -355,6 +359,27 @@ BackgroundWriterMain(void)
 	 */
 	PG_SETMASK(&UnBlockSig);
 
+	BgWriterRecoveryMode = IsRecoveryProcessingMode();
+
+	if (BgWriterRecoveryMode)
+		elog(DEBUG1, "bgwriter starting during recovery");
+	else
+		InitXLOGAccess();
+
+	/*
+	 * If someone requested a checkpoint before we started up, process that.
+	 *
+	 * This check exists primarily for crash recovery: after the startup
+	 * process is finished with WAL replay, it will request a checkpoint, but
+	 * the background writer might not have started yet. This check will
+	 * actually not notice a checkpoint that's been requested without any
+	 * flags, but it's good enough for the startup checkpoint.
+	 */
+	SpinLockAcquire(&bgs->ckpt_lck);
+	if (bgs->ckpt_flags)
+		checkpoint_requested = true;
+	SpinLockRelease(&bgs->ckpt_lck);
+
 	/*
 	 * Loop forever
 	 */
@@ -396,7 +421,8 @@ BackgroundWriterMain(void)
 			 */
 			ExitOnAnyError = true;
 			/* Close down the database */
-			ShutdownXLOG(0, 0);
+			if (!BgWriterRecoveryMode)
+				ShutdownXLOG(0, 0);
 			/* Normal exit from the bgwriter is here */
 			proc_exit(0);		/* done */
 		}
@@ -418,14 +444,26 @@ BackgroundWriterMain(void)
 		}
 
 		/*
+		 * Check if we've exited recovery. We do this after determining
+		 * whether to perform a checkpoint or not, to be sure that we
+		 * perform a real checkpoint and not a restartpoint, if someone
+		 * (like the startup process!) requested a checkpoint immediately
+		 * after exiting recovery.
+		 */
+ 		if (BgWriterRecoveryMode && !IsRecoveryProcessingMode())
+  		{
+			elog(DEBUG1, "bgwriter changing from recovery to normal mode");
+ 
+			InitXLOGAccess();
+			BgWriterRecoveryMode = false;
+		}
+
+		/*
 		 * Do a checkpoint if requested, otherwise do one cycle of
 		 * dirty-buffer writing.
 		 */
 		if (do_checkpoint)
 		{
-			/* use volatile pointer to prevent code rearrangement */
-			volatile BgWriterShmemStruct *bgs = BgWriterShmem;
-
 			/*
 			 * Atomically fetch the request flags to figure out what kind of a
 			 * checkpoint we should perform, and increase the started-counter
@@ -444,7 +482,8 @@ BackgroundWriterMain(void)
 			 * implementation will not generate warnings caused by
 			 * CheckPointTimeout < CheckPointWarning.
 			 */
-			if ((flags & CHECKPOINT_CAUSE_XLOG) &&
+			if (!BgWriterRecoveryMode &&
+				(flags & CHECKPOINT_CAUSE_XLOG) &&
 				elapsed_secs < CheckPointWarning)
 				ereport(LOG,
 						(errmsg("checkpoints are occurring too frequently (%d seconds apart)",
@@ -455,14 +494,18 @@ BackgroundWriterMain(void)
 			 * Initialize bgwriter-private variables used during checkpoint.
 			 */
 			ckpt_active = true;
-			ckpt_start_recptr = GetInsertRecPtr();
+			if (!BgWriterRecoveryMode)
+				ckpt_start_recptr = GetInsertRecPtr();
 			ckpt_start_time = now;
 			ckpt_cached_elapsed = 0;
 
 			/*
 			 * Do the checkpoint.
 			 */
-			CreateCheckPoint(flags);
+			if (!BgWriterRecoveryMode)
+				CreateCheckPoint(flags);
+			else
+				CreateRestartPoint(flags);
 
 			/*
 			 * After any checkpoint, close all smgr files.	This is so we
@@ -507,7 +550,7 @@ CheckArchiveTimeout(void)
 	pg_time_t	now;
 	pg_time_t	last_time;
 
-	if (XLogArchiveTimeout <= 0)
+	if (XLogArchiveTimeout <= 0 || IsRecoveryProcessingMode())	/* no WAL is written during recovery */
 		return;
 
 	now = (pg_time_t) time(NULL);
@@ -586,7 +629,8 @@ BgWriterNap(void)
 		(ckpt_active ? ImmediateCheckpointRequested() : checkpoint_requested))
 			break;
 		pg_usleep(1000000L);
-		AbsorbFsyncRequests();
+		if (!IsRecoveryProcessingMode())
+			AbsorbFsyncRequests();
 		udelay -= 1000000L;
 	}
 
@@ -714,16 +758,19 @@ IsCheckpointOnSchedule(double progress)
 	 * However, it's good enough for our purposes, we're only calculating an
 	 * estimate anyway.
 	 */
-	recptr = GetInsertRecPtr();
-	elapsed_xlogs =
-		(((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
-		 ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
-		CheckPointSegments;
-
-	if (progress < elapsed_xlogs)
+	if (!IsRecoveryProcessingMode())
 	{
-		ckpt_cached_elapsed = elapsed_xlogs;
-		return false;
+		recptr = GetInsertRecPtr();
+		elapsed_xlogs =
+			(((double) (int32) (recptr.xlogid - ckpt_start_recptr.xlogid)) * XLogSegsPerFile +
+			 ((double) recptr.xrecoff - (double) ckpt_start_recptr.xrecoff) / XLogSegSize) /
+			CheckPointSegments;
+
+		if (progress < elapsed_xlogs)
+		{
+			ckpt_cached_elapsed = elapsed_xlogs;
+			return false;
+		}
 	}
 
 	/*
@@ -850,6 +897,7 @@ BgWriterShmemInit(void)
  *
  * flags is a bitwise OR of the following:
  *	CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
+ *	CHECKPOINT_STARTUP: checkpoint is for database startup.
  *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
  *		ignoring checkpoint_completion_target parameter.
  *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occured
@@ -916,6 +964,18 @@ RequestCheckpoint(int flags)
 	{
 		if (BgWriterShmem->bgwriter_pid == 0)
 		{
+			/*
+			 * The only difference between a startup checkpoint and a normal
+			 * online checkpoint is that it's quite normal for the bgwriter
+			 * to not be up yet when the startup checkpoint is requested.
+			 * (it might be, though). That's ok, background writer will
+			 * perform the checkpoint as soon as it starts up.
+			 */
+			if (flags & CHECKPOINT_STARTUP)
+			{
+				Assert(!(flags & CHECKPOINT_WAIT));
+				break;
+			}
 			if (ntries >= 20)		/* max wait 2.0 sec */
 			{
 				elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 3380b80..221c9b2 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -228,7 +228,18 @@ static bool FatalError = false; /* T if recovering from backend crash */
 
 /*
  * We use a simple state machine to control startup, shutdown, and
- * crash recovery (which is rather like shutdown followed by startup).
+ * recovery.
+ *
+ * Recovery is split into two phases: crash recovery and consistent (archive)
+ * recovery.  The startup process begins with crash recovery, replaying WAL
+ * until a self-consistent database state is reached. At that point, it
+ * signals postmaster, and we switch to consistent recovery phase. The
+ * background writer is launched, while the startup process continues
+ * applying WAL.  We could start accepting connections to perform read-only
+ * queries at this point, if we had the infrastructure to do that. When the
+ * startup process exits, we switch to PM_RUN state. The startup process can
+ * also skip the consistent recovery altogether, as it will during normal
+ * startup when there's no recovery to be done, for example.
  *
  * Normal child backends can only be launched when we are in PM_RUN state.
  * (We also allow it in PM_WAIT_BACKUP state, but only for superusers.)
@@ -254,6 +265,7 @@ typedef enum
 {
 	PM_INIT,					/* postmaster starting */
 	PM_STARTUP,					/* waiting for startup subprocess */
+	PM_RECOVERY,				/* consistent recovery mode */
 	PM_RUN,						/* normal "database is alive" state */
 	PM_WAIT_BACKUP,				/* waiting for online backup mode to end */
 	PM_WAIT_BACKENDS,			/* waiting for live backends to exit */
@@ -1302,7 +1314,7 @@ ServerLoop(void)
 		 * state that prevents it, start one.  It doesn't matter if this
 		 * fails, we'll just try again later.
 		 */
-		if (BgWriterPID == 0 && pmState == PM_RUN)
+		if (BgWriterPID == 0 && (pmState == PM_RUN || pmState == PM_RECOVERY))
 			BgWriterPID = StartBackgroundWriter();
 
 		/*
@@ -2116,7 +2128,7 @@ reaper(SIGNAL_ARGS)
 		if (pid == StartupPID)
 		{
 			StartupPID = 0;
-			Assert(pmState == PM_STARTUP);
+			Assert(pmState == PM_STARTUP || pmState == PM_RECOVERY);
 
 			/* FATAL exit of startup is treated as catastrophic */
 			if (!EXIT_STATUS_0(exitstatus))
@@ -2157,11 +2169,12 @@ reaper(SIGNAL_ARGS)
 			load_role();
 
 			/*
-			 * Crank up the background writer.	It doesn't matter if this
-			 * fails, we'll just try again later.
+			 * Crank up the background writer, if we didn't do that already
+			 * when we entered consistent recovery phase.  It doesn't matter
+			 * if this fails, we'll just try again later.
 			 */
-			Assert(BgWriterPID == 0);
-			BgWriterPID = StartBackgroundWriter();
+			if (BgWriterPID == 0)
+				BgWriterPID = StartBackgroundWriter();
 
 			/*
 			 * Likewise, start other special children as needed.  In a restart
@@ -3847,6 +3860,51 @@ sigusr1_handler(SIGNAL_ARGS)
 
 	PG_SETMASK(&BlockSig);
 
+	if (CheckPostmasterSignal(PMSIGNAL_RECOVERY_START))
+	{
+		Assert(pmState == PM_STARTUP);
+
+		/*
+		 * Go to shutdown mode if a shutdown request was pending.
+		 */
+		if (Shutdown > NoShutdown)
+		{
+			pmState = PM_WAIT_BACKENDS;
+			/* PostmasterStateMachine logic does the rest */
+		}
+		else
+		{
+			/*
+			 * Startup process has entered recovery
+			 */
+			pmState = PM_RECOVERY;
+
+			/*
+			 * Load the flat authorization file into postmaster's cache. The
+			 * startup process won't have recomputed this from the database yet,
+			 * so it may change following recovery.
+			 */
+			load_role();
+
+			/*
+			 * Crank up the background writer.	It doesn't matter if this
+			 * fails, we'll just try again later.
+			 */
+			Assert(BgWriterPID == 0);
+			BgWriterPID = StartBackgroundWriter();
+
+			/*
+			 * Likewise, start other special children as needed.
+			 */
+			Assert(PgStatPID == 0);
+			PgStatPID = pgstat_start();
+
+			/* XXX at this point we could accept read-only connections */
+			ereport(DEBUG1,
+				 (errmsg("database system is in consistent recovery mode")));
+		}
+	}
+
 	if (CheckPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE))
 	{
 		/*
diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README
index 62b22bd..a7b81e3 100644
--- a/src/backend/storage/buffer/README
+++ b/src/backend/storage/buffer/README
@@ -268,3 +268,12 @@ out (and anyone else who flushes buffer contents to disk must do so too).
 This ensures that the page image transferred to disk is reasonably consistent.
 We might miss a hint-bit update or two but that isn't a problem, for the same
 reasons mentioned under buffer access rules.
+
+As of 8.4, background writer starts during recovery mode when there is
+some form of potentially extended recovery to perform. It performs an
+identical service to normal processing, except that checkpoints it
+writes are technically restartpoints. Flushing outstanding WAL for dirty
+buffers is also skipped, though there shouldn't ever be new WAL entries
+at that time in any case. We could choose to start background writer
+immediately but we hold off until we can prove the database is in a 
+consistent state so that postmaster has a single, clean state change.
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index 4ea849d..3bba50a 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -197,6 +197,9 @@ main(int argc, char *argv[])
 	printf(_("Minimum recovery ending location:     %X/%X\n"),
 		   ControlFile.minRecoveryPoint.xlogid,
 		   ControlFile.minRecoveryPoint.xrecoff);
+	printf(_("Minimum safe starting location:       %X/%X\n"),
+		   ControlFile.minSafeStartPoint.xlogid,
+		   ControlFile.minSafeStartPoint.xrecoff);
 	printf(_("Maximum data alignment:               %u\n"),
 		   ControlFile.maxAlign);
 	/* we don't print floatFormat since can't say much useful about it */
diff --git a/src/bin/pg_resetxlog/pg_resetxlog.c b/src/bin/pg_resetxlog/pg_resetxlog.c
index 51cdde1..b20d4bd 100644
--- a/src/bin/pg_resetxlog/pg_resetxlog.c
+++ b/src/bin/pg_resetxlog/pg_resetxlog.c
@@ -603,6 +603,8 @@ RewriteControlFile(void)
 	ControlFile.prevCheckPoint.xrecoff = 0;
 	ControlFile.minRecoveryPoint.xlogid = 0;
 	ControlFile.minRecoveryPoint.xrecoff = 0;
+	ControlFile.minSafeStartPoint.xlogid = 0;
+	ControlFile.minSafeStartPoint.xrecoff = 0;
 
 	/* Now we can force the recorded xlog seg size to the right thing. */
 	ControlFile.xlog_seg_size = XLogSegSize;
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 6913f7c..6f58b80 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -133,7 +133,16 @@ typedef struct XLogRecData
 } XLogRecData;
 
 extern TimeLineID ThisTimeLineID;		/* current TLI */
-extern bool InRecovery;
+
+/* 
+ * Prior to 8.4, all activity during recovery was carried out by the Startup
+ * process. This local variable continues to be used in many parts of the
+ * code to indicate actions taken by RecoveryManagers. Other processes that
+ * potentially perform work during recovery should check
+ * IsRecoveryProcessingMode(), see XLogCtl notes in xlog.c
+ */
+extern bool InRecovery;	
+										
 extern XLogRecPtr XactLastRecEnd;
 
 /* these variables are GUC parameters related to XLOG */
@@ -161,11 +170,12 @@ extern bool XLOG_DEBUG;
 #define CHECKPOINT_IS_SHUTDOWN	0x0001	/* Checkpoint is for shutdown */
 #define CHECKPOINT_IMMEDIATE	0x0002	/* Do it without delays */
 #define CHECKPOINT_FORCE		0x0004	/* Force even if no activity */
+#define CHECKPOINT_STARTUP		0x0008	/* Startup checkpoint */
 /* These are important to RequestCheckpoint */
-#define CHECKPOINT_WAIT			0x0008	/* Wait for completion */
+#define CHECKPOINT_WAIT			0x0010	/* Wait for completion */
 /* These indicate the cause of a checkpoint request */
-#define CHECKPOINT_CAUSE_XLOG	0x0010	/* XLOG consumption */
-#define CHECKPOINT_CAUSE_TIME	0x0020	/* Elapsed time */
+#define CHECKPOINT_CAUSE_XLOG	0x0020	/* XLOG consumption */
+#define CHECKPOINT_CAUSE_TIME	0x0040	/* Elapsed time */
 
 /* Checkpoint statistics */
 typedef struct CheckpointStatsData
@@ -199,6 +209,8 @@ extern void RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup);
 extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
 extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
 
+extern bool IsRecoveryProcessingMode(void);
+
 extern void UpdateControlFile(void);
 extern Size XLOGShmemSize(void);
 extern void XLOGShmemInit(void);
@@ -207,6 +219,7 @@ extern void StartupXLOG(void);
 extern void ShutdownXLOG(int code, Datum arg);
 extern void InitXLOGAccess(void);
 extern void CreateCheckPoint(int flags);
+extern void CreateRestartPoint(int flags);
 extern void XLogPutNextOid(Oid nextOid);
 extern XLogRecPtr GetRedoRecPtr(void);
 extern XLogRecPtr GetInsertRecPtr(void);
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index 400f32c..e69c8ec 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -21,7 +21,7 @@
 
 
 /* Version identifier for this pg_control format */
-#define PG_CONTROL_VERSION	843
+#define PG_CONTROL_VERSION	847
 
 /*
  * Body of CheckPoint XLOG records.  This is declared here because we keep
@@ -46,7 +46,7 @@ typedef struct CheckPoint
 #define XLOG_NOOP						0x20
 #define XLOG_NEXTOID					0x30
 #define XLOG_SWITCH						0x40
-
+#define XLOG_RECOVERY_END			0x50
 
 /* System status indicator */
 typedef enum DBState
@@ -102,6 +102,7 @@ typedef struct ControlFileData
 	CheckPoint	checkPointCopy; /* copy of last check point record */
 
 	XLogRecPtr	minRecoveryPoint;		/* must replay xlog to here */
+	XLogRecPtr	minSafeStartPoint;		/* safe point after recovery crashes */
 
 	/*
 	 * This data is used to check for hardware-architecture compatibility of
diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h
index 3101092..1904187 100644
--- a/src/include/storage/pmsignal.h
+++ b/src/include/storage/pmsignal.h
@@ -22,6 +22,7 @@
  */
 typedef enum
 {
+	PMSIGNAL_RECOVERY_START,	/* move to PM_RECOVERY state */
 	PMSIGNAL_PASSWORD_CHANGE,	/* pg_auth file has changed */
 	PMSIGNAL_WAKEN_ARCHIVER,	/* send a NOTIFY signal to xlog archiver */
 	PMSIGNAL_ROTATE_LOGFILE,	/* send SIGUSR1 to syslogger to rotate logfile */