WAL Restore process during recovery

Started by Simon Riggsalmost 14 years ago14 messages

Simon Riggs

simon@2ndQuadrant.com

almost 14 years ago

1 attachment(s)

WALRestore process asynchronously executes restore_command while
recovery continues working.

Overlaps downloading of next WAL file to reduce time delays in file
based archive recovery.

Handles cases of file-only and streaming/file correctly.

--
Simon Riggs http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

Attachments:

walrestore_process.v1.patchtext/x-patch; charset=US-ASCII; name=walrestore_process.v1.patchDownload

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ce659ec..e8b0b69 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -40,6 +40,7 @@
 #include "pgstat.h"
 #include "postmaster/bgwriter.h"
 #include "postmaster/startup.h"
+#include "postmaster/walrestore.h"
 #include "replication/walreceiver.h"
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
@@ -187,7 +188,6 @@ static bool InArchiveRecovery = false;
 static bool restoredFromArchive = false;
 
 /* options taken from recovery.conf for archive recovery */
-static char *recoveryRestoreCommand = NULL;
 static char *recoveryEndCommand = NULL;
 static char *archiveCleanupCommand = NULL;
 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
@@ -575,8 +575,8 @@ bool reachedConsistency = false;
 
 static bool InRedo = false;
 
-/* Have we launched bgwriter during recovery? */
-static bool bgwriterLaunched = false;
+/* Have we launched background procs during archive recovery yet? */
+static bool ArchRecoveryBgProcsActive = false;
 
 /*
  * Information logged when we detect a change in one of the parameters
@@ -632,8 +632,6 @@ static bool XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
 			 bool randAccess);
 static int	emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 static void XLogFileClose(void);
-static bool RestoreArchivedFile(char *path, const char *xlogfname,
-					const char *recovername, off_t expectedSize);
 static void ExecuteRecoveryCommand(char *command, char *commandName,
 					   bool failOnerror);
 static void PreallocXlogFiles(XLogRecPtr endptr);
@@ -2706,19 +2704,47 @@ XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
 
 	XLogFileName(xlogfname, tli, log, seg);
 
+#define TMPRECOVERYXLOG	"RECOVERYXLOG"
+
 	switch (source)
 	{
 		case XLOG_FROM_ARCHIVE:
+			/*
+			 * Check to see if the WALRestore process has already put the
+			 * next file in place while we were working. If so, use that.
+			 * If not, get it ourselves. This makes it easier to handle
+			 * initial state before the WALRestore is active, and also
+			 * handles the stop/start logic correctly when we have both
+			 * streaming and file based replication active.
+			 *
+			 * We queue up the next task for WALRestore after we've begun to
+			 * use this file later in XLogFileRead().
+			 *
+			 * If the WALRestore process is still active, the lock wait makes
+			 * us wait, which is just like we were executing the command
+			 * ourselves and so doesn't alter the logic elsewhere.
+			 */
+			if (XLogFileIsNowFullyRestored(tli, log, seg))
+			{
+				snprintf(path, MAXPGPATH, XLOGDIR "/%s", TMPRECOVERYXLOG);
+				restoredFromArchive = true;
+				break;
+			}
+
 			/* Report recovery progress in PS display */
 			snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
 					 xlogfname);
 			set_ps_display(activitymsg, false);
 
 			restoredFromArchive = RestoreArchivedFile(path, xlogfname,
-													  "RECOVERYXLOG",
+													  TMPRECOVERYXLOG,
 													  XLogSegSize);
+
 			if (!restoredFromArchive)
+			{
+				LWLockRelease(WALRestoreCommandLock);
 				return -1;
+			}
 			break;
 
 		case XLOG_FROM_PG_XLOG:
@@ -2748,18 +2774,42 @@ XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
 		if (stat(xlogfpath, &statbuf) == 0)
 		{
 			if (unlink(xlogfpath) != 0)
+			{
+				LWLockRelease(WALRestoreCommandLock);
 				ereport(FATAL,
 						(errcode_for_file_access(),
 						 errmsg("could not remove file \"%s\": %m",
 								xlogfpath)));
+			}
 			reload = true;
 		}
 
 		if (rename(path, xlogfpath) < 0)
+		{
+			LWLockRelease(WALRestoreCommandLock);
 			ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not rename file \"%s\" to \"%s\": %m",
 						path, xlogfpath)));
+		}
+
+		/*
+		 * Make sure we recover from the new filename, so we can reuse the
+		 * temporary filename for asynchronous restore actions.
+		 */
+		strcpy(path, xlogfpath);
+
+		/*
+		 * Tell the WALRestore process to get the next file now.
+		 * Hopefully it will be ready for use in time for the next call the
+		 * Startup process makes to XLogFileRead().
+		 *
+		 * It might seem like we should do that earlier but then there is a
+		 * race condition that might lead to replacing RECOVERYXLOG with
+		 * another file before we've copied it.
+		 */
+		SetNextWALRestoreLogSeg(tli, log, seg);
+		LWLockRelease(WALRestoreCommandLock);
 
 		/*
 		 * If the existing segment was replaced, since walsenders might have
@@ -2911,8 +2961,11 @@ XLogFileClose(void)
  * For fixed-size files, the caller may pass the expected size as an
  * additional crosscheck on successful recovery.  If the file size is not
  * known, set expectedSize = 0.
+ *
+ * Must be called with WALRestoreCommandLock held and must be held at exit,
+ * if the function returns.
  */
-static bool
+bool
 RestoreArchivedFile(char *path, const char *xlogfname,
 					const char *recovername, off_t expectedSize)
 {
@@ -2929,7 +2982,7 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	uint32		restartSeg;
 
 	/* In standby mode, restore_command might not be supplied */
-	if (recoveryRestoreCommand == NULL)
+	if (GetRecoveryRestoreCommand() == NULL)
 		goto not_available;
 
 	/*
@@ -2963,18 +3016,24 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	if (stat(xlogpath, &stat_buf) != 0)
 	{
 		if (errno != ENOENT)
+		{
+			LWLockRelease(WALRestoreCommandLock);
 			ereport(FATAL,
 					(errcode_for_file_access(),
 					 errmsg("could not stat file \"%s\": %m",
 							xlogpath)));
+		}
 	}
 	else
 	{
 		if (unlink(xlogpath) != 0)
+		{
+			LWLockRelease(WALRestoreCommandLock);
 			ereport(FATAL,
 					(errcode_for_file_access(),
 					 errmsg("could not remove file \"%s\": %m",
 							xlogpath)));
+		}
 	}
 
 	/*
@@ -3013,7 +3072,7 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	endp = xlogRestoreCmd + MAXPGPATH - 1;
 	*endp = '\0';
 
-	for (sp = recoveryRestoreCommand; *sp; sp++)
+	for (sp = GetRecoveryRestoreCommand(); *sp; sp++)
 	{
 		if (*sp == '%')
 		{
@@ -3059,21 +3118,29 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	}
 	*dp = '\0';
 
-	ereport(DEBUG3,
+	ereport(DEBUG2,
 			(errmsg_internal("executing restore command \"%s\"",
 							 xlogRestoreCmd)));
 
 	/*
-	 * Check signals before restore command and reset afterwards.
+	 * Set in_restore_command to tell the signal handler that we should exit
+	 * right away on SIGTERM. We know that we're at a safe point to do that.
+	 * Check if we had already received the signal, so that we don't miss a
+	 * shutdown request received just before this.
 	 */
-	PreRestoreCommand();
+	in_restore_command = true;
+	if (startup_shutdown_requested || walrestore_shutdown_requested)
+	{
+		LWLockRelease(WALRestoreCommandLock);
+		proc_exit(1);
+	}
 
 	/*
 	 * Copy xlog from archival storage to XLOGDIR
 	 */
 	rc = system(xlogRestoreCmd);
 
-	PostRestoreCommand();
+	in_restore_command = false;
 
 	if (rc == 0)
 	{
@@ -3102,7 +3169,10 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 				if (StandbyMode && stat_buf.st_size < expectedSize)
 					elevel = DEBUG1;
 				else
+				{
+					LWLockRelease(WALRestoreCommandLock);
 					elevel = FATAL;
+				}
 				ereport(elevel,
 						(errmsg("archive file \"%s\" has wrong size: %lu instead of %lu",
 								xlogfname,
@@ -3123,10 +3193,13 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 		{
 			/* stat failed */
 			if (errno != ENOENT)
+			{
+				LWLockRelease(WALRestoreCommandLock);
 				ereport(FATAL,
 						(errcode_for_file_access(),
 						 errmsg("could not stat file \"%s\": %m",
 								xlogpath)));
+			}
 		}
 	}
 
@@ -3158,10 +3231,18 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	 * too.
 	 */
 	if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
+	{
+		LWLockRelease(WALRestoreCommandLock);
 		proc_exit(1);
+	}
 
 	signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
 
+	/*
+	 * If signaled we will immediately issue a FATAL error so drop the lock
+	 */
+	if (signaled)
+		LWLockRelease(WALRestoreCommandLock);
 	ereport(signaled ? FATAL : DEBUG2,
 		(errmsg("could not restore file \"%s\" from archive: return code %d",
 				xlogfname, rc)));
@@ -4203,7 +4284,9 @@ readTimeLineHistory(TimeLineID targetTLI)
 	if (InArchiveRecovery)
 	{
 		TLHistoryFileName(histfname, targetTLI);
+		LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
 		RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
+		LWLockRelease(WALRestoreCommandLock);
 	}
 	else
 		TLHistoryFilePath(path, targetTLI);
@@ -4292,7 +4375,9 @@ existsTimeLineHistory(TimeLineID probeTLI)
 	if (InArchiveRecovery)
 	{
 		TLHistoryFileName(histfname, probeTLI);
+		LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
 		RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
+		LWLockRelease(WALRestoreCommandLock);
 	}
 	else
 		TLHistoryFilePath(path, probeTLI);
@@ -4453,7 +4538,9 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 	if (InArchiveRecovery)
 	{
 		TLHistoryFileName(histfname, parentTLI);
+		LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
 		RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
+		LWLockRelease(WALRestoreCommandLock);
 	}
 	else
 		TLHistoryFilePath(path, parentTLI);
@@ -5299,10 +5386,10 @@ readRecoveryCommandFile(void)
 	{
 		if (strcmp(item->name, "restore_command") == 0)
 		{
-			recoveryRestoreCommand = pstrdup(item->value);
+			SetRecoveryRestoreCommand(pstrdup(item->value));
 			ereport(DEBUG2,
 					(errmsg_internal("restore_command = '%s'",
-									 recoveryRestoreCommand)));
+									 GetRecoveryRestoreCommand())));
 		}
 		else if (strcmp(item->name, "recovery_end_command") == 0)
 		{
@@ -5455,7 +5542,7 @@ readRecoveryCommandFile(void)
 	 */
 	if (StandbyMode)
 	{
-		if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
+		if (PrimaryConnInfo == NULL && GetRecoveryRestoreCommand() == NULL)
 			ereport(WARNING,
 					(errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
 							RECOVERY_COMMAND_FILE),
@@ -5463,7 +5550,7 @@ readRecoveryCommandFile(void)
 	}
 	else
 	{
-		if (recoveryRestoreCommand == NULL)
+		if (GetRecoveryRestoreCommand() == NULL)
 			ereport(FATAL,
 					(errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
 							RECOVERY_COMMAND_FILE)));
@@ -6432,7 +6519,7 @@ StartupXLOG(void)
 			PublishStartupProcessInformation();
 			SetForwardFsyncRequests();
 			SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
-			bgwriterLaunched = true;
+			ArchRecoveryBgProcsActive = true;
 		}
 
 		/*
@@ -6795,7 +6882,7 @@ StartupXLOG(void)
 		 * the rule that TLI only changes in shutdown checkpoints, which
 		 * allows some extra error checking in xlog_redo.
 		 */
-		if (bgwriterLaunched)
+		if (ArchRecoveryBgProcsActive)
 			RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
 							  CHECKPOINT_IMMEDIATE |
 							  CHECKPOINT_WAIT);
@@ -9640,7 +9727,7 @@ XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
 		 * Request a restartpoint if we've replayed too much
 		 * xlog since the last one.
 		 */
-		if (StandbyMode && bgwriterLaunched)
+		if (StandbyMode && ArchRecoveryBgProcsActive)
 		{
 			if (XLogCheckpointNeeded(readId, readSeg))
 			{
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index e3ae92d..81a8cb3 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -30,6 +30,7 @@
 #include "nodes/makefuncs.h"
 #include "postmaster/bgwriter.h"
 #include "postmaster/startup.h"
+#include "postmaster/walrestore.h"
 #include "postmaster/walwriter.h"
 #include "replication/walreceiver.h"
 #include "storage/bufmgr.h"
@@ -319,6 +320,9 @@ AuxiliaryProcessMain(int argc, char *argv[])
 			case CheckpointerProcess:
 				statmsg = "checkpointer process";
 				break;
+			case WalRestoreProcess:
+				statmsg = "wal restore process";
+				break;
 			case WalWriterProcess:
 				statmsg = "wal writer process";
 				break;
@@ -424,6 +428,11 @@ AuxiliaryProcessMain(int argc, char *argv[])
 			CheckpointerMain();
 			proc_exit(1);		/* should never return */
 
+		case WalRestoreProcess:
+			/* don't set signals, wal restore has its own agenda */
+			WalRestoreMain();
+			proc_exit(1);		/* should never return */
+
 		case WalWriterProcess:
 			/* don't set signals, walwriter has its own agenda */
 			InitXLOGAccess();
diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile
index 3056b09..349e722 100644
--- a/src/backend/postmaster/Makefile
+++ b/src/backend/postmaster/Makefile
@@ -13,6 +13,6 @@ top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
 OBJS = autovacuum.o bgwriter.o fork_process.o pgarch.o pgstat.o postmaster.o \
-	startup.o syslogger.o walwriter.o checkpointer.o
+	startup.o syslogger.o walrestore.o walwriter.o checkpointer.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index ad0c17a..15684c0 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -210,6 +210,7 @@ static pid_t StartupPID = 0,
 			BgWriterPID = 0,
 			CheckpointerPID = 0,
 			WalWriterPID = 0,
+			WalRestorePID = 0,
 			WalReceiverPID = 0,
 			AutoVacPID = 0,
 			PgArchPID = 0,
@@ -470,6 +471,7 @@ static void ShmemBackendArrayRemove(Backend *bn);
 #define StartCheckpointer()		StartChildProcess(CheckpointerProcess)
 #define StartWalWriter()		StartChildProcess(WalWriterProcess)
 #define StartWalReceiver()		StartChildProcess(WalReceiverProcess)
+#define StartWalRestore()		StartChildProcess(WalRestoreProcess)
 
 /* Macros to check exit status of a child process */
 #define EXIT_STATUS_0(st)  ((st) == 0)
@@ -2060,6 +2062,8 @@ SIGHUP_handler(SIGNAL_ARGS)
 			signal_child(WalWriterPID, SIGHUP);
 		if (WalReceiverPID != 0)
 			signal_child(WalReceiverPID, SIGHUP);
+		if (WalRestorePID != 0)
+			signal_child(WalRestorePID, SIGHUP);
 		if (AutoVacPID != 0)
 			signal_child(AutoVacPID, SIGHUP);
 		if (PgArchPID != 0)
@@ -2170,6 +2174,8 @@ pmdie(SIGNAL_ARGS)
 				signal_child(StartupPID, SIGTERM);
 			if (WalReceiverPID != 0)
 				signal_child(WalReceiverPID, SIGTERM);
+			if (WalRestorePID != 0)
+				signal_child(WalRestorePID, SIGTERM);
 			if (BgWriterPID != 0)
 				signal_child(BgWriterPID, SIGTERM);
 			if (pmState == PM_RECOVERY)
@@ -2225,6 +2231,8 @@ pmdie(SIGNAL_ARGS)
 				signal_child(WalWriterPID, SIGQUIT);
 			if (WalReceiverPID != 0)
 				signal_child(WalReceiverPID, SIGQUIT);
+			if (WalRestorePID != 0)
+				signal_child(WalRestorePID, SIGQUIT);
 			if (AutoVacPID != 0)
 				signal_child(AutoVacPID, SIGQUIT);
 			if (PgArchPID != 0)
@@ -2331,6 +2339,12 @@ reaper(SIGNAL_ARGS)
 			pmState = PM_RUN;
 
 			/*
+			 * Shutdown the WALRestore process
+			 */
+			if (WalRestorePID != 0)
+				signal_child(WalRestorePID, SIGTERM);
+
+			/*
 			 * Kill any walsenders to force the downstream standby(s) to
 			 * reread the timeline history file, adjust their timelines and
 			 * establish replication connections again. This is required
@@ -2477,6 +2491,30 @@ reaper(SIGNAL_ARGS)
 		}
 
 		/*
+		 * Was it the wal restore?  If exit status is zero (normal) or one
+		 * (FATAL exit), we assume everything is all right just like normal
+		 * backends.
+		 */
+		if (pid == WalRestorePID)
+		{
+			if (pmState >= PM_RUN)
+			{
+				WalRestorePID = 0;
+				continue;
+			}
+
+			/*
+			 * Any unexpected exit (including FATAL exit) of the WALRestore
+			 * process is treated as a crash, except that we don't want to
+			 * reinitialize because availability is important.
+			 */
+			RecoveryError = true;
+			HandleChildCrash(pid, exitstatus,
+							 _("walrestore process"));
+			continue;
+		}
+
+		/*
 		 * Was it the autovacuum launcher?	Normal exit can be ignored; we'll
 		 * start a new one at the next iteration of the postmaster's main
 		 * loop, if necessary.	Any other exit condition is treated as a
@@ -2756,6 +2794,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 		signal_child(WalReceiverPID, (SendStop ? SIGSTOP : SIGQUIT));
 	}
 
+	/* Take care of the walrestore too */
+	if (pid == WalRestorePID)
+		WalRestorePID = 0;
+	else if (WalRestorePID != 0 && !FatalError)
+	{
+		ereport(DEBUG2,
+				(errmsg_internal("sending %s to process %d",
+								 (SendStop ? "SIGSTOP" : "SIGQUIT"),
+								 (int) WalRestorePID)));
+		signal_child(WalRestorePID, (SendStop ? SIGSTOP : SIGQUIT));
+	}
+
 	/* Take care of the autovacuum launcher too */
 	if (pid == AutoVacPID)
 		AutoVacPID = 0;
@@ -2916,6 +2966,8 @@ PostmasterStateMachine(void)
 				signal_child(StartupPID, SIGTERM);
 			if (WalReceiverPID != 0)
 				signal_child(WalReceiverPID, SIGTERM);
+			if (WalRestorePID != 0)
+				signal_child(WalRestorePID, SIGTERM);
 			pmState = PM_WAIT_BACKENDS;
 		}
 	}
@@ -2940,6 +2992,7 @@ PostmasterStateMachine(void)
 		if (CountChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC) == 0 &&
 			StartupPID == 0 &&
 			WalReceiverPID == 0 &&
+			WalRestorePID == 0 &&
 			BgWriterPID == 0 &&
 			(CheckpointerPID == 0 || !FatalError) &&
 			WalWriterPID == 0 &&
@@ -3005,11 +3058,11 @@ PostmasterStateMachine(void)
 		 * left by now anyway; what we're really waiting for is walsenders and
 		 * archiver.
 		 *
-		 * Walreceiver should normally be dead by now, but not when a fast
-		 * shutdown is performed during recovery.
+		 * Walreceiver and Walrestore should normally be dead by now, but not
+		 * when a fast shutdown is performed during recovery.
 		 */
 		if (PgArchPID == 0 && CountChildren(BACKEND_TYPE_ALL) == 0 &&
-			WalReceiverPID == 0)
+			WalReceiverPID == 0 && WalRestorePID == 0)
 		{
 			pmState = PM_WAIT_DEAD_END;
 		}
@@ -3036,6 +3089,7 @@ PostmasterStateMachine(void)
 			/* These other guys should be dead already */
 			Assert(StartupPID == 0);
 			Assert(WalReceiverPID == 0);
+			Assert(WalRestorePID == 0);
 			Assert(BgWriterPID == 0);
 			Assert(CheckpointerPID == 0);
 			Assert(WalWriterPID == 0);
@@ -4219,6 +4273,8 @@ sigusr1_handler(SIGNAL_ARGS)
 		BgWriterPID = StartBackgroundWriter();
 		Assert(CheckpointerPID == 0);
 		CheckpointerPID = StartCheckpointer();
+		Assert(WalRestorePID == 0);
+		WalRestorePID = StartWalRestore();
 
 		pmState = PM_RECOVERY;
 	}
diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c
index ed75d09..1791feb 100644
--- a/src/backend/postmaster/startup.c
+++ b/src/backend/postmaster/startup.c
@@ -35,14 +35,14 @@
  * Flags set by interrupt handlers for later service in the redo loop.
  */
 static volatile sig_atomic_t got_SIGHUP = false;
-static volatile sig_atomic_t shutdown_requested = false;
 static volatile sig_atomic_t promote_triggered = false;
+volatile sig_atomic_t startup_shutdown_requested = false;
 
 /*
  * Flag set when executing a restore command, to tell SIGTERM signal handler
  * that it's safe to just proc_exit.
  */
-static volatile sig_atomic_t in_restore_command = false;
+volatile sig_atomic_t in_restore_command = false;
 
 /* Signal handlers */
 static void startupproc_quickdie(SIGNAL_ARGS);
@@ -131,9 +131,16 @@ StartupProcShutdownHandler(SIGNAL_ARGS)
 	int			save_errno = errno;
 
 	if (in_restore_command)
+	{
+		/*
+		 * See RestoreArchivedFile() for explanation of why this
+		 * lock is always held when in_restore_command is true.
+		 */
+		LWLockRelease(WALRestoreCommandLock);
 		proc_exit(1);
+	}
 	else
-		shutdown_requested = true;
+		startup_shutdown_requested = true;
 	WakeupRecovery();
 
 	errno = save_errno;
@@ -155,7 +162,7 @@ HandleStartupProcInterrupts(void)
 	/*
 	 * Check if we were requested to exit without finishing recovery.
 	 */
-	if (shutdown_requested)
+	if (startup_shutdown_requested)
 		proc_exit(1);
 
 	/*
@@ -226,26 +233,6 @@ StartupProcessMain(void)
 	proc_exit(0);
 }
 
-void
-PreRestoreCommand(void)
-{
-	/*
-	 * Set in_restore_command to tell the signal handler that we should exit
-	 * right away on SIGTERM. We know that we're at a safe point to do that.
-	 * Check if we had already received the signal, so that we don't miss a
-	 * shutdown request received just before this.
-	 */
-	in_restore_command = true;
-	if (shutdown_requested)
-		proc_exit(1);
-}
-
-void
-PostRestoreCommand(void)
-{
-	in_restore_command = false;
-}
-
 bool
 IsPromoteTriggered(void)
 {
diff --git a/src/backend/postmaster/walrestore.c b/src/backend/postmaster/walrestore.c
new file mode 100644
index 0000000..7634d36
--- /dev/null
+++ b/src/backend/postmaster/walrestore.c
@@ -0,0 +1,474 @@
+/*-------------------------------------------------------------------------
+ *
+ * walrestore.c
+ *
+ * The WAL restore process is new as of Postgres 9.2, though the work it performs
+ * has been handled by the startup process from Postgres 8.0 until 9.1.
+ *
+ * WALRestore process executes the restore_command. If not set, it sleeps.
+ * The startup process no longer executes the restore_command and knows
+ * little about where the WAL files have come from.
+ *
+ * The WAL restore process is started by the postmaster when we enter
+ * PM_RECOVERY state and exits immediately after startup finishes.
+ * It remains alive until the postmaster commands it to terminate.
+ * Normal termination is by SIGTERM, which instructs restore process to exit(0).
+ * Like any backend, restore process will simply abort and exit on SIGQUIT.
+ *
+ * Note that the WAL restore process only executes the restore_command.
+ * The archive_cleanup_command is exeuted by the checkpointer, while the
+ * recovery_end_command and requests for history files are executed by the
+ * startup process. That is not important to the way those commands execute.
+ * All processes that use the restore_command must hold WALRestoreCommandLock
+ * before they execute it, since we definitely wish to avoid trying to get the
+ * same file more than once concurrently, plus we can't assume that the
+ * user has specified command that would succeed if run concurrently.
+ *
+ * If the WAL restore exits unexpectedly, the postmaster treats that the same
+ * as a backend crash: shared memory may be corrupted, so remaining backends
+ * should be killed by SIGQUIT and then a recovery cycle started.
+ *
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/postmaster/walrestore.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <sys/time.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "access/xlog_internal.h"
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/startup.h"
+#include "postmaster/walrestore.h"
+#include "storage/bufmgr.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/pmsignal.h"
+#include "storage/shmem.h"
+#include "utils/guc.h"
+#include "utils/ps_status.h"
+#include "utils/timestamp.h"
+
+/* XXX Set to DEBUG4 prior to patch commit */
+#define WALRSTR_DEBUG_LEVEL 		LOG
+
+/*
+ * GUC parameters
+ */
+int	WalRestoreDelay = 10000;
+
+WalRestoreData *WalRstr = NULL;
+
+/*
+ * Flags set by interrupt handlers for later service in the main loop.
+ */
+static volatile sig_atomic_t got_SIGHUP = false;
+volatile sig_atomic_t walrestore_shutdown_requested = false;
+
+/* Prototypes for private functions */
+
+static bool WalRestoreNextFile(void);
+
+/* Signal handlers */
+
+static void walrestore_quickdie(SIGNAL_ARGS);
+static void WalRestoreProcSigUsr1Handler(SIGNAL_ARGS);
+static void WalRestoreSigHupHandler(SIGNAL_ARGS);
+static void WalRestoreShutdownHandler(SIGNAL_ARGS);
+
+
+/*
+ * Main entry point for walrestore process
+ *
+ * This is invoked from BootstrapMain, which has already created the basic
+ * execution environment, but not enabled signals yet.
+ */
+void
+WalRestoreMain(void)
+{
+	/* use volatile pointer to prevent code rearrangement */
+	volatile WalRestoreData *walrstr = WalRstr;
+
+	/*
+	 * WalRstr should be set up already (if we are a backend, we inherit this
+	 * by fork() or EXEC_BACKEND mechanism from the postmaster).
+	 */
+	Assert(walrstr != NULL);
+
+	InitLatch(&walrstr->WALRestoreLatch); /* initialize latch used in main loop */
+
+	/*
+	 * If possible, make this process a group leader, so that the postmaster
+	 * can signal any child processes too.
+	 */
+#ifdef HAVE_SETSID
+	if (setsid() < 0)
+		elog(FATAL, "setsid() failed: %m");
+#endif
+
+	/*
+	 * Properly accept or ignore signals the postmaster might send us
+	 *
+	 * SIGUSR1 is presently unused; keep it spare in case someday we want this
+	 * process to participate in ProcSignal signalling.
+	 */
+	pqsignal(SIGHUP, WalRestoreSigHupHandler);	/* set flag to read config file */
+	pqsignal(SIGINT, SIG_IGN);
+	pqsignal(SIGTERM, WalRestoreShutdownHandler); 	/* shutdown */
+	pqsignal(SIGQUIT, walrestore_quickdie);		/* hard crash time */
+	pqsignal(SIGALRM, SIG_IGN);
+	pqsignal(SIGPIPE, SIG_IGN);
+	pqsignal(SIGUSR1, WalRestoreProcSigUsr1Handler);	/* reserve for ProcSignal */
+	pqsignal(SIGUSR2, SIG_IGN);
+
+	/*
+	 * Reset some signals that are accepted by postmaster but not here
+	 */
+	pqsignal(SIGCHLD, SIG_DFL);
+	pqsignal(SIGTTIN, SIG_DFL);
+	pqsignal(SIGTTOU, SIG_DFL);
+	pqsignal(SIGCONT, SIG_DFL);
+	pqsignal(SIGWINCH, SIG_DFL);
+
+	/* We allow SIGQUIT (quickdie) at all times */
+	sigdelset(&BlockSig, SIGQUIT);
+
+	/*
+	 * Unblock signals (they were blocked when the postmaster forked us)
+	 */
+	PG_SETMASK(&UnBlockSig);
+
+	/*
+	 * Loop forever
+	 */
+	for (;;)
+	{
+		ResetLatch(&walrstr->WALRestoreLatch);
+
+		/*
+		 * Emergency bailout if postmaster has died.  This is to avoid the
+		 * necessity for manual cleanup of all postmaster children.
+		 */
+		if (!PostmasterIsAlive())
+			exit(1);
+
+		if (got_SIGHUP)
+		{
+			got_SIGHUP = false;
+			ProcessConfigFile(PGC_SIGHUP);
+		}
+
+		if (walrestore_shutdown_requested)
+		{
+			/*
+			 * From here on, elog(ERROR) should end with exit(1), not send
+			 * control back to the sigsetjmp block above
+			 */
+			ExitOnAnyError = true;
+			/* Normal exit from the walwriter is here */
+			proc_exit(0);		/* done */
+		}
+
+		/*
+		 * Keep restoring as long as there are files to process and we have
+		 * not exceeded wal_keep_files
+		 */
+		if (!WalRestoreNextFile())
+		{
+		(void) WaitLatch(&walrstr->WALRestoreLatch,
+							   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+							   WalRestoreDelay /* ms */);
+		}
+	}
+}
+
+/*
+ * SetNextWALRestoreLogSeg - set the target for next WALrestore cycle
+ *
+ * Only called by Startup process
+ *
+ * Must be called with WALRestoreCommandLock held and must be held at exit,
+ * if the function returns.
+ */
+void
+SetNextWALRestoreLogSeg(TimeLineID tli, uint32 log, uint32 seg)
+{
+	char		xlogfname[MAXFNAMELEN];
+	uint32		newlog = log;
+	uint32		newseg = seg;
+
+	NextLogSeg(newlog, newseg);
+
+	XLogFileName(xlogfname, tli, newlog, newseg);
+	elog(WALRSTR_DEBUG_LEVEL, "requesting restore of %s", xlogfname);
+
+	{
+		/* use volatile pointer to prevent code rearrangement */
+		volatile WalRestoreData *walrstr = WalRstr;
+
+		walrstr->nextFileTli = tli;
+		walrstr->nextFileLog = newlog;
+		walrstr->nextFileSeg = newseg;
+	}
+
+	SetLatch(&WalRstr->WALRestoreLatch);
+}
+
+/*
+ * Run in Startup process to see if next file has arrived. We protect
+ * WalRstr with a LWlock so that the Startup process will wait until
+ * the restore_command succeeds or is cancelled. We set interrupt flags
+ * as if we were running the restore_command ourselves; there is no
+ * difference.
+ *
+ * WALRestoreCommandLock is not held on entry, but will be held at exit.
+ */
+bool
+XLogFileIsNowFullyRestored(TimeLineID tli, uint32 log, uint32 seg)
+{
+	/* use volatile pointer to prevent code rearrangement */
+	volatile WalRestoreData *walrstr = WalRstr;
+	char		xlogfname[MAXFNAMELEN];
+
+	/*
+	 * Issue debug message before we wait for the lock, to allow
+	 * log entries to show interleaving of Startup and WALRestore actions
+	 */
+	XLogFileName(xlogfname, tli, log, seg);
+	elog(WALRSTR_DEBUG_LEVEL,
+			"startup process requests %s from archive", xlogfname);
+
+	LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
+
+	XLogFileName(xlogfname,
+					walrstr->lastFileTli,
+					walrstr->lastFileLog,
+					walrstr->lastFileSeg);
+	elog(WALRSTR_DEBUG_LEVEL,
+			"startup process sees last file was %s", xlogfname);
+
+	if (tli == walrstr->lastFileTli &&
+		log == walrstr->lastFileLog &&
+		seg == walrstr->lastFileSeg)
+		return true;
+
+	return false;
+}
+
+/*
+ * WalRestoreNextFile - returns true if next file was restored
+ *
+ * Broadly follows the logic in XLogFileRead() when called with source of
+ * XLOG_FROM_ARCHIVE, except we have to read the next file from shmem.
+ */
+static bool
+WalRestoreNextFile(void)
+{
+	/* use volatile pointer to prevent code rearrangement */
+	volatile WalRestoreData *walrstr = WalRstr;
+	char		xlogfname[MAXFNAMELEN];
+	char		activitymsg[MAXFNAMELEN + 16];
+	char		path[MAXPGPATH];
+	bool		restoredFromArchive;
+	uint32		nextFileLog;
+	uint32		nextFileSeg;
+	TimeLineID	nextFileTli;
+
+	elog(WALRSTR_DEBUG_LEVEL, "walrestore checking for next file to restore");
+
+	LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
+
+	{
+		/* use volatile pointer to prevent code rearrangement */
+		volatile WalRestoreData *walrstr = WalRstr;
+
+		nextFileTli = walrstr->nextFileTli;
+		nextFileLog = walrstr->nextFileLog;
+		nextFileSeg = walrstr->nextFileSeg;
+	}
+
+	/*
+	 * If we aren't being requested to restore a file exit quickly.
+	 */
+	if (nextFileTli == walrstr->lastFileTli &&
+			nextFileLog == walrstr->lastFileLog &&
+			nextFileSeg == walrstr->lastFileSeg)
+	{
+		LWLockRelease(WALRestoreCommandLock);
+		XLogFileName(xlogfname, nextFileTli, nextFileLog, nextFileSeg);
+		elog(WALRSTR_DEBUG_LEVEL,
+				"restore of %s is already complete, so sleep", xlogfname);
+		return false;
+	}
+
+	XLogFileName(xlogfname, nextFileTli, nextFileLog, nextFileSeg);
+
+	/* Report recovery progress in PS display */
+	snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
+			 xlogfname);
+	set_ps_display(activitymsg, false);
+
+	elog(WALRSTR_DEBUG_LEVEL, "walrestore will restore %s", xlogfname);
+
+	restoredFromArchive = RestoreArchivedFile(path, xlogfname,
+											  "RECOVERYXLOG",
+											  XLogSegSize);
+
+	if (restoredFromArchive)
+	{
+		/* use volatile pointer to prevent code rearrangement */
+		volatile WalRestoreData *walrstr = WalRstr;
+
+		walrstr->lastFileTli = nextFileTli;
+		walrstr->lastFileLog = nextFileLog;
+		walrstr->lastFileSeg = nextFileSeg;
+		walrstr->lastFileRestoreTime = GetCurrentTimestamp();
+	}
+
+	LWLockRelease(WALRestoreCommandLock);
+
+	set_ps_display("", false);
+
+	/*
+	 * Make sure Startup process is active so it can see new file, or
+	 * react to it not being there.
+	 */
+	WakeupRecovery();
+
+	return restoredFromArchive;
+}
+
+void
+SetRecoveryRestoreCommand(char *cmd)
+{
+	if (cmd == NULL)
+		return;
+
+	if (strlen(cmd) <= MAXPGPATH)
+		strcpy(WalRstr->recoveryRestoreCommand, cmd);
+	else
+		elog(FATAL, "recovery_restore_command is too long");
+}
+
+char *
+GetRecoveryRestoreCommand(void)
+{
+	return WalRstr->recoveryRestoreCommand;
+}
+
+/* Report shared memory space needed by WalRestoreShmemInit */
+Size
+WalRestoreShmemSize(void)
+{
+	Size		size = 0;
+
+	size = add_size(size, sizeof(WalRestoreData));
+
+	return size;
+}
+
+/* Allocate and initialize walrestore-related shared memory */
+void
+WalRestoreShmemInit(void)
+{
+	bool		found;
+
+	WalRstr = (WalRestoreData *)
+		ShmemInitStruct("Wal Restore Ctl", WalRestoreShmemSize(), &found);
+
+	if (found)
+		return;
+
+	/* First time through, so initialize */
+	MemSet(WalRstr, 0, WalRestoreShmemSize());
+	InitSharedLatch(&WalRstr->WALRestoreLatch);
+
+}
+
+/* --------------------------------
+ *		signal handler routines
+ * --------------------------------
+ */
+
+/*
+ * walrestore_quickdie() occurs when signalled SIGQUIT by the postmaster.
+ *
+ * Some backend has bought the farm,
+ * so we need to stop what we're doing and exit.
+ */
+static void
+walrestore_quickdie(SIGNAL_ARGS)
+{
+	PG_SETMASK(&BlockSig);
+
+	/*
+	 * We DO NOT want to run proc_exit() callbacks -- we're here because
+	 * shared memory may be corrupted, so we don't want to try to clean up our
+	 * transaction.  Just nail the windows shut and get out of town.  Now that
+	 * there's an atexit callback to prevent third-party code from breaking
+	 * things by calling exit() directly, we have to reset the callbacks
+	 * explicitly to make this work as intended.
+	 */
+	on_exit_reset();
+
+	/*
+	 * Note we do exit(2) not exit(0).	This is to force the postmaster into a
+	 * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
+	 * backend.  This is necessary precisely because we don't clean up our
+	 * shared memory state.  (The "dead man switch" mechanism in pmsignal.c
+	 * should ensure the postmaster sees this as a crash, too, but no harm in
+	 * being doubly sure.)
+	 */
+	exit(2);
+}
+
+/* SIGUSR1: let latch facility handle the signal */
+static void
+WalRestoreProcSigUsr1Handler(SIGNAL_ARGS)
+{
+	int			save_errno = errno;
+
+	latch_sigusr1_handler();
+
+	errno = save_errno;
+}
+
+/* SIGHUP: set flag to re-read config file at next convenient time */
+static void
+WalRestoreSigHupHandler(SIGNAL_ARGS)
+{
+	int			save_errno = errno;
+
+	got_SIGHUP = true;
+	SetLatch(&WalRstr->WALRestoreLatch);
+
+	errno = save_errno;
+}
+
+/* SIGTERM: set flag to shutdown and exit */
+static void
+WalRestoreShutdownHandler(SIGNAL_ARGS)
+{
+	int			save_errno = errno;
+
+	if (in_restore_command)
+	{
+		LWLockRelease(WALRestoreCommandLock);
+		proc_exit(1);
+	}
+	else
+		walrestore_shutdown_requested = true;
+	SetLatch(&WalRstr->WALRestoreLatch);
+
+	errno = save_errno;
+}
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index ef1dc91..8f4443a 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -26,6 +26,7 @@
 #include "postmaster/autovacuum.h"
 #include "postmaster/bgwriter.h"
 #include "postmaster/postmaster.h"
+#include "postmaster/walrestore.h"
 #include "replication/walreceiver.h"
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
@@ -123,6 +124,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
 		size = add_size(size, AutoVacuumShmemSize());
 		size = add_size(size, WalSndShmemSize());
 		size = add_size(size, WalRcvShmemSize());
+		size = add_size(size, WalRestoreShmemSize());
 		size = add_size(size, BTreeShmemSize());
 		size = add_size(size, SyncScanShmemSize());
 		size = add_size(size, AsyncShmemSize());
@@ -228,6 +230,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
 	AutoVacuumShmemInit();
 	WalSndShmemInit();
 	WalRcvShmemInit();
+	WalRestoreShmemInit();
 
 	/*
 	 * Set up other modules that need some shared memory space
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 1ddf4bf..e9e5325 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -270,7 +270,8 @@ extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
 extern int XLogFileInit(uint32 log, uint32 seg,
 			 bool *use_existent, bool use_lock);
 extern int	XLogFileOpen(uint32 log, uint32 seg);
-
+extern bool RestoreArchivedFile(char *path, const char *xlogfname,
+					const char *recovername, off_t expectedSize);
 
 extern void XLogGetLastRemoved(uint32 *log, uint32 *seg);
 extern void XLogSetAsyncXactLSN(XLogRecPtr record);
@@ -316,6 +317,7 @@ extern TimeLineID GetRecoveryTargetTLI(void);
 extern bool CheckPromoteSignal(void);
 extern void WakeupRecovery(void);
 extern Latch *WALWriterLatch(void);
+extern Latch *WALRestoreLatch(void);
 
 /*
  * Starting/stopping a base backup
diff --git a/src/include/bootstrap/bootstrap.h b/src/include/bootstrap/bootstrap.h
index e966a73..b90ce33 100644
--- a/src/include/bootstrap/bootstrap.h
+++ b/src/include/bootstrap/bootstrap.h
@@ -23,6 +23,7 @@ typedef enum
 	StartupProcess,
 	BgWriterProcess,
 	CheckpointerProcess,
+	WalRestoreProcess,
 	WalWriterProcess,
 	WalReceiverProcess,
 
diff --git a/src/include/postmaster/startup.h b/src/include/postmaster/startup.h
index 3ec6950..35d9665 100644
--- a/src/include/postmaster/startup.h
+++ b/src/include/postmaster/startup.h
@@ -12,10 +12,11 @@
 #ifndef _STARTUP_H
 #define _STARTUP_H
 
+extern volatile sig_atomic_t startup_shutdown_requested;
+extern volatile sig_atomic_t in_restore_command;
+
 extern void HandleStartupProcInterrupts(void);
 extern void StartupProcessMain(void);
-extern void PreRestoreCommand(void);
-extern void PostRestoreCommand(void);
 extern bool IsPromoteTriggered(void);
 extern void ResetPromoteTriggered(void);
 
diff --git a/src/include/postmaster/walrestore.h b/src/include/postmaster/walrestore.h
new file mode 100644
index 0000000..98d7830
--- /dev/null
+++ b/src/include/postmaster/walrestore.h
@@ -0,0 +1,72 @@
+/*-------------------------------------------------------------------------
+ *
+ * walrestore.h
+ *	  Exports from postmaster/walrestore.c.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ *
+ * src/include/postmaster/walrestore.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _WALRESTORE_H
+#define _WALRESTORE_H
+
+#include "access/xlog.h"
+#include "access/xlogdefs.h"
+#include "storage/spin.h"
+#include "pgtime.h"
+
+extern volatile sig_atomic_t walrestore_shutdown_requested;
+
+/* GUC options */
+
+extern void WalRestoreMain(void);
+extern bool XLogFileIsNowFullyRestored(TimeLineID tli, uint32 log, uint32 seg);
+extern void SetNextWALRestoreLogSeg(TimeLineID tli, uint32 log, uint32 seg);
+extern void SetRecoveryRestoreCommand(char *cmd);
+extern char *GetRecoveryRestoreCommand(void);
+extern Size WalRestoreShmemSize(void);
+extern void WalRestoreShmemInit(void);
+
+/* Shared memory area for management of walrestore process */
+typedef struct
+{
+	/*
+	 * The identifiers of the last WAL file restored by WALrestore
+	 */
+	TimeLineID	lastFileTli;
+	uint32		lastFileLog;
+	uint32		lastFileSeg;
+
+	/*
+	 * Time of last restore by WALrestore
+	 */
+	TimestampTz lastFileRestoreTime;
+
+	/*
+	 * The next WAL file requested for the WALrestore process to restore
+	 */
+	TimeLineID	nextFileTli;
+	uint32		nextFileLog;
+	uint32		nextFileSeg;
+
+	/*
+	 * All of the above read and set only while holding WALRestoreCommandLock
+	 */
+
+	/*
+	 * WALRestoreLatch is used to wake up the WALRestore to restore WAL files.
+	 */
+	Latch		WALRestoreLatch;
+
+	/*
+	 * recoveryRestoreCommand for use by walrestore; can remove if becomes GUC
+	 * Set once at startup and read-only after that
+	 */
+	char		recoveryRestoreCommand[MAXPGPATH];
+} WalRestoreData;
+
+extern WalRestoreData *WalRstr;
+
+#endif   /* _WALRESTORE_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index df3df29..c316dcc 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -79,6 +79,7 @@ typedef enum LWLockId
 	SerializablePredicateLockListLock,
 	OldSerXidLock,
 	SyncRepLock,
+	WALRestoreCommandLock,
 	/* Individual lock IDs end here */
 	FirstBufMappingLock,
 	FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS,

Fujii Masao

masao.fujii@gmail.com

almost 14 years ago

In reply to: Simon Riggs (#1)

Re: WAL Restore process during recovery

On Mon, Jan 16, 2012 at 2:06 AM, Simon Riggs <simon@2ndquadrant.com> wrote:

WALRestore process asynchronously executes restore_command while
recovery continues working.

Overlaps downloading of next WAL file to reduce time delays in file
based archive recovery.

Handles cases of file-only and streaming/file correctly.

Though I've not reviewed the patch deeply yet, I observed the following
two problems when I tested the patch.

When I set up streaming replication + archive (i.e., restore_command is set)
and started the standby, I got the following error:

FATAL: all AuxiliaryProcs are in use
LOG: walrestore process (PID 18839) exited with exit code 1

When I started an archive recovery without setting restore_command,
it successfully finished.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

Simon Riggs

simon@2ndQuadrant.com

almost 14 years ago

In reply to: Fujii Masao (#2)

Re: WAL Restore process during recovery

On Tue, Jan 17, 2012 at 6:52 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Mon, Jan 16, 2012 at 2:06 AM, Simon Riggs <simon@2ndquadrant.com> wrote:

WALRestore process asynchronously executes restore_command while
recovery continues working.

Overlaps downloading of next WAL file to reduce time delays in file
based archive recovery.

Handles cases of file-only and streaming/file correctly.

Though I've not reviewed the patch deeply yet, I observed the following
two problems when I tested the patch.

When I set up streaming replication + archive (i.e., restore_command is set)
and started the standby, I got the following error:

FATAL: all AuxiliaryProcs are in use
LOG: walrestore process (PID 18839) exited with exit code 1

When I started an archive recovery without setting restore_command,
it successfully finished.

Oh, I did have NUM_AUXILIARY_PROCS increased at one point, but I
"realised" it wasn't needed and removed it. Will change that. Thanks.

--
Simon Riggs http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

Simon Riggs

simon@2ndQuadrant.com

almost 14 years ago

In reply to: Fujii Masao (#2)

1 attachment(s)

Re: WAL Restore process during recovery

On Tue, Jan 17, 2012 at 6:52 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Mon, Jan 16, 2012 at 2:06 AM, Simon Riggs <simon@2ndquadrant.com> wrote:

WALRestore process asynchronously executes restore_command while
recovery continues working.

Overlaps downloading of next WAL file to reduce time delays in file
based archive recovery.

Handles cases of file-only and streaming/file correctly.

Though I've not reviewed the patch deeply yet, I observed the following
two problems when I tested the patch.

When I set up streaming replication + archive (i.e., restore_command is set)
and started the standby, I got the following error:

FATAL: all AuxiliaryProcs are in use
LOG: walrestore process (PID 18839) exited with exit code 1

Fixed and better documented.

When I started an archive recovery without setting restore_command,
it successfully finished.

Not sure exactly what you mean, but I fixed a bug that might be
something you're seeing.

--
Simon Riggs http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

Attachments:

walrestore_process.v2.patchtext/x-patch; charset=US-ASCII; name=walrestore_process.v2.patchDownload

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ce659ec..469e6d6 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -40,6 +40,7 @@
 #include "pgstat.h"
 #include "postmaster/bgwriter.h"
 #include "postmaster/startup.h"
+#include "postmaster/walrestore.h"
 #include "replication/walreceiver.h"
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
@@ -187,7 +188,6 @@ static bool InArchiveRecovery = false;
 static bool restoredFromArchive = false;
 
 /* options taken from recovery.conf for archive recovery */
-static char *recoveryRestoreCommand = NULL;
 static char *recoveryEndCommand = NULL;
 static char *archiveCleanupCommand = NULL;
 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
@@ -575,8 +575,8 @@ bool reachedConsistency = false;
 
 static bool InRedo = false;
 
-/* Have we launched bgwriter during recovery? */
-static bool bgwriterLaunched = false;
+/* Have we launched background procs during archive recovery yet? */
+static bool ArchRecoveryBgProcsActive = false;
 
 /*
  * Information logged when we detect a change in one of the parameters
@@ -632,8 +632,6 @@ static bool XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
 			 bool randAccess);
 static int	emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 static void XLogFileClose(void);
-static bool RestoreArchivedFile(char *path, const char *xlogfname,
-					const char *recovername, off_t expectedSize);
 static void ExecuteRecoveryCommand(char *command, char *commandName,
 					   bool failOnerror);
 static void PreallocXlogFiles(XLogRecPtr endptr);
@@ -2706,19 +2704,47 @@ XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
 
 	XLogFileName(xlogfname, tli, log, seg);
 
+#define TMPRECOVERYXLOG	"RECOVERYXLOG"
+
 	switch (source)
 	{
 		case XLOG_FROM_ARCHIVE:
+			/*
+			 * Check to see if the WALRestore process has already put the
+			 * next file in place while we were working. If so, use that.
+			 * If not, get it ourselves. This makes it easier to handle
+			 * initial state before the WALRestore is active, and also
+			 * handles the stop/start logic correctly when we have both
+			 * streaming and file based replication active.
+			 *
+			 * We queue up the next task for WALRestore after we've begun to
+			 * use this file later in XLogFileRead().
+			 *
+			 * If the WALRestore process is still active, the lock wait makes
+			 * us wait, which is just like we were executing the command
+			 * ourselves and so doesn't alter the logic elsewhere.
+			 */
+			if (XLogFileIsNowFullyRestored(tli, log, seg))
+			{
+				snprintf(path, MAXPGPATH, XLOGDIR "/%s", TMPRECOVERYXLOG);
+				restoredFromArchive = true;
+				break;
+			}
+
 			/* Report recovery progress in PS display */
 			snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
 					 xlogfname);
 			set_ps_display(activitymsg, false);
 
 			restoredFromArchive = RestoreArchivedFile(path, xlogfname,
-													  "RECOVERYXLOG",
+													  TMPRECOVERYXLOG,
 													  XLogSegSize);
+
 			if (!restoredFromArchive)
+			{
+				LWLockRelease(WALRestoreCommandLock);
 				return -1;
+			}
 			break;
 
 		case XLOG_FROM_PG_XLOG:
@@ -2748,18 +2774,42 @@ XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
 		if (stat(xlogfpath, &statbuf) == 0)
 		{
 			if (unlink(xlogfpath) != 0)
+			{
+				LWLockRelease(WALRestoreCommandLock);
 				ereport(FATAL,
 						(errcode_for_file_access(),
 						 errmsg("could not remove file \"%s\": %m",
 								xlogfpath)));
+			}
 			reload = true;
 		}
 
 		if (rename(path, xlogfpath) < 0)
+		{
+			LWLockRelease(WALRestoreCommandLock);
 			ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not rename file \"%s\" to \"%s\": %m",
 						path, xlogfpath)));
+		}
+
+		/*
+		 * Make sure we recover from the new filename, so we can reuse the
+		 * temporary filename for asynchronous restore actions.
+		 */
+		strcpy(path, xlogfpath);
+
+		/*
+		 * Tell the WALRestore process to get the next file now.
+		 * Hopefully it will be ready for use in time for the next call the
+		 * Startup process makes to XLogFileRead().
+		 *
+		 * It might seem like we should do that earlier but then there is a
+		 * race condition that might lead to replacing RECOVERYXLOG with
+		 * another file before we've copied it.
+		 */
+		SetNextWALRestoreLogSeg(tli, log, seg);
+		LWLockRelease(WALRestoreCommandLock);
 
 		/*
 		 * If the existing segment was replaced, since walsenders might have
@@ -2911,8 +2961,11 @@ XLogFileClose(void)
  * For fixed-size files, the caller may pass the expected size as an
  * additional crosscheck on successful recovery.  If the file size is not
  * known, set expectedSize = 0.
+ *
+ * Must be called with WALRestoreCommandLock held and must be held at exit,
+ * if the function returns.
  */
-static bool
+bool
 RestoreArchivedFile(char *path, const char *xlogfname,
 					const char *recovername, off_t expectedSize)
 {
@@ -2929,7 +2982,7 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	uint32		restartSeg;
 
 	/* In standby mode, restore_command might not be supplied */
-	if (recoveryRestoreCommand == NULL)
+	if (strlen(GetRecoveryRestoreCommand()) == 0)
 		goto not_available;
 
 	/*
@@ -2963,18 +3016,24 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	if (stat(xlogpath, &stat_buf) != 0)
 	{
 		if (errno != ENOENT)
+		{
+			LWLockRelease(WALRestoreCommandLock);
 			ereport(FATAL,
 					(errcode_for_file_access(),
 					 errmsg("could not stat file \"%s\": %m",
 							xlogpath)));
+		}
 	}
 	else
 	{
 		if (unlink(xlogpath) != 0)
+		{
+			LWLockRelease(WALRestoreCommandLock);
 			ereport(FATAL,
 					(errcode_for_file_access(),
 					 errmsg("could not remove file \"%s\": %m",
 							xlogpath)));
+		}
 	}
 
 	/*
@@ -3013,7 +3072,7 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	endp = xlogRestoreCmd + MAXPGPATH - 1;
 	*endp = '\0';
 
-	for (sp = recoveryRestoreCommand; *sp; sp++)
+	for (sp = GetRecoveryRestoreCommand(); *sp; sp++)
 	{
 		if (*sp == '%')
 		{
@@ -3059,21 +3118,29 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	}
 	*dp = '\0';
 
-	ereport(DEBUG3,
+	ereport(DEBUG2,
 			(errmsg_internal("executing restore command \"%s\"",
 							 xlogRestoreCmd)));
 
 	/*
-	 * Check signals before restore command and reset afterwards.
+	 * Set in_restore_command to tell the signal handler that we should exit
+	 * right away on SIGTERM. We know that we're at a safe point to do that.
+	 * Check if we had already received the signal, so that we don't miss a
+	 * shutdown request received just before this.
 	 */
-	PreRestoreCommand();
+	in_restore_command = true;
+	if (startup_shutdown_requested || walrestore_shutdown_requested)
+	{
+		LWLockRelease(WALRestoreCommandLock);
+		proc_exit(1);
+	}
 
 	/*
 	 * Copy xlog from archival storage to XLOGDIR
 	 */
 	rc = system(xlogRestoreCmd);
 
-	PostRestoreCommand();
+	in_restore_command = false;
 
 	if (rc == 0)
 	{
@@ -3102,7 +3169,10 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 				if (StandbyMode && stat_buf.st_size < expectedSize)
 					elevel = DEBUG1;
 				else
+				{
+					LWLockRelease(WALRestoreCommandLock);
 					elevel = FATAL;
+				}
 				ereport(elevel,
 						(errmsg("archive file \"%s\" has wrong size: %lu instead of %lu",
 								xlogfname,
@@ -3123,10 +3193,13 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 		{
 			/* stat failed */
 			if (errno != ENOENT)
+			{
+				LWLockRelease(WALRestoreCommandLock);
 				ereport(FATAL,
 						(errcode_for_file_access(),
 						 errmsg("could not stat file \"%s\": %m",
 								xlogpath)));
+			}
 		}
 	}
 
@@ -3158,10 +3231,18 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	 * too.
 	 */
 	if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
+	{
+		LWLockRelease(WALRestoreCommandLock);
 		proc_exit(1);
+	}
 
 	signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
 
+	/*
+	 * If signaled we will immediately issue a FATAL error so drop the lock
+	 */
+	if (signaled)
+		LWLockRelease(WALRestoreCommandLock);
 	ereport(signaled ? FATAL : DEBUG2,
 		(errmsg("could not restore file \"%s\" from archive: return code %d",
 				xlogfname, rc)));
@@ -4203,7 +4284,9 @@ readTimeLineHistory(TimeLineID targetTLI)
 	if (InArchiveRecovery)
 	{
 		TLHistoryFileName(histfname, targetTLI);
+		LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
 		RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
+		LWLockRelease(WALRestoreCommandLock);
 	}
 	else
 		TLHistoryFilePath(path, targetTLI);
@@ -4292,7 +4375,9 @@ existsTimeLineHistory(TimeLineID probeTLI)
 	if (InArchiveRecovery)
 	{
 		TLHistoryFileName(histfname, probeTLI);
+		LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
 		RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
+		LWLockRelease(WALRestoreCommandLock);
 	}
 	else
 		TLHistoryFilePath(path, probeTLI);
@@ -4453,7 +4538,9 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 	if (InArchiveRecovery)
 	{
 		TLHistoryFileName(histfname, parentTLI);
+		LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
 		RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
+		LWLockRelease(WALRestoreCommandLock);
 	}
 	else
 		TLHistoryFilePath(path, parentTLI);
@@ -5299,10 +5386,10 @@ readRecoveryCommandFile(void)
 	{
 		if (strcmp(item->name, "restore_command") == 0)
 		{
-			recoveryRestoreCommand = pstrdup(item->value);
+			SetRecoveryRestoreCommand(pstrdup(item->value));
 			ereport(DEBUG2,
 					(errmsg_internal("restore_command = '%s'",
-									 recoveryRestoreCommand)));
+									 GetRecoveryRestoreCommand())));
 		}
 		else if (strcmp(item->name, "recovery_end_command") == 0)
 		{
@@ -5455,7 +5542,7 @@ readRecoveryCommandFile(void)
 	 */
 	if (StandbyMode)
 	{
-		if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
+		if (PrimaryConnInfo == NULL && strlen(GetRecoveryRestoreCommand()) == 0)
 			ereport(WARNING,
 					(errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
 							RECOVERY_COMMAND_FILE),
@@ -5463,7 +5550,7 @@ readRecoveryCommandFile(void)
 	}
 	else
 	{
-		if (recoveryRestoreCommand == NULL)
+		if (strlen(GetRecoveryRestoreCommand()) == 0)
 			ereport(FATAL,
 					(errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
 							RECOVERY_COMMAND_FILE)));
@@ -6432,7 +6519,7 @@ StartupXLOG(void)
 			PublishStartupProcessInformation();
 			SetForwardFsyncRequests();
 			SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
-			bgwriterLaunched = true;
+			ArchRecoveryBgProcsActive = true;
 		}
 
 		/*
@@ -6795,7 +6882,7 @@ StartupXLOG(void)
 		 * the rule that TLI only changes in shutdown checkpoints, which
 		 * allows some extra error checking in xlog_redo.
 		 */
-		if (bgwriterLaunched)
+		if (ArchRecoveryBgProcsActive)
 			RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
 							  CHECKPOINT_IMMEDIATE |
 							  CHECKPOINT_WAIT);
@@ -9640,7 +9727,7 @@ XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
 		 * Request a restartpoint if we've replayed too much
 		 * xlog since the last one.
 		 */
-		if (StandbyMode && bgwriterLaunched)
+		if (StandbyMode && ArchRecoveryBgProcsActive)
 		{
 			if (XLogCheckpointNeeded(readId, readSeg))
 			{
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index e3ae92d..81a8cb3 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -30,6 +30,7 @@
 #include "nodes/makefuncs.h"
 #include "postmaster/bgwriter.h"
 #include "postmaster/startup.h"
+#include "postmaster/walrestore.h"
 #include "postmaster/walwriter.h"
 #include "replication/walreceiver.h"
 #include "storage/bufmgr.h"
@@ -319,6 +320,9 @@ AuxiliaryProcessMain(int argc, char *argv[])
 			case CheckpointerProcess:
 				statmsg = "checkpointer process";
 				break;
+			case WalRestoreProcess:
+				statmsg = "wal restore process";
+				break;
 			case WalWriterProcess:
 				statmsg = "wal writer process";
 				break;
@@ -424,6 +428,11 @@ AuxiliaryProcessMain(int argc, char *argv[])
 			CheckpointerMain();
 			proc_exit(1);		/* should never return */
 
+		case WalRestoreProcess:
+			/* don't set signals, wal restore has its own agenda */
+			WalRestoreMain();
+			proc_exit(1);		/* should never return */
+
 		case WalWriterProcess:
 			/* don't set signals, walwriter has its own agenda */
 			InitXLOGAccess();
diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile
index 3056b09..349e722 100644
--- a/src/backend/postmaster/Makefile
+++ b/src/backend/postmaster/Makefile
@@ -13,6 +13,6 @@ top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
 OBJS = autovacuum.o bgwriter.o fork_process.o pgarch.o pgstat.o postmaster.o \
-	startup.o syslogger.o walwriter.o checkpointer.o
+	startup.o syslogger.o walrestore.o walwriter.o checkpointer.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index ad0c17a..15684c0 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -210,6 +210,7 @@ static pid_t StartupPID = 0,
 			BgWriterPID = 0,
 			CheckpointerPID = 0,
 			WalWriterPID = 0,
+			WalRestorePID = 0,
 			WalReceiverPID = 0,
 			AutoVacPID = 0,
 			PgArchPID = 0,
@@ -470,6 +471,7 @@ static void ShmemBackendArrayRemove(Backend *bn);
 #define StartCheckpointer()		StartChildProcess(CheckpointerProcess)
 #define StartWalWriter()		StartChildProcess(WalWriterProcess)
 #define StartWalReceiver()		StartChildProcess(WalReceiverProcess)
+#define StartWalRestore()		StartChildProcess(WalRestoreProcess)
 
 /* Macros to check exit status of a child process */
 #define EXIT_STATUS_0(st)  ((st) == 0)
@@ -2060,6 +2062,8 @@ SIGHUP_handler(SIGNAL_ARGS)
 			signal_child(WalWriterPID, SIGHUP);
 		if (WalReceiverPID != 0)
 			signal_child(WalReceiverPID, SIGHUP);
+		if (WalRestorePID != 0)
+			signal_child(WalRestorePID, SIGHUP);
 		if (AutoVacPID != 0)
 			signal_child(AutoVacPID, SIGHUP);
 		if (PgArchPID != 0)
@@ -2170,6 +2174,8 @@ pmdie(SIGNAL_ARGS)
 				signal_child(StartupPID, SIGTERM);
 			if (WalReceiverPID != 0)
 				signal_child(WalReceiverPID, SIGTERM);
+			if (WalRestorePID != 0)
+				signal_child(WalRestorePID, SIGTERM);
 			if (BgWriterPID != 0)
 				signal_child(BgWriterPID, SIGTERM);
 			if (pmState == PM_RECOVERY)
@@ -2225,6 +2231,8 @@ pmdie(SIGNAL_ARGS)
 				signal_child(WalWriterPID, SIGQUIT);
 			if (WalReceiverPID != 0)
 				signal_child(WalReceiverPID, SIGQUIT);
+			if (WalRestorePID != 0)
+				signal_child(WalRestorePID, SIGQUIT);
 			if (AutoVacPID != 0)
 				signal_child(AutoVacPID, SIGQUIT);
 			if (PgArchPID != 0)
@@ -2331,6 +2339,12 @@ reaper(SIGNAL_ARGS)
 			pmState = PM_RUN;
 
 			/*
+			 * Shutdown the WALRestore process
+			 */
+			if (WalRestorePID != 0)
+				signal_child(WalRestorePID, SIGTERM);
+
+			/*
 			 * Kill any walsenders to force the downstream standby(s) to
 			 * reread the timeline history file, adjust their timelines and
 			 * establish replication connections again. This is required
@@ -2477,6 +2491,30 @@ reaper(SIGNAL_ARGS)
 		}
 
 		/*
+		 * Was it the wal restore?  If exit status is zero (normal) or one
+		 * (FATAL exit), we assume everything is all right just like normal
+		 * backends.
+		 */
+		if (pid == WalRestorePID)
+		{
+			if (pmState >= PM_RUN)
+			{
+				WalRestorePID = 0;
+				continue;
+			}
+
+			/*
+			 * Any unexpected exit (including FATAL exit) of the WALRestore
+			 * process is treated as a crash, except that we don't want to
+			 * reinitialize because availability is important.
+			 */
+			RecoveryError = true;
+			HandleChildCrash(pid, exitstatus,
+							 _("walrestore process"));
+			continue;
+		}
+
+		/*
 		 * Was it the autovacuum launcher?	Normal exit can be ignored; we'll
 		 * start a new one at the next iteration of the postmaster's main
 		 * loop, if necessary.	Any other exit condition is treated as a
@@ -2756,6 +2794,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 		signal_child(WalReceiverPID, (SendStop ? SIGSTOP : SIGQUIT));
 	}
 
+	/* Take care of the walrestore too */
+	if (pid == WalRestorePID)
+		WalRestorePID = 0;
+	else if (WalRestorePID != 0 && !FatalError)
+	{
+		ereport(DEBUG2,
+				(errmsg_internal("sending %s to process %d",
+								 (SendStop ? "SIGSTOP" : "SIGQUIT"),
+								 (int) WalRestorePID)));
+		signal_child(WalRestorePID, (SendStop ? SIGSTOP : SIGQUIT));
+	}
+
 	/* Take care of the autovacuum launcher too */
 	if (pid == AutoVacPID)
 		AutoVacPID = 0;
@@ -2916,6 +2966,8 @@ PostmasterStateMachine(void)
 				signal_child(StartupPID, SIGTERM);
 			if (WalReceiverPID != 0)
 				signal_child(WalReceiverPID, SIGTERM);
+			if (WalRestorePID != 0)
+				signal_child(WalRestorePID, SIGTERM);
 			pmState = PM_WAIT_BACKENDS;
 		}
 	}
@@ -2940,6 +2992,7 @@ PostmasterStateMachine(void)
 		if (CountChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC) == 0 &&
 			StartupPID == 0 &&
 			WalReceiverPID == 0 &&
+			WalRestorePID == 0 &&
 			BgWriterPID == 0 &&
 			(CheckpointerPID == 0 || !FatalError) &&
 			WalWriterPID == 0 &&
@@ -3005,11 +3058,11 @@ PostmasterStateMachine(void)
 		 * left by now anyway; what we're really waiting for is walsenders and
 		 * archiver.
 		 *
-		 * Walreceiver should normally be dead by now, but not when a fast
-		 * shutdown is performed during recovery.
+		 * Walreceiver and Walrestore should normally be dead by now, but not
+		 * when a fast shutdown is performed during recovery.
 		 */
 		if (PgArchPID == 0 && CountChildren(BACKEND_TYPE_ALL) == 0 &&
-			WalReceiverPID == 0)
+			WalReceiverPID == 0 && WalRestorePID == 0)
 		{
 			pmState = PM_WAIT_DEAD_END;
 		}
@@ -3036,6 +3089,7 @@ PostmasterStateMachine(void)
 			/* These other guys should be dead already */
 			Assert(StartupPID == 0);
 			Assert(WalReceiverPID == 0);
+			Assert(WalRestorePID == 0);
 			Assert(BgWriterPID == 0);
 			Assert(CheckpointerPID == 0);
 			Assert(WalWriterPID == 0);
@@ -4219,6 +4273,8 @@ sigusr1_handler(SIGNAL_ARGS)
 		BgWriterPID = StartBackgroundWriter();
 		Assert(CheckpointerPID == 0);
 		CheckpointerPID = StartCheckpointer();
+		Assert(WalRestorePID == 0);
+		WalRestorePID = StartWalRestore();
 
 		pmState = PM_RECOVERY;
 	}
diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c
index ed75d09..1791feb 100644
--- a/src/backend/postmaster/startup.c
+++ b/src/backend/postmaster/startup.c
@@ -35,14 +35,14 @@
  * Flags set by interrupt handlers for later service in the redo loop.
  */
 static volatile sig_atomic_t got_SIGHUP = false;
-static volatile sig_atomic_t shutdown_requested = false;
 static volatile sig_atomic_t promote_triggered = false;
+volatile sig_atomic_t startup_shutdown_requested = false;
 
 /*
  * Flag set when executing a restore command, to tell SIGTERM signal handler
  * that it's safe to just proc_exit.
  */
-static volatile sig_atomic_t in_restore_command = false;
+volatile sig_atomic_t in_restore_command = false;
 
 /* Signal handlers */
 static void startupproc_quickdie(SIGNAL_ARGS);
@@ -131,9 +131,16 @@ StartupProcShutdownHandler(SIGNAL_ARGS)
 	int			save_errno = errno;
 
 	if (in_restore_command)
+	{
+		/*
+		 * See RestoreArchivedFile() for explanation of why this
+		 * lock is always held when in_restore_command is true.
+		 */
+		LWLockRelease(WALRestoreCommandLock);
 		proc_exit(1);
+	}
 	else
-		shutdown_requested = true;
+		startup_shutdown_requested = true;
 	WakeupRecovery();
 
 	errno = save_errno;
@@ -155,7 +162,7 @@ HandleStartupProcInterrupts(void)
 	/*
 	 * Check if we were requested to exit without finishing recovery.
 	 */
-	if (shutdown_requested)
+	if (startup_shutdown_requested)
 		proc_exit(1);
 
 	/*
@@ -226,26 +233,6 @@ StartupProcessMain(void)
 	proc_exit(0);
 }
 
-void
-PreRestoreCommand(void)
-{
-	/*
-	 * Set in_restore_command to tell the signal handler that we should exit
-	 * right away on SIGTERM. We know that we're at a safe point to do that.
-	 * Check if we had already received the signal, so that we don't miss a
-	 * shutdown request received just before this.
-	 */
-	in_restore_command = true;
-	if (shutdown_requested)
-		proc_exit(1);
-}
-
-void
-PostRestoreCommand(void)
-{
-	in_restore_command = false;
-}
-
 bool
 IsPromoteTriggered(void)
 {
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index ef1dc91..8f4443a 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -26,6 +26,7 @@
 #include "postmaster/autovacuum.h"
 #include "postmaster/bgwriter.h"
 #include "postmaster/postmaster.h"
+#include "postmaster/walrestore.h"
 #include "replication/walreceiver.h"
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
@@ -123,6 +124,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
 		size = add_size(size, AutoVacuumShmemSize());
 		size = add_size(size, WalSndShmemSize());
 		size = add_size(size, WalRcvShmemSize());
+		size = add_size(size, WalRestoreShmemSize());
 		size = add_size(size, BTreeShmemSize());
 		size = add_size(size, SyncScanShmemSize());
 		size = add_size(size, AsyncShmemSize());
@@ -228,6 +230,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
 	AutoVacuumShmemInit();
 	WalSndShmemInit();
 	WalRcvShmemInit();
+	WalRestoreShmemInit();
 
 	/*
 	 * Set up other modules that need some shared memory space
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 1ddf4bf..e9e5325 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -270,7 +270,8 @@ extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
 extern int XLogFileInit(uint32 log, uint32 seg,
 			 bool *use_existent, bool use_lock);
 extern int	XLogFileOpen(uint32 log, uint32 seg);
-
+extern bool RestoreArchivedFile(char *path, const char *xlogfname,
+					const char *recovername, off_t expectedSize);
 
 extern void XLogGetLastRemoved(uint32 *log, uint32 *seg);
 extern void XLogSetAsyncXactLSN(XLogRecPtr record);
@@ -316,6 +317,7 @@ extern TimeLineID GetRecoveryTargetTLI(void);
 extern bool CheckPromoteSignal(void);
 extern void WakeupRecovery(void);
 extern Latch *WALWriterLatch(void);
+extern Latch *WALRestoreLatch(void);
 
 /*
  * Starting/stopping a base backup
diff --git a/src/include/bootstrap/bootstrap.h b/src/include/bootstrap/bootstrap.h
index e966a73..b90ce33 100644
--- a/src/include/bootstrap/bootstrap.h
+++ b/src/include/bootstrap/bootstrap.h
@@ -23,6 +23,7 @@ typedef enum
 	StartupProcess,
 	BgWriterProcess,
 	CheckpointerProcess,
+	WalRestoreProcess,
 	WalWriterProcess,
 	WalReceiverProcess,
 
diff --git a/src/include/postmaster/startup.h b/src/include/postmaster/startup.h
index 3ec6950..35d9665 100644
--- a/src/include/postmaster/startup.h
+++ b/src/include/postmaster/startup.h
@@ -12,10 +12,11 @@
 #ifndef _STARTUP_H
 #define _STARTUP_H
 
+extern volatile sig_atomic_t startup_shutdown_requested;
+extern volatile sig_atomic_t in_restore_command;
+
 extern void HandleStartupProcInterrupts(void);
 extern void StartupProcessMain(void);
-extern void PreRestoreCommand(void);
-extern void PostRestoreCommand(void);
 extern bool IsPromoteTriggered(void);
 extern void ResetPromoteTriggered(void);
 
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index df3df29..c316dcc 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -79,6 +79,7 @@ typedef enum LWLockId
 	SerializablePredicateLockListLock,
 	OldSerXidLock,
 	SyncRepLock,
+	WALRestoreCommandLock,
 	/* Individual lock IDs end here */
 	FirstBufMappingLock,
 	FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS,
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 358d1a4..50d4f35 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -204,12 +204,20 @@ extern PGPROC *PreparedXactProcs;
 /*
  * We set aside some extra PGPROC structures for auxiliary processes,
  * ie things that aren't full-fledged backends but need shmem access.
+ * Logger, archiver and stats processes don't count towards this total.
  *
+ * This needs to be set to whichever of normal running or recovery has the
+ * highest number of backends that might occur together.
+ *
+ * During normal running we need slots for:
  * Background writer, checkpointer and WAL writer run during normal operation.
- * Startup process and WAL receiver also consume 2 slots, but WAL writer is
- * launched only after startup has exited, so we only need 4 slots.
+ * 3 slots
+ *
+ * During recovery we need slots for:
+ * Background writer, checkpointer, Startup process, WAL receiver, WAL restore.
+ * 5 slots
  */
-#define NUM_AUXILIARY_PROCS		4
+#define NUM_AUXILIARY_PROCS		5
 
 
 /* configurable options */

Fujii Masao

masao.fujii@gmail.com

almost 14 years ago

In reply to: Simon Riggs (#4)

Re: WAL Restore process during recovery

On Fri, Jan 20, 2012 at 4:17 AM, Simon Riggs <simon@2ndquadrant.com> wrote:

On Tue, Jan 17, 2012 at 6:52 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Mon, Jan 16, 2012 at 2:06 AM, Simon Riggs <simon@2ndquadrant.com> wrote:

WALRestore process asynchronously executes restore_command while
recovery continues working.

Overlaps downloading of next WAL file to reduce time delays in file
based archive recovery.

Handles cases of file-only and streaming/file correctly.

Though I've not reviewed the patch deeply yet, I observed the following
two problems when I tested the patch.

When I set up streaming replication + archive (i.e., restore_command is set)
and started the standby, I got the following error:

FATAL: all AuxiliaryProcs are in use
LOG: walrestore process (PID 18839) exited with exit code 1

Fixed and better documented.

When I started an archive recovery without setting restore_command,
it successfully finished.

Not sure exactly what you mean, but I fixed a bug that might be
something you're seeing.

Thanks!

But you forgot to include walrestore.c and .h in the patch. Can you submit
the updated version of the patch?

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

Simon Riggs

simon@2ndQuadrant.com

almost 14 years ago

In reply to: Fujii Masao (#5)

1 attachment(s)

Re: WAL Restore process during recovery

On Fri, Jan 20, 2012 at 3:43 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

Requested update

--
Simon Riggs http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

Attachments:

walrestore_process.v2.patchtext/x-patch; charset=US-ASCII; name=walrestore_process.v2.patchDownload

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ce659ec..469e6d6 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -40,6 +40,7 @@
 #include "pgstat.h"
 #include "postmaster/bgwriter.h"
 #include "postmaster/startup.h"
+#include "postmaster/walrestore.h"
 #include "replication/walreceiver.h"
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
@@ -187,7 +188,6 @@ static bool InArchiveRecovery = false;
 static bool restoredFromArchive = false;
 
 /* options taken from recovery.conf for archive recovery */
-static char *recoveryRestoreCommand = NULL;
 static char *recoveryEndCommand = NULL;
 static char *archiveCleanupCommand = NULL;
 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
@@ -575,8 +575,8 @@ bool reachedConsistency = false;
 
 static bool InRedo = false;
 
-/* Have we launched bgwriter during recovery? */
-static bool bgwriterLaunched = false;
+/* Have we launched background procs during archive recovery yet? */
+static bool ArchRecoveryBgProcsActive = false;
 
 /*
  * Information logged when we detect a change in one of the parameters
@@ -632,8 +632,6 @@ static bool XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
 			 bool randAccess);
 static int	emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 static void XLogFileClose(void);
-static bool RestoreArchivedFile(char *path, const char *xlogfname,
-					const char *recovername, off_t expectedSize);
 static void ExecuteRecoveryCommand(char *command, char *commandName,
 					   bool failOnerror);
 static void PreallocXlogFiles(XLogRecPtr endptr);
@@ -2706,19 +2704,47 @@ XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
 
 	XLogFileName(xlogfname, tli, log, seg);
 
+#define TMPRECOVERYXLOG	"RECOVERYXLOG"
+
 	switch (source)
 	{
 		case XLOG_FROM_ARCHIVE:
+			/*
+			 * Check to see if the WALRestore process has already put the
+			 * next file in place while we were working. If so, use that.
+			 * If not, get it ourselves. This makes it easier to handle
+			 * initial state before the WALRestore is active, and also
+			 * handles the stop/start logic correctly when we have both
+			 * streaming and file based replication active.
+			 *
+			 * We queue up the next task for WALRestore after we've begun to
+			 * use this file later in XLogFileRead().
+			 *
+			 * If the WALRestore process is still active, the lock wait makes
+			 * us wait, which is just like we were executing the command
+			 * ourselves and so doesn't alter the logic elsewhere.
+			 */
+			if (XLogFileIsNowFullyRestored(tli, log, seg))
+			{
+				snprintf(path, MAXPGPATH, XLOGDIR "/%s", TMPRECOVERYXLOG);
+				restoredFromArchive = true;
+				break;
+			}
+
 			/* Report recovery progress in PS display */
 			snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
 					 xlogfname);
 			set_ps_display(activitymsg, false);
 
 			restoredFromArchive = RestoreArchivedFile(path, xlogfname,
-													  "RECOVERYXLOG",
+													  TMPRECOVERYXLOG,
 													  XLogSegSize);
+
 			if (!restoredFromArchive)
+			{
+				LWLockRelease(WALRestoreCommandLock);
 				return -1;
+			}
 			break;
 
 		case XLOG_FROM_PG_XLOG:
@@ -2748,18 +2774,42 @@ XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
 		if (stat(xlogfpath, &statbuf) == 0)
 		{
 			if (unlink(xlogfpath) != 0)
+			{
+				LWLockRelease(WALRestoreCommandLock);
 				ereport(FATAL,
 						(errcode_for_file_access(),
 						 errmsg("could not remove file \"%s\": %m",
 								xlogfpath)));
+			}
 			reload = true;
 		}
 
 		if (rename(path, xlogfpath) < 0)
+		{
+			LWLockRelease(WALRestoreCommandLock);
 			ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not rename file \"%s\" to \"%s\": %m",
 						path, xlogfpath)));
+		}
+
+		/*
+		 * Make sure we recover from the new filename, so we can reuse the
+		 * temporary filename for asynchronous restore actions.
+		 */
+		strcpy(path, xlogfpath);
+
+		/*
+		 * Tell the WALRestore process to get the next file now.
+		 * Hopefully it will be ready for use in time for the next call the
+		 * Startup process makes to XLogFileRead().
+		 *
+		 * It might seem like we should do that earlier but then there is a
+		 * race condition that might lead to replacing RECOVERYXLOG with
+		 * another file before we've copied it.
+		 */
+		SetNextWALRestoreLogSeg(tli, log, seg);
+		LWLockRelease(WALRestoreCommandLock);
 
 		/*
 		 * If the existing segment was replaced, since walsenders might have
@@ -2911,8 +2961,11 @@ XLogFileClose(void)
  * For fixed-size files, the caller may pass the expected size as an
  * additional crosscheck on successful recovery.  If the file size is not
  * known, set expectedSize = 0.
+ *
+ * Must be called with WALRestoreCommandLock held and must be held at exit,
+ * if the function returns.
  */
-static bool
+bool
 RestoreArchivedFile(char *path, const char *xlogfname,
 					const char *recovername, off_t expectedSize)
 {
@@ -2929,7 +2982,7 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	uint32		restartSeg;
 
 	/* In standby mode, restore_command might not be supplied */
-	if (recoveryRestoreCommand == NULL)
+	if (strlen(GetRecoveryRestoreCommand()) == 0)
 		goto not_available;
 
 	/*
@@ -2963,18 +3016,24 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	if (stat(xlogpath, &stat_buf) != 0)
 	{
 		if (errno != ENOENT)
+		{
+			LWLockRelease(WALRestoreCommandLock);
 			ereport(FATAL,
 					(errcode_for_file_access(),
 					 errmsg("could not stat file \"%s\": %m",
 							xlogpath)));
+		}
 	}
 	else
 	{
 		if (unlink(xlogpath) != 0)
+		{
+			LWLockRelease(WALRestoreCommandLock);
 			ereport(FATAL,
 					(errcode_for_file_access(),
 					 errmsg("could not remove file \"%s\": %m",
 							xlogpath)));
+		}
 	}
 
 	/*
@@ -3013,7 +3072,7 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	endp = xlogRestoreCmd + MAXPGPATH - 1;
 	*endp = '\0';
 
-	for (sp = recoveryRestoreCommand; *sp; sp++)
+	for (sp = GetRecoveryRestoreCommand(); *sp; sp++)
 	{
 		if (*sp == '%')
 		{
@@ -3059,21 +3118,29 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	}
 	*dp = '\0';
 
-	ereport(DEBUG3,
+	ereport(DEBUG2,
 			(errmsg_internal("executing restore command \"%s\"",
 							 xlogRestoreCmd)));
 
 	/*
-	 * Check signals before restore command and reset afterwards.
+	 * Set in_restore_command to tell the signal handler that we should exit
+	 * right away on SIGTERM. We know that we're at a safe point to do that.
+	 * Check if we had already received the signal, so that we don't miss a
+	 * shutdown request received just before this.
 	 */
-	PreRestoreCommand();
+	in_restore_command = true;
+	if (startup_shutdown_requested || walrestore_shutdown_requested)
+	{
+		LWLockRelease(WALRestoreCommandLock);
+		proc_exit(1);
+	}
 
 	/*
 	 * Copy xlog from archival storage to XLOGDIR
 	 */
 	rc = system(xlogRestoreCmd);
 
-	PostRestoreCommand();
+	in_restore_command = false;
 
 	if (rc == 0)
 	{
@@ -3102,7 +3169,10 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 				if (StandbyMode && stat_buf.st_size < expectedSize)
 					elevel = DEBUG1;
 				else
+				{
+					LWLockRelease(WALRestoreCommandLock);
 					elevel = FATAL;
+				}
 				ereport(elevel,
 						(errmsg("archive file \"%s\" has wrong size: %lu instead of %lu",
 								xlogfname,
@@ -3123,10 +3193,13 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 		{
 			/* stat failed */
 			if (errno != ENOENT)
+			{
+				LWLockRelease(WALRestoreCommandLock);
 				ereport(FATAL,
 						(errcode_for_file_access(),
 						 errmsg("could not stat file \"%s\": %m",
 								xlogpath)));
+			}
 		}
 	}
 
@@ -3158,10 +3231,18 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	 * too.
 	 */
 	if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
+	{
+		LWLockRelease(WALRestoreCommandLock);
 		proc_exit(1);
+	}
 
 	signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
 
+	/*
+	 * If signaled we will immediately issue a FATAL error so drop the lock
+	 */
+	if (signaled)
+		LWLockRelease(WALRestoreCommandLock);
 	ereport(signaled ? FATAL : DEBUG2,
 		(errmsg("could not restore file \"%s\" from archive: return code %d",
 				xlogfname, rc)));
@@ -4203,7 +4284,9 @@ readTimeLineHistory(TimeLineID targetTLI)
 	if (InArchiveRecovery)
 	{
 		TLHistoryFileName(histfname, targetTLI);
+		LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
 		RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
+		LWLockRelease(WALRestoreCommandLock);
 	}
 	else
 		TLHistoryFilePath(path, targetTLI);
@@ -4292,7 +4375,9 @@ existsTimeLineHistory(TimeLineID probeTLI)
 	if (InArchiveRecovery)
 	{
 		TLHistoryFileName(histfname, probeTLI);
+		LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
 		RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
+		LWLockRelease(WALRestoreCommandLock);
 	}
 	else
 		TLHistoryFilePath(path, probeTLI);
@@ -4453,7 +4538,9 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 	if (InArchiveRecovery)
 	{
 		TLHistoryFileName(histfname, parentTLI);
+		LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
 		RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
+		LWLockRelease(WALRestoreCommandLock);
 	}
 	else
 		TLHistoryFilePath(path, parentTLI);
@@ -5299,10 +5386,10 @@ readRecoveryCommandFile(void)
 	{
 		if (strcmp(item->name, "restore_command") == 0)
 		{
-			recoveryRestoreCommand = pstrdup(item->value);
+			SetRecoveryRestoreCommand(pstrdup(item->value));
 			ereport(DEBUG2,
 					(errmsg_internal("restore_command = '%s'",
-									 recoveryRestoreCommand)));
+									 GetRecoveryRestoreCommand())));
 		}
 		else if (strcmp(item->name, "recovery_end_command") == 0)
 		{
@@ -5455,7 +5542,7 @@ readRecoveryCommandFile(void)
 	 */
 	if (StandbyMode)
 	{
-		if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
+		if (PrimaryConnInfo == NULL && strlen(GetRecoveryRestoreCommand()) == 0)
 			ereport(WARNING,
 					(errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
 							RECOVERY_COMMAND_FILE),
@@ -5463,7 +5550,7 @@ readRecoveryCommandFile(void)
 	}
 	else
 	{
-		if (recoveryRestoreCommand == NULL)
+		if (strlen(GetRecoveryRestoreCommand()) == 0)
 			ereport(FATAL,
 					(errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
 							RECOVERY_COMMAND_FILE)));
@@ -6432,7 +6519,7 @@ StartupXLOG(void)
 			PublishStartupProcessInformation();
 			SetForwardFsyncRequests();
 			SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
-			bgwriterLaunched = true;
+			ArchRecoveryBgProcsActive = true;
 		}
 
 		/*
@@ -6795,7 +6882,7 @@ StartupXLOG(void)
 		 * the rule that TLI only changes in shutdown checkpoints, which
 		 * allows some extra error checking in xlog_redo.
 		 */
-		if (bgwriterLaunched)
+		if (ArchRecoveryBgProcsActive)
 			RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
 							  CHECKPOINT_IMMEDIATE |
 							  CHECKPOINT_WAIT);
@@ -9640,7 +9727,7 @@ XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
 		 * Request a restartpoint if we've replayed too much
 		 * xlog since the last one.
 		 */
-		if (StandbyMode && bgwriterLaunched)
+		if (StandbyMode && ArchRecoveryBgProcsActive)
 		{
 			if (XLogCheckpointNeeded(readId, readSeg))
 			{
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index e3ae92d..81a8cb3 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -30,6 +30,7 @@
 #include "nodes/makefuncs.h"
 #include "postmaster/bgwriter.h"
 #include "postmaster/startup.h"
+#include "postmaster/walrestore.h"
 #include "postmaster/walwriter.h"
 #include "replication/walreceiver.h"
 #include "storage/bufmgr.h"
@@ -319,6 +320,9 @@ AuxiliaryProcessMain(int argc, char *argv[])
 			case CheckpointerProcess:
 				statmsg = "checkpointer process";
 				break;
+			case WalRestoreProcess:
+				statmsg = "wal restore process";
+				break;
 			case WalWriterProcess:
 				statmsg = "wal writer process";
 				break;
@@ -424,6 +428,11 @@ AuxiliaryProcessMain(int argc, char *argv[])
 			CheckpointerMain();
 			proc_exit(1);		/* should never return */
 
+		case WalRestoreProcess:
+			/* don't set signals, wal restore has its own agenda */
+			WalRestoreMain();
+			proc_exit(1);		/* should never return */
+
 		case WalWriterProcess:
 			/* don't set signals, walwriter has its own agenda */
 			InitXLOGAccess();
diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile
index 3056b09..349e722 100644
--- a/src/backend/postmaster/Makefile
+++ b/src/backend/postmaster/Makefile
@@ -13,6 +13,6 @@ top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
 OBJS = autovacuum.o bgwriter.o fork_process.o pgarch.o pgstat.o postmaster.o \
-	startup.o syslogger.o walwriter.o checkpointer.o
+	startup.o syslogger.o walrestore.o walwriter.o checkpointer.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index ad0c17a..15684c0 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -210,6 +210,7 @@ static pid_t StartupPID = 0,
 			BgWriterPID = 0,
 			CheckpointerPID = 0,
 			WalWriterPID = 0,
+			WalRestorePID = 0,
 			WalReceiverPID = 0,
 			AutoVacPID = 0,
 			PgArchPID = 0,
@@ -470,6 +471,7 @@ static void ShmemBackendArrayRemove(Backend *bn);
 #define StartCheckpointer()		StartChildProcess(CheckpointerProcess)
 #define StartWalWriter()		StartChildProcess(WalWriterProcess)
 #define StartWalReceiver()		StartChildProcess(WalReceiverProcess)
+#define StartWalRestore()		StartChildProcess(WalRestoreProcess)
 
 /* Macros to check exit status of a child process */
 #define EXIT_STATUS_0(st)  ((st) == 0)
@@ -2060,6 +2062,8 @@ SIGHUP_handler(SIGNAL_ARGS)
 			signal_child(WalWriterPID, SIGHUP);
 		if (WalReceiverPID != 0)
 			signal_child(WalReceiverPID, SIGHUP);
+		if (WalRestorePID != 0)
+			signal_child(WalRestorePID, SIGHUP);
 		if (AutoVacPID != 0)
 			signal_child(AutoVacPID, SIGHUP);
 		if (PgArchPID != 0)
@@ -2170,6 +2174,8 @@ pmdie(SIGNAL_ARGS)
 				signal_child(StartupPID, SIGTERM);
 			if (WalReceiverPID != 0)
 				signal_child(WalReceiverPID, SIGTERM);
+			if (WalRestorePID != 0)
+				signal_child(WalRestorePID, SIGTERM);
 			if (BgWriterPID != 0)
 				signal_child(BgWriterPID, SIGTERM);
 			if (pmState == PM_RECOVERY)
@@ -2225,6 +2231,8 @@ pmdie(SIGNAL_ARGS)
 				signal_child(WalWriterPID, SIGQUIT);
 			if (WalReceiverPID != 0)
 				signal_child(WalReceiverPID, SIGQUIT);
+			if (WalRestorePID != 0)
+				signal_child(WalRestorePID, SIGQUIT);
 			if (AutoVacPID != 0)
 				signal_child(AutoVacPID, SIGQUIT);
 			if (PgArchPID != 0)
@@ -2331,6 +2339,12 @@ reaper(SIGNAL_ARGS)
 			pmState = PM_RUN;
 
 			/*
+			 * Shutdown the WALRestore process
+			 */
+			if (WalRestorePID != 0)
+				signal_child(WalRestorePID, SIGTERM);
+
+			/*
 			 * Kill any walsenders to force the downstream standby(s) to
 			 * reread the timeline history file, adjust their timelines and
 			 * establish replication connections again. This is required
@@ -2477,6 +2491,30 @@ reaper(SIGNAL_ARGS)
 		}
 
 		/*
+		 * Was it the wal restore?  If exit status is zero (normal) or one
+		 * (FATAL exit), we assume everything is all right just like normal
+		 * backends.
+		 */
+		if (pid == WalRestorePID)
+		{
+			if (pmState >= PM_RUN)
+			{
+				WalRestorePID = 0;
+				continue;
+			}
+
+			/*
+			 * Any unexpected exit (including FATAL exit) of the WALRestore
+			 * process is treated as a crash, except that we don't want to
+			 * reinitialize because availability is important.
+			 */
+			RecoveryError = true;
+			HandleChildCrash(pid, exitstatus,
+							 _("walrestore process"));
+			continue;
+		}
+
+		/*
 		 * Was it the autovacuum launcher?	Normal exit can be ignored; we'll
 		 * start a new one at the next iteration of the postmaster's main
 		 * loop, if necessary.	Any other exit condition is treated as a
@@ -2756,6 +2794,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 		signal_child(WalReceiverPID, (SendStop ? SIGSTOP : SIGQUIT));
 	}
 
+	/* Take care of the walrestore too */
+	if (pid == WalRestorePID)
+		WalRestorePID = 0;
+	else if (WalRestorePID != 0 && !FatalError)
+	{
+		ereport(DEBUG2,
+				(errmsg_internal("sending %s to process %d",
+								 (SendStop ? "SIGSTOP" : "SIGQUIT"),
+								 (int) WalRestorePID)));
+		signal_child(WalRestorePID, (SendStop ? SIGSTOP : SIGQUIT));
+	}
+
 	/* Take care of the autovacuum launcher too */
 	if (pid == AutoVacPID)
 		AutoVacPID = 0;
@@ -2916,6 +2966,8 @@ PostmasterStateMachine(void)
 				signal_child(StartupPID, SIGTERM);
 			if (WalReceiverPID != 0)
 				signal_child(WalReceiverPID, SIGTERM);
+			if (WalRestorePID != 0)
+				signal_child(WalRestorePID, SIGTERM);
 			pmState = PM_WAIT_BACKENDS;
 		}
 	}
@@ -2940,6 +2992,7 @@ PostmasterStateMachine(void)
 		if (CountChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC) == 0 &&
 			StartupPID == 0 &&
 			WalReceiverPID == 0 &&
+			WalRestorePID == 0 &&
 			BgWriterPID == 0 &&
 			(CheckpointerPID == 0 || !FatalError) &&
 			WalWriterPID == 0 &&
@@ -3005,11 +3058,11 @@ PostmasterStateMachine(void)
 		 * left by now anyway; what we're really waiting for is walsenders and
 		 * archiver.
 		 *
-		 * Walreceiver should normally be dead by now, but not when a fast
-		 * shutdown is performed during recovery.
+		 * Walreceiver and Walrestore should normally be dead by now, but not
+		 * when a fast shutdown is performed during recovery.
 		 */
 		if (PgArchPID == 0 && CountChildren(BACKEND_TYPE_ALL) == 0 &&
-			WalReceiverPID == 0)
+			WalReceiverPID == 0 && WalRestorePID == 0)
 		{
 			pmState = PM_WAIT_DEAD_END;
 		}
@@ -3036,6 +3089,7 @@ PostmasterStateMachine(void)
 			/* These other guys should be dead already */
 			Assert(StartupPID == 0);
 			Assert(WalReceiverPID == 0);
+			Assert(WalRestorePID == 0);
 			Assert(BgWriterPID == 0);
 			Assert(CheckpointerPID == 0);
 			Assert(WalWriterPID == 0);
@@ -4219,6 +4273,8 @@ sigusr1_handler(SIGNAL_ARGS)
 		BgWriterPID = StartBackgroundWriter();
 		Assert(CheckpointerPID == 0);
 		CheckpointerPID = StartCheckpointer();
+		Assert(WalRestorePID == 0);
+		WalRestorePID = StartWalRestore();
 
 		pmState = PM_RECOVERY;
 	}
diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c
index ed75d09..1791feb 100644
--- a/src/backend/postmaster/startup.c
+++ b/src/backend/postmaster/startup.c
@@ -35,14 +35,14 @@
  * Flags set by interrupt handlers for later service in the redo loop.
  */
 static volatile sig_atomic_t got_SIGHUP = false;
-static volatile sig_atomic_t shutdown_requested = false;
 static volatile sig_atomic_t promote_triggered = false;
+volatile sig_atomic_t startup_shutdown_requested = false;
 
 /*
  * Flag set when executing a restore command, to tell SIGTERM signal handler
  * that it's safe to just proc_exit.
  */
-static volatile sig_atomic_t in_restore_command = false;
+volatile sig_atomic_t in_restore_command = false;
 
 /* Signal handlers */
 static void startupproc_quickdie(SIGNAL_ARGS);
@@ -131,9 +131,16 @@ StartupProcShutdownHandler(SIGNAL_ARGS)
 	int			save_errno = errno;
 
 	if (in_restore_command)
+	{
+		/*
+		 * See RestoreArchivedFile() for explanation of why this
+		 * lock is always held when in_restore_command is true.
+		 */
+		LWLockRelease(WALRestoreCommandLock);
 		proc_exit(1);
+	}
 	else
-		shutdown_requested = true;
+		startup_shutdown_requested = true;
 	WakeupRecovery();
 
 	errno = save_errno;
@@ -155,7 +162,7 @@ HandleStartupProcInterrupts(void)
 	/*
 	 * Check if we were requested to exit without finishing recovery.
 	 */
-	if (shutdown_requested)
+	if (startup_shutdown_requested)
 		proc_exit(1);
 
 	/*
@@ -226,26 +233,6 @@ StartupProcessMain(void)
 	proc_exit(0);
 }
 
-void
-PreRestoreCommand(void)
-{
-	/*
-	 * Set in_restore_command to tell the signal handler that we should exit
-	 * right away on SIGTERM. We know that we're at a safe point to do that.
-	 * Check if we had already received the signal, so that we don't miss a
-	 * shutdown request received just before this.
-	 */
-	in_restore_command = true;
-	if (shutdown_requested)
-		proc_exit(1);
-}
-
-void
-PostRestoreCommand(void)
-{
-	in_restore_command = false;
-}
-
 bool
 IsPromoteTriggered(void)
 {
diff --git a/src/backend/postmaster/walrestore.c b/src/backend/postmaster/walrestore.c
new file mode 100644
index 0000000..7634d36
--- /dev/null
+++ b/src/backend/postmaster/walrestore.c
@@ -0,0 +1,474 @@
+/*-------------------------------------------------------------------------
+ *
+ * walrestore.c
+ *
+ * The WAL restore process is new as of Postgres 9.2, though the work it performs
+ * has been handled by the startup process from Postgres 8.0 until 9.1.
+ *
+ * WALRestore process executes the restore_command. If not set, it sleeps.
+ * The startup process no longer executes the restore_command and knows
+ * little about where the WAL files have come from.
+ *
+ * The WAL restore process is started by the postmaster when we enter
+ * PM_RECOVERY state and exits immediately after startup finishes.
+ * It remains alive until the postmaster commands it to terminate.
+ * Normal termination is by SIGTERM, which instructs restore process to exit(0).
+ * Like any backend, restore process will simply abort and exit on SIGQUIT.
+ *
+ * Note that the WAL restore process only executes the restore_command.
+ * The archive_cleanup_command is exeuted by the checkpointer, while the
+ * recovery_end_command and requests for history files are executed by the
+ * startup process. That is not important to the way those commands execute.
+ * All processes that use the restore_command must hold WALRestoreCommandLock
+ * before they execute it, since we definitely wish to avoid trying to get the
+ * same file more than once concurrently, plus we can't assume that the
+ * user has specified command that would succeed if run concurrently.
+ *
+ * If the WAL restore exits unexpectedly, the postmaster treats that the same
+ * as a backend crash: shared memory may be corrupted, so remaining backends
+ * should be killed by SIGQUIT and then a recovery cycle started.
+ *
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/postmaster/walrestore.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <sys/time.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "access/xlog_internal.h"
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/startup.h"
+#include "postmaster/walrestore.h"
+#include "storage/bufmgr.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/pmsignal.h"
+#include "storage/shmem.h"
+#include "utils/guc.h"
+#include "utils/ps_status.h"
+#include "utils/timestamp.h"
+
+/* XXX Set to DEBUG4 prior to patch commit */
+#define WALRSTR_DEBUG_LEVEL 		LOG
+
+/*
+ * GUC parameters
+ */
+int	WalRestoreDelay = 10000;
+
+WalRestoreData *WalRstr = NULL;
+
+/*
+ * Flags set by interrupt handlers for later service in the main loop.
+ */
+static volatile sig_atomic_t got_SIGHUP = false;
+volatile sig_atomic_t walrestore_shutdown_requested = false;
+
+/* Prototypes for private functions */
+
+static bool WalRestoreNextFile(void);
+
+/* Signal handlers */
+
+static void walrestore_quickdie(SIGNAL_ARGS);
+static void WalRestoreProcSigUsr1Handler(SIGNAL_ARGS);
+static void WalRestoreSigHupHandler(SIGNAL_ARGS);
+static void WalRestoreShutdownHandler(SIGNAL_ARGS);
+
+
+/*
+ * Main entry point for walrestore process
+ *
+ * This is invoked from BootstrapMain, which has already created the basic
+ * execution environment, but not enabled signals yet.
+ */
+void
+WalRestoreMain(void)
+{
+	/* use volatile pointer to prevent code rearrangement */
+	volatile WalRestoreData *walrstr = WalRstr;
+
+	/*
+	 * WalRstr should be set up already (if we are a backend, we inherit this
+	 * by fork() or EXEC_BACKEND mechanism from the postmaster).
+	 */
+	Assert(walrstr != NULL);
+
+	InitLatch(&walrstr->WALRestoreLatch); /* initialize latch used in main loop */
+
+	/*
+	 * If possible, make this process a group leader, so that the postmaster
+	 * can signal any child processes too.
+	 */
+#ifdef HAVE_SETSID
+	if (setsid() < 0)
+		elog(FATAL, "setsid() failed: %m");
+#endif
+
+	/*
+	 * Properly accept or ignore signals the postmaster might send us
+	 *
+	 * SIGUSR1 is presently unused; keep it spare in case someday we want this
+	 * process to participate in ProcSignal signalling.
+	 */
+	pqsignal(SIGHUP, WalRestoreSigHupHandler);	/* set flag to read config file */
+	pqsignal(SIGINT, SIG_IGN);
+	pqsignal(SIGTERM, WalRestoreShutdownHandler); 	/* shutdown */
+	pqsignal(SIGQUIT, walrestore_quickdie);		/* hard crash time */
+	pqsignal(SIGALRM, SIG_IGN);
+	pqsignal(SIGPIPE, SIG_IGN);
+	pqsignal(SIGUSR1, WalRestoreProcSigUsr1Handler);	/* reserve for ProcSignal */
+	pqsignal(SIGUSR2, SIG_IGN);
+
+	/*
+	 * Reset some signals that are accepted by postmaster but not here
+	 */
+	pqsignal(SIGCHLD, SIG_DFL);
+	pqsignal(SIGTTIN, SIG_DFL);
+	pqsignal(SIGTTOU, SIG_DFL);
+	pqsignal(SIGCONT, SIG_DFL);
+	pqsignal(SIGWINCH, SIG_DFL);
+
+	/* We allow SIGQUIT (quickdie) at all times */
+	sigdelset(&BlockSig, SIGQUIT);
+
+	/*
+	 * Unblock signals (they were blocked when the postmaster forked us)
+	 */
+	PG_SETMASK(&UnBlockSig);
+
+	/*
+	 * Loop forever
+	 */
+	for (;;)
+	{
+		ResetLatch(&walrstr->WALRestoreLatch);
+
+		/*
+		 * Emergency bailout if postmaster has died.  This is to avoid the
+		 * necessity for manual cleanup of all postmaster children.
+		 */
+		if (!PostmasterIsAlive())
+			exit(1);
+
+		if (got_SIGHUP)
+		{
+			got_SIGHUP = false;
+			ProcessConfigFile(PGC_SIGHUP);
+		}
+
+		if (walrestore_shutdown_requested)
+		{
+			/*
+			 * From here on, elog(ERROR) should end with exit(1), not send
+			 * control back to the sigsetjmp block above
+			 */
+			ExitOnAnyError = true;
+			/* Normal exit from the walwriter is here */
+			proc_exit(0);		/* done */
+		}
+
+		/*
+		 * Keep restoring as long as there are files to process and we have
+		 * not exceeded wal_keep_files
+		 */
+		if (!WalRestoreNextFile())
+		{
+		(void) WaitLatch(&walrstr->WALRestoreLatch,
+							   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+							   WalRestoreDelay /* ms */);
+		}
+	}
+}
+
+/*
+ * SetNextWALRestoreLogSeg - set the target for next WALrestore cycle
+ *
+ * Only called by Startup process
+ *
+ * Must be called with WALRestoreCommandLock held and must be held at exit,
+ * if the function returns.
+ */
+void
+SetNextWALRestoreLogSeg(TimeLineID tli, uint32 log, uint32 seg)
+{
+	char		xlogfname[MAXFNAMELEN];
+	uint32		newlog = log;
+	uint32		newseg = seg;
+
+	NextLogSeg(newlog, newseg);
+
+	XLogFileName(xlogfname, tli, newlog, newseg);
+	elog(WALRSTR_DEBUG_LEVEL, "requesting restore of %s", xlogfname);
+
+	{
+		/* use volatile pointer to prevent code rearrangement */
+		volatile WalRestoreData *walrstr = WalRstr;
+
+		walrstr->nextFileTli = tli;
+		walrstr->nextFileLog = newlog;
+		walrstr->nextFileSeg = newseg;
+	}
+
+	SetLatch(&WalRstr->WALRestoreLatch);
+}
+
+/*
+ * Run in Startup process to see if next file has arrived. We protect
+ * WalRstr with a LWlock so that the Startup process will wait until
+ * the restore_command succeeds or is cancelled. We set interrupt flags
+ * as if we were running the restore_command ourselves; there is no
+ * difference.
+ *
+ * WALRestoreCommandLock is not held on entry, but will be held at exit.
+ */
+bool
+XLogFileIsNowFullyRestored(TimeLineID tli, uint32 log, uint32 seg)
+{
+	/* use volatile pointer to prevent code rearrangement */
+	volatile WalRestoreData *walrstr = WalRstr;
+	char		xlogfname[MAXFNAMELEN];
+
+	/*
+	 * Issue debug message before we wait for the lock, to allow
+	 * log entries to show interleaving of Startup and WALRestore actions
+	 */
+	XLogFileName(xlogfname, tli, log, seg);
+	elog(WALRSTR_DEBUG_LEVEL,
+			"startup process requests %s from archive", xlogfname);
+
+	LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
+
+	XLogFileName(xlogfname,
+					walrstr->lastFileTli,
+					walrstr->lastFileLog,
+					walrstr->lastFileSeg);
+	elog(WALRSTR_DEBUG_LEVEL,
+			"startup process sees last file was %s", xlogfname);
+
+	if (tli == walrstr->lastFileTli &&
+		log == walrstr->lastFileLog &&
+		seg == walrstr->lastFileSeg)
+		return true;
+
+	return false;
+}
+
+/*
+ * WalRestoreNextFile - returns true if next file was restored
+ *
+ * Broadly follows the logic in XLogFileRead() when called with source of
+ * XLOG_FROM_ARCHIVE, except we have to read the next file from shmem.
+ */
+static bool
+WalRestoreNextFile(void)
+{
+	/* use volatile pointer to prevent code rearrangement */
+	volatile WalRestoreData *walrstr = WalRstr;
+	char		xlogfname[MAXFNAMELEN];
+	char		activitymsg[MAXFNAMELEN + 16];
+	char		path[MAXPGPATH];
+	bool		restoredFromArchive;
+	uint32		nextFileLog;
+	uint32		nextFileSeg;
+	TimeLineID	nextFileTli;
+
+	elog(WALRSTR_DEBUG_LEVEL, "walrestore checking for next file to restore");
+
+	LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
+
+	{
+		/* use volatile pointer to prevent code rearrangement */
+		volatile WalRestoreData *walrstr = WalRstr;
+
+		nextFileTli = walrstr->nextFileTli;
+		nextFileLog = walrstr->nextFileLog;
+		nextFileSeg = walrstr->nextFileSeg;
+	}
+
+	/*
+	 * If we aren't being requested to restore a file exit quickly.
+	 */
+	if (nextFileTli == walrstr->lastFileTli &&
+			nextFileLog == walrstr->lastFileLog &&
+			nextFileSeg == walrstr->lastFileSeg)
+	{
+		LWLockRelease(WALRestoreCommandLock);
+		XLogFileName(xlogfname, nextFileTli, nextFileLog, nextFileSeg);
+		elog(WALRSTR_DEBUG_LEVEL,
+				"restore of %s is already complete, so sleep", xlogfname);
+		return false;
+	}
+
+	XLogFileName(xlogfname, nextFileTli, nextFileLog, nextFileSeg);
+
+	/* Report recovery progress in PS display */
+	snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
+			 xlogfname);
+	set_ps_display(activitymsg, false);
+
+	elog(WALRSTR_DEBUG_LEVEL, "walrestore will restore %s", xlogfname);
+
+	restoredFromArchive = RestoreArchivedFile(path, xlogfname,
+											  "RECOVERYXLOG",
+											  XLogSegSize);
+
+	if (restoredFromArchive)
+	{
+		/* use volatile pointer to prevent code rearrangement */
+		volatile WalRestoreData *walrstr = WalRstr;
+
+		walrstr->lastFileTli = nextFileTli;
+		walrstr->lastFileLog = nextFileLog;
+		walrstr->lastFileSeg = nextFileSeg;
+		walrstr->lastFileRestoreTime = GetCurrentTimestamp();
+	}
+
+	LWLockRelease(WALRestoreCommandLock);
+
+	set_ps_display("", false);
+
+	/*
+	 * Make sure Startup process is active so it can see new file, or
+	 * react to it not being there.
+	 */
+	WakeupRecovery();
+
+	return restoredFromArchive;
+}
+
+void
+SetRecoveryRestoreCommand(char *cmd)
+{
+	if (cmd == NULL)
+		return;
+
+	if (strlen(cmd) <= MAXPGPATH)
+		strcpy(WalRstr->recoveryRestoreCommand, cmd);
+	else
+		elog(FATAL, "recovery_restore_command is too long");
+}
+
+char *
+GetRecoveryRestoreCommand(void)
+{
+	return WalRstr->recoveryRestoreCommand;
+}
+
+/* Report shared memory space needed by WalRestoreShmemInit */
+Size
+WalRestoreShmemSize(void)
+{
+	Size		size = 0;
+
+	size = add_size(size, sizeof(WalRestoreData));
+
+	return size;
+}
+
+/* Allocate and initialize walrestore-related shared memory */
+void
+WalRestoreShmemInit(void)
+{
+	bool		found;
+
+	WalRstr = (WalRestoreData *)
+		ShmemInitStruct("Wal Restore Ctl", WalRestoreShmemSize(), &found);
+
+	if (found)
+		return;
+
+	/* First time through, so initialize */
+	MemSet(WalRstr, 0, WalRestoreShmemSize());
+	InitSharedLatch(&WalRstr->WALRestoreLatch);
+
+}
+
+/* --------------------------------
+ *		signal handler routines
+ * --------------------------------
+ */
+
+/*
+ * walrestore_quickdie() occurs when signalled SIGQUIT by the postmaster.
+ *
+ * Some backend has bought the farm,
+ * so we need to stop what we're doing and exit.
+ */
+static void
+walrestore_quickdie(SIGNAL_ARGS)
+{
+	PG_SETMASK(&BlockSig);
+
+	/*
+	 * We DO NOT want to run proc_exit() callbacks -- we're here because
+	 * shared memory may be corrupted, so we don't want to try to clean up our
+	 * transaction.  Just nail the windows shut and get out of town.  Now that
+	 * there's an atexit callback to prevent third-party code from breaking
+	 * things by calling exit() directly, we have to reset the callbacks
+	 * explicitly to make this work as intended.
+	 */
+	on_exit_reset();
+
+	/*
+	 * Note we do exit(2) not exit(0).	This is to force the postmaster into a
+	 * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
+	 * backend.  This is necessary precisely because we don't clean up our
+	 * shared memory state.  (The "dead man switch" mechanism in pmsignal.c
+	 * should ensure the postmaster sees this as a crash, too, but no harm in
+	 * being doubly sure.)
+	 */
+	exit(2);
+}
+
+/* SIGUSR1: let latch facility handle the signal */
+static void
+WalRestoreProcSigUsr1Handler(SIGNAL_ARGS)
+{
+	int			save_errno = errno;
+
+	latch_sigusr1_handler();
+
+	errno = save_errno;
+}
+
+/* SIGHUP: set flag to re-read config file at next convenient time */
+static void
+WalRestoreSigHupHandler(SIGNAL_ARGS)
+{
+	int			save_errno = errno;
+
+	got_SIGHUP = true;
+	SetLatch(&WalRstr->WALRestoreLatch);
+
+	errno = save_errno;
+}
+
+/* SIGTERM: set flag to shutdown and exit */
+static void
+WalRestoreShutdownHandler(SIGNAL_ARGS)
+{
+	int			save_errno = errno;
+
+	if (in_restore_command)
+	{
+		LWLockRelease(WALRestoreCommandLock);
+		proc_exit(1);
+	}
+	else
+		walrestore_shutdown_requested = true;
+	SetLatch(&WalRstr->WALRestoreLatch);
+
+	errno = save_errno;
+}
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index ef1dc91..8f4443a 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -26,6 +26,7 @@
 #include "postmaster/autovacuum.h"
 #include "postmaster/bgwriter.h"
 #include "postmaster/postmaster.h"
+#include "postmaster/walrestore.h"
 #include "replication/walreceiver.h"
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
@@ -123,6 +124,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
 		size = add_size(size, AutoVacuumShmemSize());
 		size = add_size(size, WalSndShmemSize());
 		size = add_size(size, WalRcvShmemSize());
+		size = add_size(size, WalRestoreShmemSize());
 		size = add_size(size, BTreeShmemSize());
 		size = add_size(size, SyncScanShmemSize());
 		size = add_size(size, AsyncShmemSize());
@@ -228,6 +230,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
 	AutoVacuumShmemInit();
 	WalSndShmemInit();
 	WalRcvShmemInit();
+	WalRestoreShmemInit();
 
 	/*
 	 * Set up other modules that need some shared memory space
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 1ddf4bf..e9e5325 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -270,7 +270,8 @@ extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
 extern int XLogFileInit(uint32 log, uint32 seg,
 			 bool *use_existent, bool use_lock);
 extern int	XLogFileOpen(uint32 log, uint32 seg);
-
+extern bool RestoreArchivedFile(char *path, const char *xlogfname,
+					const char *recovername, off_t expectedSize);
 
 extern void XLogGetLastRemoved(uint32 *log, uint32 *seg);
 extern void XLogSetAsyncXactLSN(XLogRecPtr record);
@@ -316,6 +317,7 @@ extern TimeLineID GetRecoveryTargetTLI(void);
 extern bool CheckPromoteSignal(void);
 extern void WakeupRecovery(void);
 extern Latch *WALWriterLatch(void);
+extern Latch *WALRestoreLatch(void);
 
 /*
  * Starting/stopping a base backup
diff --git a/src/include/bootstrap/bootstrap.h b/src/include/bootstrap/bootstrap.h
index e966a73..b90ce33 100644
--- a/src/include/bootstrap/bootstrap.h
+++ b/src/include/bootstrap/bootstrap.h
@@ -23,6 +23,7 @@ typedef enum
 	StartupProcess,
 	BgWriterProcess,
 	CheckpointerProcess,
+	WalRestoreProcess,
 	WalWriterProcess,
 	WalReceiverProcess,
 
diff --git a/src/include/postmaster/startup.h b/src/include/postmaster/startup.h
index 3ec6950..35d9665 100644
--- a/src/include/postmaster/startup.h
+++ b/src/include/postmaster/startup.h
@@ -12,10 +12,11 @@
 #ifndef _STARTUP_H
 #define _STARTUP_H
 
+extern volatile sig_atomic_t startup_shutdown_requested;
+extern volatile sig_atomic_t in_restore_command;
+
 extern void HandleStartupProcInterrupts(void);
 extern void StartupProcessMain(void);
-extern void PreRestoreCommand(void);
-extern void PostRestoreCommand(void);
 extern bool IsPromoteTriggered(void);
 extern void ResetPromoteTriggered(void);
 
diff --git a/src/include/postmaster/walrestore.h b/src/include/postmaster/walrestore.h
new file mode 100644
index 0000000..98d7830
--- /dev/null
+++ b/src/include/postmaster/walrestore.h
@@ -0,0 +1,72 @@
+/*-------------------------------------------------------------------------
+ *
+ * walrestore.h
+ *	  Exports from postmaster/walrestore.c.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ *
+ * src/include/postmaster/walrestore.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _WALRESTORE_H
+#define _WALRESTORE_H
+
+#include "access/xlog.h"
+#include "access/xlogdefs.h"
+#include "storage/spin.h"
+#include "pgtime.h"
+
+extern volatile sig_atomic_t walrestore_shutdown_requested;
+
+/* GUC options */
+
+extern void WalRestoreMain(void);
+extern bool XLogFileIsNowFullyRestored(TimeLineID tli, uint32 log, uint32 seg);
+extern void SetNextWALRestoreLogSeg(TimeLineID tli, uint32 log, uint32 seg);
+extern void SetRecoveryRestoreCommand(char *cmd);
+extern char *GetRecoveryRestoreCommand(void);
+extern Size WalRestoreShmemSize(void);
+extern void WalRestoreShmemInit(void);
+
+/* Shared memory area for management of walrestore process */
+typedef struct
+{
+	/*
+	 * The identifiers of the last WAL file restored by WALrestore
+	 */
+	TimeLineID	lastFileTli;
+	uint32		lastFileLog;
+	uint32		lastFileSeg;
+
+	/*
+	 * Time of last restore by WALrestore
+	 */
+	TimestampTz lastFileRestoreTime;
+
+	/*
+	 * The next WAL file requested for the WALrestore process to restore
+	 */
+	TimeLineID	nextFileTli;
+	uint32		nextFileLog;
+	uint32		nextFileSeg;
+
+	/*
+	 * All of the above read and set only while holding WALRestoreCommandLock
+	 */
+
+	/*
+	 * WALRestoreLatch is used to wake up the WALRestore to restore WAL files.
+	 */
+	Latch		WALRestoreLatch;
+
+	/*
+	 * recoveryRestoreCommand for use by walrestore; can remove if becomes GUC
+	 * Set once at startup and read-only after that
+	 */
+	char		recoveryRestoreCommand[MAXPGPATH];
+} WalRestoreData;
+
+extern WalRestoreData *WalRstr;
+
+#endif   /* _WALRESTORE_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index df3df29..c316dcc 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -79,6 +79,7 @@ typedef enum LWLockId
 	SerializablePredicateLockListLock,
 	OldSerXidLock,
 	SyncRepLock,
+	WALRestoreCommandLock,
 	/* Individual lock IDs end here */
 	FirstBufMappingLock,
 	FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS,
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 358d1a4..f994b67 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -204,12 +204,21 @@ extern PGPROC *PreparedXactProcs;
 /*
  * We set aside some extra PGPROC structures for auxiliary processes,
  * ie things that aren't full-fledged backends but need shmem access.
+ * Logger, archiver and stats processes don't count towards this total,
+ * nor do WALSender processes.
  *
+ * NUM_AUXILIARY_PROCS must be set to the highest of the requirements for
+ * normal running and recovery.
+ *
+ * During normal running we need slots for:
  * Background writer, checkpointer and WAL writer run during normal operation.
- * Startup process and WAL receiver also consume 2 slots, but WAL writer is
- * launched only after startup has exited, so we only need 4 slots.
+ * 3 slots
+ *
+ * During recovery we need slots for:
+ * Background writer, checkpointer, Startup process, WAL receiver, WAL restore.
+ * 5 slots
  */
-#define NUM_AUXILIARY_PROCS		4
+#define NUM_AUXILIARY_PROCS		5
 
 
 /* configurable options */

Fujii Masao

masao.fujii@gmail.com

almost 14 years ago

In reply to: Simon Riggs (#6)

Re: WAL Restore process during recovery

On Fri, Jan 20, 2012 at 7:38 PM, Simon Riggs <simon@2ndquadrant.com> wrote:

On Fri, Jan 20, 2012 at 3:43 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

Requested update

Thanks! Will review.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

Fujii Masao

masao.fujii@gmail.com

almost 14 years ago

In reply to: Fujii Masao (#7)

Re: WAL Restore process during recovery

On Fri, Jan 20, 2012 at 7:50 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Fri, Jan 20, 2012 at 7:38 PM, Simon Riggs <simon@2ndquadrant.com> wrote:

On Fri, Jan 20, 2012 at 3:43 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

Requested update

Thanks! Will review.

In StartChildProcess(), the code which emits an error when fork of walrestore
fails is required.

In reaper(), the following comment needs to be updated because an unexpected
exit (including FATAL) is treated as a crash in the patch.

/*
* Was it the wal restore? If exit status is zero (normal) or one
* (FATAL exit), we assume everything is all right just like normal
* backends.
*/
if (pid == WalRestorePID)

Why does walrestore need to be invoked even when restore_command is
not specified? It seems to be useless. We invoke walreceiver only when
primary_conninfo is specified now. Similarly we should invoke walrestore
only when restore_command is specified?

When I set up the file-based log-shipping environment using pg_standby,
ran "pgbench -i -s2", waited for walrestore to restore at least one WAL
file, and created the trigger file, then I encounterd the following error in
the standby.

sby LOG: startup process requests 000000010000000000000003 from archive
trigger file found: smart failover
sby LOG: startup process sees last file was 000000010000000000000003
sby FATAL: could not rename file "pg_xlog/RECOVERYXLOG" to
"pg_xlog/000000010000000000000003": No such file or directory
sby LOG: startup process (PID 11079) exited with exit code 1
sby LOG: terminating any other active server processes

When I set up streaming replication with setting restore_command,
I got the following messages repeatedly. The WAL file name was always
"000000000000000000000000".

sby1 LOG: walrestore checking for next file to restore
sby1 LOG: restore of 000000000000000000000000 is already complete, so sleep

In PostmasterStateMachine(), the following comment needs to mention WALRestore.

* PM_WAIT_READONLY state ends when we have no regular backends that
* have been started during recovery. We kill the startup and
* walreceiver processes and transition to PM_WAIT_BACKENDS. Ideally,

In walrestore.c, the following comments seem to be incorrect. At least
an unexpected
exit of WALRestore doesn't start a recovery cycle in the patch.

* If the WAL restore exits unexpectedly, the postmaster treats
that the same
* as a backend crash: shared memory may be corrupted, so remaining backends
* should be killed by SIGQUIT and then a recovery cycle started.

In walrestore.c
+ * Main entry point for walrestore process
+ *
+ * This is invoked from BootstrapMain, which has already created the basic
+ * execution environment, but not enabled signals yet.

BootstrapMain() doesn't exist, and it should be changed to
AuxiliaryProcessMain().
This is not a fault of the patch. There are the same typos in
bgwriter.c, walwriter.c
and checkpointer.c

In walrestore.c
+	 * SIGUSR1 is presently unused; keep it spare in case someday we want this
+	 * process to participate in ProcSignal signalling.

The above comment is incorrect because SIGUSR1 is presently used.

+			/*
+			 * From here on, elog(ERROR) should end with exit(1), not send
+			 * control back to the sigsetjmp block above
+			 */
+			ExitOnAnyError = true;

The above is not required because sigsetjmp is not used in walrestore.c

+			/* Normal exit from the walwriter is here */
+			proc_exit(0);		/* done */

Typo: s/walwriter/walrestore

I've not reviewed the patch enough yet. Will review the patch tomorrow again.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

Simon Riggs

simon@2ndQuadrant.com

almost 14 years ago

In reply to: Fujii Masao (#8)

Re: WAL Restore process during recovery

On Mon, Jan 23, 2012 at 12:23 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

I've not reviewed the patch enough yet. Will review the patch tomorrow again.

Thanks very much. I'm sure that's enough to keep me busy a few days.

--
Simon Riggs http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

#10

Simon Riggs

simon@2ndQuadrant.com

almost 14 years ago

In reply to: Fujii Masao (#8)

1 attachment(s)

Re: WAL Restore process during recovery

On Mon, Jan 23, 2012 at 12:23 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

In StartChildProcess(), the code which emits an error when fork of walrestore
fails is required.

In reaper(), the following comment needs to be updated because an unexpected
exit (including FATAL) is treated as a crash in the patch.

/*
* Was it the wal restore? If exit status is zero (normal) or one
* (FATAL exit), we assume everything is all right just like normal
* backends.
*/
if (pid == WalRestorePID)

Why does walrestore need to be invoked even when restore_command is
not specified? It seems to be useless. We invoke walreceiver only when
primary_conninfo is specified now. Similarly we should invoke walrestore
only when restore_command is specified?

walreceiver is shutdown and restarted in case of failed connection.
That never happens with walrestore because the command is run each
time - when we issue system(3) a new process is forked to run the
command. So there is no specific cleanup to perform and so no reason
for a managed cleanup process.

So I can't see a specific reason to change that. Do you think it makes
a difference?

When I set up the file-based log-shipping environment using pg_standby,
ran "pgbench -i -s2", waited for walrestore to restore at least one WAL
file, and created the trigger file, then I encounterd the following error in
the standby.

sby LOG: startup process requests 000000010000000000000003 from archive
trigger file found: smart failover
sby LOG: startup process sees last file was 000000010000000000000003
sby FATAL: could not rename file "pg_xlog/RECOVERYXLOG" to
"pg_xlog/000000010000000000000003": No such file or directory
sby LOG: startup process (PID 11079) exited with exit code 1
sby LOG: terminating any other active server processes

Will look further.

When I set up streaming replication with setting restore_command,
I got the following messages repeatedly. The WAL file name was always
"000000000000000000000000".

Will look further.

sby1 LOG: walrestore checking for next file to restore
sby1 LOG: restore of 000000000000000000000000 is already complete, so sleep

Will look further.

In PostmasterStateMachine(), the following comment needs to mention WALRestore.

* PM_WAIT_READONLY state ends when we have no regular backends that
* have been started during recovery. We kill the startup and
* walreceiver processes and transition to PM_WAIT_BACKENDS. Ideally,

In walrestore.c, the following comments seem to be incorrect. At least
an unexpected
exit of WALRestore doesn't start a recovery cycle in the patch.

* If the WAL restore exits unexpectedly, the postmaster treats
that the same
* as a backend crash: shared memory may be corrupted, so remaining backends
* should be killed by SIGQUIT and then a recovery cycle started.

Yes it does...

In walrestore.c
+ * Main entry point for walrestore process
+ *
+ * This is invoked from BootstrapMain, which has already created the basic
+ * execution environment, but not enabled signals yet.
BootstrapMain() doesn't exist, and it should be changed to
AuxiliaryProcessMain().
This is not a fault of the patch. There are the same typos in
bgwriter.c, walwriter.c
and checkpointer.c

OK, will fix.

In walrestore.c
+        * SIGUSR1 is presently unused; keep it spare in case someday we want this
+        * process to participate in ProcSignal signalling.

The above comment is incorrect because SIGUSR1 is presently used.

+                       /*
+                        * From here on, elog(ERROR) should end with exit(1), not send
+                        * control back to the sigsetjmp block above
+                        */
+                       ExitOnAnyError = true;

The above is not required because sigsetjmp is not used in walrestore.c

+                       /* Normal exit from the walwriter is here */
+                       proc_exit(0);           /* done */

Typo: s/walwriter/walrestore

Cleaned up the points noted, new patch attached in case you wish to
review further.

Still has bug, so still with me to fix.

--
Simon Riggs http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

Attachments:

walrestore_process.v3.patchtext/x-patch; charset=US-ASCII; name=walrestore_process.v3.patchDownload

diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index ea98cb7..be4bbc7 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -7320,7 +7320,7 @@
 
   <para>
    The <structfield>pid</structfield> column can be joined to the
-   <structfield>pid</structfield> column of the
+   <structfield>procpid</structfield> column of the
    <structname>pg_stat_activity</structname> view to get more
    information on the session holding or waiting to hold each lock.
    Also, if you are using prepared transactions, the
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index e55b503..58e8ede 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -3857,7 +3857,7 @@ local0.*    /var/log/postgresql
          identifier from <literal>pg_stat_activity</>, use this query:
 <programlisting>
 SELECT to_hex(EXTRACT(EPOCH FROM backend_start)::integer) || '.' ||
-       to_hex(pid)
+       to_hex(procpid)
 FROM pg_stat_activity;
 </programlisting>
 
@@ -4153,7 +4153,7 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
        <para>
        Specifies the number of bytes reserved to track the currently
        executing command for each active session, for the
-       <structname>pg_stat_activity</>.<structfield>query</> field.
+       <structname>pg_stat_activity</>.<structfield>current_query</> field.
        The default value is 1024. This parameter can only be set at server
        start.
        </para>
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 48631cc..43b72f6 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -14322,7 +14322,7 @@ SELECT set_config('log_statement_stats', 'off', false);
     send signals (<systemitem>SIGINT</> or <systemitem>SIGTERM</>
     respectively) to backend processes identified by process ID.
     The process ID of an active backend can be found from
-    the <structfield>pid</structfield> column of the
+    the <structfield>procpid</structfield> column of the
     <structname>pg_stat_activity</structname> view, or by listing the
     <command>postgres</command> processes on the server (using
     <application>ps</> on Unix or the <application>Task
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index fef2a35..2259180 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -577,9 +577,17 @@ postgres: <replaceable>user</> <replaceable>database</> <replaceable>host</> <re
      <entry>client_addr</entry>
      <entry><type>inet</></entry>
      <entry>The remote IP of the client connected to the backend.
-      If this field is not set, it indicates that the client is either connected
-      via a Unix socket on the server machine or is an internal process such
-      as autovacuum.
+     If this field is not set, it indicates that the client is either:
+      <itemizedlist spacing="compact" mark="bullet">
+       <listitem>
+        <para>
+         Connected via unix sockets on the server machine
+        </para>
+       </listitem>
+       <listitem>
+        <para>An internal process such as autovacuum</para>
+       </listitem>
+     </itemizedlist>
      </entry>
     </row>
     <row>
@@ -1454,8 +1462,8 @@ postgres: <replaceable>user</> <replaceable>database</> <replaceable>host</> <re
    example, to show the <acronym>PID</>s and current queries of all server processes:
 
 <programlisting>
-SELECT pg_stat_get_backend_pid(s.backendid) AS pid,
-       pg_stat_get_backend_activity(s.backendid) AS query
+SELECT pg_stat_get_backend_pid(s.backendid) AS procpid,
+       pg_stat_get_backend_activity(s.backendid) AS current_query
     FROM (SELECT pg_stat_get_backend_idset() AS backendid) AS s;
 </programlisting>
   </para>
@@ -1662,7 +1670,7 @@ SELECT pg_stat_get_backend_pid(s.backendid) AS pid,
      <entry>statement-status</entry>
      <entry>(const char *)</entry>
      <entry>Probe that fires anytime the server process updates its
-      <structname>pg_stat_activity</>.<structfield>status</>.
+      <structname>pg_stat_activity</>.<structfield>current_query</> status.
       arg0 is the new status string.</entry>
     </row>
     <row>
diff --git a/doc/src/sgml/tcn.sgml b/doc/src/sgml/tcn.sgml
index 53c4637..af830df 100644
--- a/doc/src/sgml/tcn.sgml
+++ b/doc/src/sgml/tcn.sgml
@@ -18,7 +18,7 @@
  </para>
 
  <para>
-  Only one parameter may be supplied to the function in a
+  Only one parameter may be suupplied to the function in a
   <literal>CREATE TRIGGER</> statement, and that is optional.  If supplied
   it will be used for the channel name for the notifications.  If omitted
   <literal>tcn</> will be used for the channel name.
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ce659ec..469e6d6 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -40,6 +40,7 @@
 #include "pgstat.h"
 #include "postmaster/bgwriter.h"
 #include "postmaster/startup.h"
+#include "postmaster/walrestore.h"
 #include "replication/walreceiver.h"
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
@@ -187,7 +188,6 @@ static bool InArchiveRecovery = false;
 static bool restoredFromArchive = false;
 
 /* options taken from recovery.conf for archive recovery */
-static char *recoveryRestoreCommand = NULL;
 static char *recoveryEndCommand = NULL;
 static char *archiveCleanupCommand = NULL;
 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
@@ -575,8 +575,8 @@ bool reachedConsistency = false;
 
 static bool InRedo = false;
 
-/* Have we launched bgwriter during recovery? */
-static bool bgwriterLaunched = false;
+/* Have we launched background procs during archive recovery yet? */
+static bool ArchRecoveryBgProcsActive = false;
 
 /*
  * Information logged when we detect a change in one of the parameters
@@ -632,8 +632,6 @@ static bool XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
 			 bool randAccess);
 static int	emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 static void XLogFileClose(void);
-static bool RestoreArchivedFile(char *path, const char *xlogfname,
-					const char *recovername, off_t expectedSize);
 static void ExecuteRecoveryCommand(char *command, char *commandName,
 					   bool failOnerror);
 static void PreallocXlogFiles(XLogRecPtr endptr);
@@ -2706,19 +2704,47 @@ XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
 
 	XLogFileName(xlogfname, tli, log, seg);
 
+#define TMPRECOVERYXLOG	"RECOVERYXLOG"
+
 	switch (source)
 	{
 		case XLOG_FROM_ARCHIVE:
+			/*
+			 * Check to see if the WALRestore process has already put the
+			 * next file in place while we were working. If so, use that.
+			 * If not, get it ourselves. This makes it easier to handle
+			 * initial state before the WALRestore is active, and also
+			 * handles the stop/start logic correctly when we have both
+			 * streaming and file based replication active.
+			 *
+			 * We queue up the next task for WALRestore after we've begun to
+			 * use this file later in XLogFileRead().
+			 *
+			 * If the WALRestore process is still active, the lock wait makes
+			 * us wait, which is just like we were executing the command
+			 * ourselves and so doesn't alter the logic elsewhere.
+			 */
+			if (XLogFileIsNowFullyRestored(tli, log, seg))
+			{
+				snprintf(path, MAXPGPATH, XLOGDIR "/%s", TMPRECOVERYXLOG);
+				restoredFromArchive = true;
+				break;
+			}
+
 			/* Report recovery progress in PS display */
 			snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
 					 xlogfname);
 			set_ps_display(activitymsg, false);
 
 			restoredFromArchive = RestoreArchivedFile(path, xlogfname,
-													  "RECOVERYXLOG",
+													  TMPRECOVERYXLOG,
 													  XLogSegSize);
+
 			if (!restoredFromArchive)
+			{
+				LWLockRelease(WALRestoreCommandLock);
 				return -1;
+			}
 			break;
 
 		case XLOG_FROM_PG_XLOG:
@@ -2748,18 +2774,42 @@ XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
 		if (stat(xlogfpath, &statbuf) == 0)
 		{
 			if (unlink(xlogfpath) != 0)
+			{
+				LWLockRelease(WALRestoreCommandLock);
 				ereport(FATAL,
 						(errcode_for_file_access(),
 						 errmsg("could not remove file \"%s\": %m",
 								xlogfpath)));
+			}
 			reload = true;
 		}
 
 		if (rename(path, xlogfpath) < 0)
+		{
+			LWLockRelease(WALRestoreCommandLock);
 			ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not rename file \"%s\" to \"%s\": %m",
 						path, xlogfpath)));
+		}
+
+		/*
+		 * Make sure we recover from the new filename, so we can reuse the
+		 * temporary filename for asynchronous restore actions.
+		 */
+		strcpy(path, xlogfpath);
+
+		/*
+		 * Tell the WALRestore process to get the next file now.
+		 * Hopefully it will be ready for use in time for the next call the
+		 * Startup process makes to XLogFileRead().
+		 *
+		 * It might seem like we should do that earlier but then there is a
+		 * race condition that might lead to replacing RECOVERYXLOG with
+		 * another file before we've copied it.
+		 */
+		SetNextWALRestoreLogSeg(tli, log, seg);
+		LWLockRelease(WALRestoreCommandLock);
 
 		/*
 		 * If the existing segment was replaced, since walsenders might have
@@ -2911,8 +2961,11 @@ XLogFileClose(void)
  * For fixed-size files, the caller may pass the expected size as an
  * additional crosscheck on successful recovery.  If the file size is not
  * known, set expectedSize = 0.
+ *
+ * Must be called with WALRestoreCommandLock held and must be held at exit,
+ * if the function returns.
  */
-static bool
+bool
 RestoreArchivedFile(char *path, const char *xlogfname,
 					const char *recovername, off_t expectedSize)
 {
@@ -2929,7 +2982,7 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	uint32		restartSeg;
 
 	/* In standby mode, restore_command might not be supplied */
-	if (recoveryRestoreCommand == NULL)
+	if (strlen(GetRecoveryRestoreCommand()) == 0)
 		goto not_available;
 
 	/*
@@ -2963,18 +3016,24 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	if (stat(xlogpath, &stat_buf) != 0)
 	{
 		if (errno != ENOENT)
+		{
+			LWLockRelease(WALRestoreCommandLock);
 			ereport(FATAL,
 					(errcode_for_file_access(),
 					 errmsg("could not stat file \"%s\": %m",
 							xlogpath)));
+		}
 	}
 	else
 	{
 		if (unlink(xlogpath) != 0)
+		{
+			LWLockRelease(WALRestoreCommandLock);
 			ereport(FATAL,
 					(errcode_for_file_access(),
 					 errmsg("could not remove file \"%s\": %m",
 							xlogpath)));
+		}
 	}
 
 	/*
@@ -3013,7 +3072,7 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	endp = xlogRestoreCmd + MAXPGPATH - 1;
 	*endp = '\0';
 
-	for (sp = recoveryRestoreCommand; *sp; sp++)
+	for (sp = GetRecoveryRestoreCommand(); *sp; sp++)
 	{
 		if (*sp == '%')
 		{
@@ -3059,21 +3118,29 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	}
 	*dp = '\0';
 
-	ereport(DEBUG3,
+	ereport(DEBUG2,
 			(errmsg_internal("executing restore command \"%s\"",
 							 xlogRestoreCmd)));
 
 	/*
-	 * Check signals before restore command and reset afterwards.
+	 * Set in_restore_command to tell the signal handler that we should exit
+	 * right away on SIGTERM. We know that we're at a safe point to do that.
+	 * Check if we had already received the signal, so that we don't miss a
+	 * shutdown request received just before this.
 	 */
-	PreRestoreCommand();
+	in_restore_command = true;
+	if (startup_shutdown_requested || walrestore_shutdown_requested)
+	{
+		LWLockRelease(WALRestoreCommandLock);
+		proc_exit(1);
+	}
 
 	/*
 	 * Copy xlog from archival storage to XLOGDIR
 	 */
 	rc = system(xlogRestoreCmd);
 
-	PostRestoreCommand();
+	in_restore_command = false;
 
 	if (rc == 0)
 	{
@@ -3102,7 +3169,10 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 				if (StandbyMode && stat_buf.st_size < expectedSize)
 					elevel = DEBUG1;
 				else
+				{
+					LWLockRelease(WALRestoreCommandLock);
 					elevel = FATAL;
+				}
 				ereport(elevel,
 						(errmsg("archive file \"%s\" has wrong size: %lu instead of %lu",
 								xlogfname,
@@ -3123,10 +3193,13 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 		{
 			/* stat failed */
 			if (errno != ENOENT)
+			{
+				LWLockRelease(WALRestoreCommandLock);
 				ereport(FATAL,
 						(errcode_for_file_access(),
 						 errmsg("could not stat file \"%s\": %m",
 								xlogpath)));
+			}
 		}
 	}
 
@@ -3158,10 +3231,18 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 	 * too.
 	 */
 	if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
+	{
+		LWLockRelease(WALRestoreCommandLock);
 		proc_exit(1);
+	}
 
 	signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
 
+	/*
+	 * If signaled we will immediately issue a FATAL error so drop the lock
+	 */
+	if (signaled)
+		LWLockRelease(WALRestoreCommandLock);
 	ereport(signaled ? FATAL : DEBUG2,
 		(errmsg("could not restore file \"%s\" from archive: return code %d",
 				xlogfname, rc)));
@@ -4203,7 +4284,9 @@ readTimeLineHistory(TimeLineID targetTLI)
 	if (InArchiveRecovery)
 	{
 		TLHistoryFileName(histfname, targetTLI);
+		LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
 		RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
+		LWLockRelease(WALRestoreCommandLock);
 	}
 	else
 		TLHistoryFilePath(path, targetTLI);
@@ -4292,7 +4375,9 @@ existsTimeLineHistory(TimeLineID probeTLI)
 	if (InArchiveRecovery)
 	{
 		TLHistoryFileName(histfname, probeTLI);
+		LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
 		RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
+		LWLockRelease(WALRestoreCommandLock);
 	}
 	else
 		TLHistoryFilePath(path, probeTLI);
@@ -4453,7 +4538,9 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 	if (InArchiveRecovery)
 	{
 		TLHistoryFileName(histfname, parentTLI);
+		LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
 		RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
+		LWLockRelease(WALRestoreCommandLock);
 	}
 	else
 		TLHistoryFilePath(path, parentTLI);
@@ -5299,10 +5386,10 @@ readRecoveryCommandFile(void)
 	{
 		if (strcmp(item->name, "restore_command") == 0)
 		{
-			recoveryRestoreCommand = pstrdup(item->value);
+			SetRecoveryRestoreCommand(pstrdup(item->value));
 			ereport(DEBUG2,
 					(errmsg_internal("restore_command = '%s'",
-									 recoveryRestoreCommand)));
+									 GetRecoveryRestoreCommand())));
 		}
 		else if (strcmp(item->name, "recovery_end_command") == 0)
 		{
@@ -5455,7 +5542,7 @@ readRecoveryCommandFile(void)
 	 */
 	if (StandbyMode)
 	{
-		if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
+		if (PrimaryConnInfo == NULL && strlen(GetRecoveryRestoreCommand()) == 0)
 			ereport(WARNING,
 					(errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
 							RECOVERY_COMMAND_FILE),
@@ -5463,7 +5550,7 @@ readRecoveryCommandFile(void)
 	}
 	else
 	{
-		if (recoveryRestoreCommand == NULL)
+		if (strlen(GetRecoveryRestoreCommand()) == 0)
 			ereport(FATAL,
 					(errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
 							RECOVERY_COMMAND_FILE)));
@@ -6432,7 +6519,7 @@ StartupXLOG(void)
 			PublishStartupProcessInformation();
 			SetForwardFsyncRequests();
 			SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
-			bgwriterLaunched = true;
+			ArchRecoveryBgProcsActive = true;
 		}
 
 		/*
@@ -6795,7 +6882,7 @@ StartupXLOG(void)
 		 * the rule that TLI only changes in shutdown checkpoints, which
 		 * allows some extra error checking in xlog_redo.
 		 */
-		if (bgwriterLaunched)
+		if (ArchRecoveryBgProcsActive)
 			RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
 							  CHECKPOINT_IMMEDIATE |
 							  CHECKPOINT_WAIT);
@@ -9640,7 +9727,7 @@ XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
 		 * Request a restartpoint if we've replayed too much
 		 * xlog since the last one.
 		 */
-		if (StandbyMode && bgwriterLaunched)
+		if (StandbyMode && ArchRecoveryBgProcsActive)
 		{
 			if (XLogCheckpointNeeded(readId, readSeg))
 			{
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index e3ae92d..81a8cb3 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -30,6 +30,7 @@
 #include "nodes/makefuncs.h"
 #include "postmaster/bgwriter.h"
 #include "postmaster/startup.h"
+#include "postmaster/walrestore.h"
 #include "postmaster/walwriter.h"
 #include "replication/walreceiver.h"
 #include "storage/bufmgr.h"
@@ -319,6 +320,9 @@ AuxiliaryProcessMain(int argc, char *argv[])
 			case CheckpointerProcess:
 				statmsg = "checkpointer process";
 				break;
+			case WalRestoreProcess:
+				statmsg = "wal restore process";
+				break;
 			case WalWriterProcess:
 				statmsg = "wal writer process";
 				break;
@@ -424,6 +428,11 @@ AuxiliaryProcessMain(int argc, char *argv[])
 			CheckpointerMain();
 			proc_exit(1);		/* should never return */
 
+		case WalRestoreProcess:
+			/* don't set signals, wal restore has its own agenda */
+			WalRestoreMain();
+			proc_exit(1);		/* should never return */
+
 		case WalWriterProcess:
 			/* don't set signals, walwriter has its own agenda */
 			InitXLOGAccess();
diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile
index 3056b09..349e722 100644
--- a/src/backend/postmaster/Makefile
+++ b/src/backend/postmaster/Makefile
@@ -13,6 +13,6 @@ top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
 OBJS = autovacuum.o bgwriter.o fork_process.o pgarch.o pgstat.o postmaster.o \
-	startup.o syslogger.o walwriter.o checkpointer.o
+	startup.o syslogger.o walrestore.o walwriter.o checkpointer.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index ad0c17a..ce0c80a 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -210,6 +210,7 @@ static pid_t StartupPID = 0,
 			BgWriterPID = 0,
 			CheckpointerPID = 0,
 			WalWriterPID = 0,
+			WalRestorePID = 0,
 			WalReceiverPID = 0,
 			AutoVacPID = 0,
 			PgArchPID = 0,
@@ -470,6 +471,7 @@ static void ShmemBackendArrayRemove(Backend *bn);
 #define StartCheckpointer()		StartChildProcess(CheckpointerProcess)
 #define StartWalWriter()		StartChildProcess(WalWriterProcess)
 #define StartWalReceiver()		StartChildProcess(WalReceiverProcess)
+#define StartWalRestore()		StartChildProcess(WalRestoreProcess)
 
 /* Macros to check exit status of a child process */
 #define EXIT_STATUS_0(st)  ((st) == 0)
@@ -2060,6 +2062,8 @@ SIGHUP_handler(SIGNAL_ARGS)
 			signal_child(WalWriterPID, SIGHUP);
 		if (WalReceiverPID != 0)
 			signal_child(WalReceiverPID, SIGHUP);
+		if (WalRestorePID != 0)
+			signal_child(WalRestorePID, SIGHUP);
 		if (AutoVacPID != 0)
 			signal_child(AutoVacPID, SIGHUP);
 		if (PgArchPID != 0)
@@ -2170,6 +2174,8 @@ pmdie(SIGNAL_ARGS)
 				signal_child(StartupPID, SIGTERM);
 			if (WalReceiverPID != 0)
 				signal_child(WalReceiverPID, SIGTERM);
+			if (WalRestorePID != 0)
+				signal_child(WalRestorePID, SIGTERM);
 			if (BgWriterPID != 0)
 				signal_child(BgWriterPID, SIGTERM);
 			if (pmState == PM_RECOVERY)
@@ -2225,6 +2231,8 @@ pmdie(SIGNAL_ARGS)
 				signal_child(WalWriterPID, SIGQUIT);
 			if (WalReceiverPID != 0)
 				signal_child(WalReceiverPID, SIGQUIT);
+			if (WalRestorePID != 0)
+				signal_child(WalRestorePID, SIGQUIT);
 			if (AutoVacPID != 0)
 				signal_child(AutoVacPID, SIGQUIT);
 			if (PgArchPID != 0)
@@ -2331,6 +2339,12 @@ reaper(SIGNAL_ARGS)
 			pmState = PM_RUN;
 
 			/*
+			 * Shutdown the WALRestore process
+			 */
+			if (WalRestorePID != 0)
+				signal_child(WalRestorePID, SIGTERM);
+
+			/*
 			 * Kill any walsenders to force the downstream standby(s) to
 			 * reread the timeline history file, adjust their timelines and
 			 * establish replication connections again. This is required
@@ -2477,6 +2491,28 @@ reaper(SIGNAL_ARGS)
 		}
 
 		/*
+		 * Was it the wal restore?
+		 */
+		if (pid == WalRestorePID)
+		{
+			if (pmState >= PM_RUN)
+			{
+				WalRestorePID = 0;
+				continue;
+			}
+
+			/*
+			 * Any unexpected exit (including FATAL exit) of the WALRestore
+			 * process is treated as a crash, except that we don't want to
+			 * reinitialize because availability is important.
+			 */
+			RecoveryError = true;
+			HandleChildCrash(pid, exitstatus,
+							 _("walrestore process"));
+			continue;
+		}
+
+		/*
 		 * Was it the autovacuum launcher?	Normal exit can be ignored; we'll
 		 * start a new one at the next iteration of the postmaster's main
 		 * loop, if necessary.	Any other exit condition is treated as a
@@ -2756,6 +2792,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 		signal_child(WalReceiverPID, (SendStop ? SIGSTOP : SIGQUIT));
 	}
 
+	/* Take care of the walrestore too */
+	if (pid == WalRestorePID)
+		WalRestorePID = 0;
+	else if (WalRestorePID != 0 && !FatalError)
+	{
+		ereport(DEBUG2,
+				(errmsg_internal("sending %s to process %d",
+								 (SendStop ? "SIGSTOP" : "SIGQUIT"),
+								 (int) WalRestorePID)));
+		signal_child(WalRestorePID, (SendStop ? SIGSTOP : SIGQUIT));
+	}
+
 	/* Take care of the autovacuum launcher too */
 	if (pid == AutoVacPID)
 		AutoVacPID = 0;
@@ -2904,7 +2952,7 @@ PostmasterStateMachine(void)
 	{
 		/*
 		 * PM_WAIT_READONLY state ends when we have no regular backends that
-		 * have been started during recovery.  We kill the startup and
+		 * have been started during recovery.  We kill the startup, walrestore and
 		 * walreceiver processes and transition to PM_WAIT_BACKENDS.  Ideally,
 		 * we might like to kill these processes first and then wait for
 		 * backends to die off, but that doesn't work at present because
@@ -2916,6 +2964,8 @@ PostmasterStateMachine(void)
 				signal_child(StartupPID, SIGTERM);
 			if (WalReceiverPID != 0)
 				signal_child(WalReceiverPID, SIGTERM);
+			if (WalRestorePID != 0)
+				signal_child(WalRestorePID, SIGTERM);
 			pmState = PM_WAIT_BACKENDS;
 		}
 	}
@@ -2940,6 +2990,7 @@ PostmasterStateMachine(void)
 		if (CountChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC) == 0 &&
 			StartupPID == 0 &&
 			WalReceiverPID == 0 &&
+			WalRestorePID == 0 &&
 			BgWriterPID == 0 &&
 			(CheckpointerPID == 0 || !FatalError) &&
 			WalWriterPID == 0 &&
@@ -3005,11 +3056,11 @@ PostmasterStateMachine(void)
 		 * left by now anyway; what we're really waiting for is walsenders and
 		 * archiver.
 		 *
-		 * Walreceiver should normally be dead by now, but not when a fast
-		 * shutdown is performed during recovery.
+		 * Walreceiver and Walrestore should normally be dead by now, but not
+		 * when a fast shutdown is performed during recovery.
 		 */
 		if (PgArchPID == 0 && CountChildren(BACKEND_TYPE_ALL) == 0 &&
-			WalReceiverPID == 0)
+			WalReceiverPID == 0 && WalRestorePID == 0)
 		{
 			pmState = PM_WAIT_DEAD_END;
 		}
@@ -3036,6 +3087,7 @@ PostmasterStateMachine(void)
 			/* These other guys should be dead already */
 			Assert(StartupPID == 0);
 			Assert(WalReceiverPID == 0);
+			Assert(WalRestorePID == 0);
 			Assert(BgWriterPID == 0);
 			Assert(CheckpointerPID == 0);
 			Assert(WalWriterPID == 0);
@@ -4219,6 +4271,8 @@ sigusr1_handler(SIGNAL_ARGS)
 		BgWriterPID = StartBackgroundWriter();
 		Assert(CheckpointerPID == 0);
 		CheckpointerPID = StartCheckpointer();
+		Assert(WalRestorePID == 0);
+		WalRestorePID = StartWalRestore();
 
 		pmState = PM_RECOVERY;
 	}
@@ -4513,6 +4567,10 @@ StartChildProcess(AuxProcType type)
 				ereport(LOG,
 						(errmsg("could not fork WAL writer process: %m")));
 				break;
+			case WalRestoreProcess:
+				ereport(LOG,
+						(errmsg("could not fork WAL restore process: %m")));
+				break;
 			case WalReceiverProcess:
 				ereport(LOG,
 						(errmsg("could not fork WAL receiver process: %m")));
diff --git a/src/backend/postmaster/startup.c b/src/backend/postmaster/startup.c
index ed75d09..1791feb 100644
--- a/src/backend/postmaster/startup.c
+++ b/src/backend/postmaster/startup.c
@@ -35,14 +35,14 @@
  * Flags set by interrupt handlers for later service in the redo loop.
  */
 static volatile sig_atomic_t got_SIGHUP = false;
-static volatile sig_atomic_t shutdown_requested = false;
 static volatile sig_atomic_t promote_triggered = false;
+volatile sig_atomic_t startup_shutdown_requested = false;
 
 /*
  * Flag set when executing a restore command, to tell SIGTERM signal handler
  * that it's safe to just proc_exit.
  */
-static volatile sig_atomic_t in_restore_command = false;
+volatile sig_atomic_t in_restore_command = false;
 
 /* Signal handlers */
 static void startupproc_quickdie(SIGNAL_ARGS);
@@ -131,9 +131,16 @@ StartupProcShutdownHandler(SIGNAL_ARGS)
 	int			save_errno = errno;
 
 	if (in_restore_command)
+	{
+		/*
+		 * See RestoreArchivedFile() for explanation of why this
+		 * lock is always held when in_restore_command is true.
+		 */
+		LWLockRelease(WALRestoreCommandLock);
 		proc_exit(1);
+	}
 	else
-		shutdown_requested = true;
+		startup_shutdown_requested = true;
 	WakeupRecovery();
 
 	errno = save_errno;
@@ -155,7 +162,7 @@ HandleStartupProcInterrupts(void)
 	/*
 	 * Check if we were requested to exit without finishing recovery.
 	 */
-	if (shutdown_requested)
+	if (startup_shutdown_requested)
 		proc_exit(1);
 
 	/*
@@ -226,26 +233,6 @@ StartupProcessMain(void)
 	proc_exit(0);
 }
 
-void
-PreRestoreCommand(void)
-{
-	/*
-	 * Set in_restore_command to tell the signal handler that we should exit
-	 * right away on SIGTERM. We know that we're at a safe point to do that.
-	 * Check if we had already received the signal, so that we don't miss a
-	 * shutdown request received just before this.
-	 */
-	in_restore_command = true;
-	if (shutdown_requested)
-		proc_exit(1);
-}
-
-void
-PostRestoreCommand(void)
-{
-	in_restore_command = false;
-}
-
 bool
 IsPromoteTriggered(void)
 {
diff --git a/src/backend/postmaster/walrestore.c b/src/backend/postmaster/walrestore.c
new file mode 100644
index 0000000..8d01232
--- /dev/null
+++ b/src/backend/postmaster/walrestore.c
@@ -0,0 +1,469 @@
+/*-------------------------------------------------------------------------
+ *
+ * walrestore.c
+ *
+ * The WAL restore process is new as of Postgres 9.2, though the work it performs
+ * has been handled by the startup process from Postgres 8.0 until 9.1.
+ *
+ * WALRestore process executes the restore_command. If not set, it sleeps.
+ * The startup process no longer executes the restore_command and knows
+ * little about where the WAL files have come from.
+ *
+ * The WAL restore process is started by the postmaster when we enter
+ * PM_RECOVERY state and exits immediately after startup finishes.
+ * It remains alive until the postmaster commands it to terminate.
+ * Normal termination is by SIGTERM, which instructs restore process to exit(0).
+ * Like any backend, restore process will simply abort and exit on SIGQUIT.
+ *
+ * Note that the WAL restore process only executes the restore_command.
+ * The archive_cleanup_command is exeuted by the checkpointer, while the
+ * recovery_end_command and requests for history files are executed by the
+ * startup process. That is not important to the way those commands execute.
+ * All processes that use the restore_command must hold WALRestoreCommandLock
+ * before they execute it, since we definitely wish to avoid trying to get the
+ * same file more than once concurrently, plus we can't assume that the
+ * user has specified command that would succeed if run concurrently.
+ *
+ * If the WAL restore exits unexpectedly, the postmaster treats that the same
+ * as a backend crash: shared memory may be corrupted, so remaining backends
+ * should be killed by SIGQUIT and then a recovery cycle started.
+ *
+ *
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/postmaster/walrestore.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <sys/time.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "access/xlog_internal.h"
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/startup.h"
+#include "postmaster/walrestore.h"
+#include "storage/bufmgr.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/pmsignal.h"
+#include "storage/shmem.h"
+#include "utils/guc.h"
+#include "utils/ps_status.h"
+#include "utils/timestamp.h"
+
+/* XXX Set to DEBUG4 prior to patch commit */
+#define WALRSTR_DEBUG_LEVEL 		LOG
+
+/*
+ * GUC parameters
+ */
+int	WalRestoreDelay = 10000;
+
+WalRestoreData *WalRstr = NULL;
+
+/*
+ * Flags set by interrupt handlers for later service in the main loop.
+ */
+static volatile sig_atomic_t got_SIGHUP = false;
+volatile sig_atomic_t walrestore_shutdown_requested = false;
+
+/* Prototypes for private functions */
+
+static bool WalRestoreNextFile(void);
+
+/* Signal handlers */
+
+static void walrestore_quickdie(SIGNAL_ARGS);
+static void WalRestoreProcSigUsr1Handler(SIGNAL_ARGS);
+static void WalRestoreSigHupHandler(SIGNAL_ARGS);
+static void WalRestoreShutdownHandler(SIGNAL_ARGS);
+
+
+/*
+ * Main entry point for walrestore process
+ *
+ * This is invoked from BootstrapMain, which has already created the basic
+ * execution environment, but not enabled signals yet.
+ */
+void
+WalRestoreMain(void)
+{
+	/* use volatile pointer to prevent code rearrangement */
+	volatile WalRestoreData *walrstr = WalRstr;
+
+	/*
+	 * WalRstr should be set up already (if we are a backend, we inherit this
+	 * by fork() or EXEC_BACKEND mechanism from the postmaster).
+	 */
+	Assert(walrstr != NULL);
+
+	InitLatch(&walrstr->WALRestoreLatch); /* initialize latch used in main loop */
+
+	/*
+	 * If possible, make this process a group leader, so that the postmaster
+	 * can signal any child processes too.
+	 */
+#ifdef HAVE_SETSID
+	if (setsid() < 0)
+		elog(FATAL, "setsid() failed: %m");
+#endif
+
+	/*
+	 * Properly accept or ignore signals the postmaster might send us
+	 *
+	 * SIGUSR1 is presently unused; keep it spare in case someday we want this
+	 * process to participate in ProcSignal signalling.
+	 */
+	pqsignal(SIGHUP, WalRestoreSigHupHandler);	/* set flag to read config file */
+	pqsignal(SIGINT, SIG_IGN);
+	pqsignal(SIGTERM, WalRestoreShutdownHandler); 	/* shutdown */
+	pqsignal(SIGQUIT, walrestore_quickdie);		/* hard crash time */
+	pqsignal(SIGALRM, SIG_IGN);
+	pqsignal(SIGPIPE, SIG_IGN);
+	pqsignal(SIGUSR1, WalRestoreProcSigUsr1Handler);	/* reserve for ProcSignal */
+	pqsignal(SIGUSR2, SIG_IGN);
+
+	/*
+	 * Reset some signals that are accepted by postmaster but not here
+	 */
+	pqsignal(SIGCHLD, SIG_DFL);
+	pqsignal(SIGTTIN, SIG_DFL);
+	pqsignal(SIGTTOU, SIG_DFL);
+	pqsignal(SIGCONT, SIG_DFL);
+	pqsignal(SIGWINCH, SIG_DFL);
+
+	/* We allow SIGQUIT (quickdie) at all times */
+	sigdelset(&BlockSig, SIGQUIT);
+
+	/*
+	 * Unblock signals (they were blocked when the postmaster forked us)
+	 */
+	PG_SETMASK(&UnBlockSig);
+
+	/*
+	 * Loop forever
+	 */
+	for (;;)
+	{
+		ResetLatch(&walrstr->WALRestoreLatch);
+
+		/*
+		 * Emergency bailout if postmaster has died.  This is to avoid the
+		 * necessity for manual cleanup of all postmaster children.
+		 */
+		if (!PostmasterIsAlive())
+			exit(1);
+
+		if (got_SIGHUP)
+		{
+			got_SIGHUP = false;
+			ProcessConfigFile(PGC_SIGHUP);
+		}
+
+		if (walrestore_shutdown_requested)
+		{
+			/* Normal exit from the walrestore is here */
+			proc_exit(0);		/* done */
+		}
+
+		/*
+		 * Keep restoring as long as there are files to process and we have
+		 * not exceeded wal_keep_files
+		 */
+		if (!WalRestoreNextFile())
+		{
+		(void) WaitLatch(&walrstr->WALRestoreLatch,
+							   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
+							   WalRestoreDelay /* ms */);
+		}
+	}
+}
+
+/*
+ * SetNextWALRestoreLogSeg - set the target for next WALrestore cycle
+ *
+ * Only called by Startup process
+ *
+ * Must be called with WALRestoreCommandLock held and must be held at exit,
+ * if the function returns.
+ */
+void
+SetNextWALRestoreLogSeg(TimeLineID tli, uint32 log, uint32 seg)
+{
+	char		xlogfname[MAXFNAMELEN];
+	uint32		newlog = log;
+	uint32		newseg = seg;
+
+	NextLogSeg(newlog, newseg);
+
+	XLogFileName(xlogfname, tli, newlog, newseg);
+	elog(WALRSTR_DEBUG_LEVEL, "requesting restore of %s", xlogfname);
+
+	{
+		/* use volatile pointer to prevent code rearrangement */
+		volatile WalRestoreData *walrstr = WalRstr;
+
+		walrstr->nextFileTli = tli;
+		walrstr->nextFileLog = newlog;
+		walrstr->nextFileSeg = newseg;
+	}
+
+	SetLatch(&WalRstr->WALRestoreLatch);
+}
+
+/*
+ * Run in Startup process to see if next file has arrived. We protect
+ * WalRstr with a LWlock so that the Startup process will wait until
+ * the restore_command succeeds or is cancelled. We set interrupt flags
+ * as if we were running the restore_command ourselves; there is no
+ * difference.
+ *
+ * WALRestoreCommandLock is not held on entry, but will be held at exit.
+ */
+bool
+XLogFileIsNowFullyRestored(TimeLineID tli, uint32 log, uint32 seg)
+{
+	/* use volatile pointer to prevent code rearrangement */
+	volatile WalRestoreData *walrstr = WalRstr;
+	char		xlogfname[MAXFNAMELEN];
+
+	/*
+	 * Issue debug message before we wait for the lock, to allow
+	 * log entries to show interleaving of Startup and WALRestore actions
+	 */
+	XLogFileName(xlogfname, tli, log, seg);
+	elog(WALRSTR_DEBUG_LEVEL,
+			"startup process requests %s from archive", xlogfname);
+
+	LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
+
+	XLogFileName(xlogfname,
+					walrstr->lastFileTli,
+					walrstr->lastFileLog,
+					walrstr->lastFileSeg);
+	elog(WALRSTR_DEBUG_LEVEL,
+			"startup process sees last file was %s", xlogfname);
+
+	if (tli == walrstr->lastFileTli &&
+		log == walrstr->lastFileLog &&
+		seg == walrstr->lastFileSeg)
+		return true;
+
+	return false;
+}
+
+/*
+ * WalRestoreNextFile - returns true if next file was restored
+ *
+ * Broadly follows the logic in XLogFileRead() when called with source of
+ * XLOG_FROM_ARCHIVE, except we have to read the next file from shmem.
+ */
+static bool
+WalRestoreNextFile(void)
+{
+	/* use volatile pointer to prevent code rearrangement */
+	volatile WalRestoreData *walrstr = WalRstr;
+	char		xlogfname[MAXFNAMELEN];
+	char		activitymsg[MAXFNAMELEN + 16];
+	char		path[MAXPGPATH];
+	bool		restoredFromArchive;
+	uint32		nextFileLog;
+	uint32		nextFileSeg;
+	TimeLineID	nextFileTli;
+
+	elog(WALRSTR_DEBUG_LEVEL, "walrestore checking for next file to restore");
+
+	LWLockAcquire(WALRestoreCommandLock, LW_EXCLUSIVE);
+
+	{
+		/* use volatile pointer to prevent code rearrangement */
+		volatile WalRestoreData *walrstr = WalRstr;
+
+		nextFileTli = walrstr->nextFileTli;
+		nextFileLog = walrstr->nextFileLog;
+		nextFileSeg = walrstr->nextFileSeg;
+	}
+
+	/*
+	 * If we aren't being requested to restore a file exit quickly.
+	 */
+	if (nextFileTli == walrstr->lastFileTli &&
+			nextFileLog == walrstr->lastFileLog &&
+			nextFileSeg == walrstr->lastFileSeg)
+	{
+		LWLockRelease(WALRestoreCommandLock);
+		XLogFileName(xlogfname, nextFileTli, nextFileLog, nextFileSeg);
+		elog(WALRSTR_DEBUG_LEVEL,
+				"restore of %s is already complete, so sleep", xlogfname);
+		return false;
+	}
+
+	XLogFileName(xlogfname, nextFileTli, nextFileLog, nextFileSeg);
+
+	/* Report recovery progress in PS display */
+	snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
+			 xlogfname);
+	set_ps_display(activitymsg, false);
+
+	elog(WALRSTR_DEBUG_LEVEL, "walrestore will restore %s", xlogfname);
+
+	restoredFromArchive = RestoreArchivedFile(path, xlogfname,
+											  "RECOVERYXLOG",
+											  XLogSegSize);
+
+	if (restoredFromArchive)
+	{
+		/* use volatile pointer to prevent code rearrangement */
+		volatile WalRestoreData *walrstr = WalRstr;
+
+		walrstr->lastFileTli = nextFileTli;
+		walrstr->lastFileLog = nextFileLog;
+		walrstr->lastFileSeg = nextFileSeg;
+		walrstr->lastFileRestoreTime = GetCurrentTimestamp();
+	}
+
+	LWLockRelease(WALRestoreCommandLock);
+
+	set_ps_display("", false);
+
+	/*
+	 * Make sure Startup process is active so it can see new file, or
+	 * react to it not being there.
+	 */
+	WakeupRecovery();
+
+	return restoredFromArchive;
+}
+
+void
+SetRecoveryRestoreCommand(char *cmd)
+{
+	if (cmd == NULL)
+		return;
+
+	if (strlen(cmd) <= MAXPGPATH)
+		strcpy(WalRstr->recoveryRestoreCommand, cmd);
+	else
+		elog(FATAL, "recovery_restore_command is too long");
+}
+
+char *
+GetRecoveryRestoreCommand(void)
+{
+	return WalRstr->recoveryRestoreCommand;
+}
+
+/* Report shared memory space needed by WalRestoreShmemInit */
+Size
+WalRestoreShmemSize(void)
+{
+	Size		size = 0;
+
+	size = add_size(size, sizeof(WalRestoreData));
+
+	return size;
+}
+
+/* Allocate and initialize walrestore-related shared memory */
+void
+WalRestoreShmemInit(void)
+{
+	bool		found;
+
+	WalRstr = (WalRestoreData *)
+		ShmemInitStruct("Wal Restore Ctl", WalRestoreShmemSize(), &found);
+
+	if (found)
+		return;
+
+	/* First time through, so initialize */
+	MemSet(WalRstr, 0, WalRestoreShmemSize());
+	InitSharedLatch(&WalRstr->WALRestoreLatch);
+
+}
+
+/* --------------------------------
+ *		signal handler routines
+ * --------------------------------
+ */
+
+/*
+ * walrestore_quickdie() occurs when signalled SIGQUIT by the postmaster.
+ *
+ * Some backend has bought the farm,
+ * so we need to stop what we're doing and exit.
+ */
+static void
+walrestore_quickdie(SIGNAL_ARGS)
+{
+	PG_SETMASK(&BlockSig);
+
+	/*
+	 * We DO NOT want to run proc_exit() callbacks -- we're here because
+	 * shared memory may be corrupted, so we don't want to try to clean up our
+	 * transaction.  Just nail the windows shut and get out of town.  Now that
+	 * there's an atexit callback to prevent third-party code from breaking
+	 * things by calling exit() directly, we have to reset the callbacks
+	 * explicitly to make this work as intended.
+	 */
+	on_exit_reset();
+
+	/*
+	 * Note we do exit(2) not exit(0).	This is to force the postmaster into a
+	 * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
+	 * backend.  This is necessary precisely because we don't clean up our
+	 * shared memory state.  (The "dead man switch" mechanism in pmsignal.c
+	 * should ensure the postmaster sees this as a crash, too, but no harm in
+	 * being doubly sure.)
+	 */
+	exit(2);
+}
+
+/* SIGUSR1: let latch facility handle the signal */
+static void
+WalRestoreProcSigUsr1Handler(SIGNAL_ARGS)
+{
+	int			save_errno = errno;
+
+	latch_sigusr1_handler();
+
+	errno = save_errno;
+}
+
+/* SIGHUP: set flag to re-read config file at next convenient time */
+static void
+WalRestoreSigHupHandler(SIGNAL_ARGS)
+{
+	int			save_errno = errno;
+
+	got_SIGHUP = true;
+	SetLatch(&WalRstr->WALRestoreLatch);
+
+	errno = save_errno;
+}
+
+/* SIGTERM: set flag to shutdown and exit */
+static void
+WalRestoreShutdownHandler(SIGNAL_ARGS)
+{
+	int			save_errno = errno;
+
+	if (in_restore_command)
+	{
+		LWLockRelease(WALRestoreCommandLock);
+		proc_exit(1);
+	}
+	else
+		walrestore_shutdown_requested = true;
+	SetLatch(&WalRstr->WALRestoreLatch);
+
+	errno = save_errno;
+}
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index ef1dc91..8f4443a 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -26,6 +26,7 @@
 #include "postmaster/autovacuum.h"
 #include "postmaster/bgwriter.h"
 #include "postmaster/postmaster.h"
+#include "postmaster/walrestore.h"
 #include "replication/walreceiver.h"
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
@@ -123,6 +124,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
 		size = add_size(size, AutoVacuumShmemSize());
 		size = add_size(size, WalSndShmemSize());
 		size = add_size(size, WalRcvShmemSize());
+		size = add_size(size, WalRestoreShmemSize());
 		size = add_size(size, BTreeShmemSize());
 		size = add_size(size, SyncScanShmemSize());
 		size = add_size(size, AsyncShmemSize());
@@ -228,6 +230,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
 	AutoVacuumShmemInit();
 	WalSndShmemInit();
 	WalRcvShmemInit();
+	WalRestoreShmemInit();
 
 	/*
 	 * Set up other modules that need some shared memory space
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 9fc96b2..5c910dd 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -2362,7 +2362,7 @@ static struct config_int ConfigureNamesInt[] =
 
 	{
 		{"track_activity_query_size", PGC_POSTMASTER, RESOURCES_MEM,
-			gettext_noop("Sets the size reserved for pg_stat_activity.query, in bytes."),
+			gettext_noop("Sets the size reserved for pg_stat_activity.current_query, in bytes."),
 			NULL,
 		},
 		&pgstat_track_activity_query_size,
diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index aabbdac..4007680 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -918,10 +918,10 @@ BaseBackup(void)
 				progname, PQerrorMessage(conn));
 		disconnect_and_exit(1);
 	}
-	if (PQntuples(res) != 1 || PQnfields(res) != 3)
+	if (PQntuples(res) != 1)
 	{
-		fprintf(stderr, _("%s: could not identify system, got %i rows and %i fields\n"),
-				progname, PQntuples(res), PQnfields(res));
+		fprintf(stderr, _("%s: could not identify system, got %i rows\n"),
+				progname, PQntuples(res));
 		disconnect_and_exit(1);
 	}
 	sysidentifier = strdup(PQgetvalue(res, 0, 0));
@@ -1130,7 +1130,7 @@ BaseBackup(void)
 		{
 			fprintf(stderr, _("%s: could not parse xlog end position \"%s\"\n"),
 					progname, xlogend);
-			disconnect_and_exit(1);
+			exit(1);
 		}
 		InterlockedIncrement(&has_xlogendptr);
 
@@ -1162,7 +1162,6 @@ BaseBackup(void)
 	/*
 	 * End of copy data. Final result is already checked inside the loop.
 	 */
-	PQclear(res);
 	PQfinish(conn);
 
 	if (verbose)
diff --git a/src/bin/pg_basebackup/pg_receivexlog.c b/src/bin/pg_basebackup/pg_receivexlog.c
index fe9e39b..e698b06 100644
--- a/src/bin/pg_basebackup/pg_receivexlog.c
+++ b/src/bin/pg_basebackup/pg_receivexlog.c
@@ -235,10 +235,10 @@ StreamLog(void)
 				progname, PQerrorMessage(conn));
 		disconnect_and_exit(1);
 	}
-	if (PQntuples(res) != 1 || PQnfields(res) != 3)
+	if (PQntuples(res) != 1)
 	{
-		fprintf(stderr, _("%s: could not identify system, got %i rows and %i fields\n"),
-				progname, PQntuples(res), PQnfields(res));
+		fprintf(stderr, _("%s: could not identify system, got %i rows\n"),
+				progname, PQntuples(res));
 		disconnect_and_exit(1);
 	}
 	timeline = atoi(PQgetvalue(res, 0, 1));
diff --git a/src/bin/pg_basebackup/receivelog.c b/src/bin/pg_basebackup/receivelog.c
index 8ca3882..c390cbf 100644
--- a/src/bin/pg_basebackup/receivelog.c
+++ b/src/bin/pg_basebackup/receivelog.c
@@ -235,13 +235,6 @@ ReceiveXlogStream(PGconn *conn, XLogRecPtr startpos, uint32 timeline, char *sysi
 			PQclear(res);
 			return false;
 		}
-		if (PQnfields(res) != 3 || PQntuples(res) != 1)
-		{
-			fprintf(stderr, _("%s: could not identify system, got %i rows and %i fields\n"),
-					progname, PQntuples(res), PQnfields(res));
-			PQclear(res);
-			return false;
-		}
 		if (strcmp(sysidentifier, PQgetvalue(res, 0, 0)) != 0)
 		{
 			fprintf(stderr, _("%s: system identifier does not match between base backup and streaming connection\n"), progname);
diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c
index 6efc0ce..a27ef69 100644
--- a/src/bin/psql/tab-complete.c
+++ b/src/bin/psql/tab-complete.c
@@ -2209,52 +2209,21 @@ psql_completion(char *text, int start, int end)
 		COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_foreign_tables, NULL);
 
 /* GRANT && REVOKE */
-	/* Complete GRANT/REVOKE with a list of roles and privileges */
+	/* Complete GRANT/REVOKE with a list of privileges */
 	else if (pg_strcasecmp(prev_wd, "GRANT") == 0 ||
 			 pg_strcasecmp(prev_wd, "REVOKE") == 0)
 	{
-		COMPLETE_WITH_QUERY(Query_for_list_of_roles
-							" UNION SELECT 'SELECT'"
-							" UNION SELECT 'INSERT'"
-							" UNION SELECT 'UPDATE'"
-							" UNION SELECT 'DELETE'"
-							" UNION SELECT 'TRUNCATE'"
-							" UNION SELECT 'REFERENCES'"
-							" UNION SELECT 'TRIGGER'"
-							" UNION SELECT 'CREATE'"
-							" UNION SELECT 'CONNECT'"
-							" UNION SELECT 'TEMPORARY'"
-							" UNION SELECT 'EXECUTE'"
-							" UNION SELECT 'USAGE'"
-							" UNION SELECT 'ALL'");
-	}
-	/* Complete GRANT/REVOKE <privilege> with "ON", GRANT/REVOKE <role> with TO/FROM */
+		static const char *const list_privilege[] =
+		{"SELECT", "INSERT", "UPDATE", "DELETE", "TRUNCATE", "REFERENCES",
+			"TRIGGER", "CREATE", "CONNECT", "TEMPORARY", "EXECUTE", "USAGE",
+		"ALL", NULL};
+
+		COMPLETE_WITH_LIST(list_privilege);
+	}
+	/* Complete GRANT/REVOKE <sth> with "ON" */
 	else if (pg_strcasecmp(prev2_wd, "GRANT") == 0 ||
 			 pg_strcasecmp(prev2_wd, "REVOKE") == 0)
-	{
-		if (pg_strcasecmp(prev_wd, "SELECT") == 0
-			|| pg_strcasecmp(prev_wd, "INSERT") == 0
-			|| pg_strcasecmp(prev_wd, "UPDATE") == 0
-			|| pg_strcasecmp(prev_wd, "DELETE") == 0
-			|| pg_strcasecmp(prev_wd, "TRUNCATE") == 0
-			|| pg_strcasecmp(prev_wd, "REFERENCES") == 0
-			|| pg_strcasecmp(prev_wd, "TRIGGER") == 0
-			|| pg_strcasecmp(prev_wd, "CREATE") == 0
-			|| pg_strcasecmp(prev_wd, "CONNECT") == 0
-			|| pg_strcasecmp(prev_wd, "TEMPORARY") == 0
-			|| pg_strcasecmp(prev_wd, "TEMP") == 0
-			|| pg_strcasecmp(prev_wd, "EXECUTE") == 0
-			|| pg_strcasecmp(prev_wd, "USAGE") == 0
-			|| pg_strcasecmp(prev_wd, "ALL") == 0)
-			COMPLETE_WITH_CONST("ON");
-		else
-		{
-			if (pg_strcasecmp(prev2_wd, "GRANT") == 0)
-				COMPLETE_WITH_CONST("TO");
-			else
-				COMPLETE_WITH_CONST("FROM");
-		}
-	}
+		COMPLETE_WITH_CONST("ON");
 
 	/*
 	 * Complete GRANT/REVOKE <sth> ON with a list of tables, views, sequences,
@@ -2335,18 +2304,6 @@ psql_completion(char *text, int start, int end)
 			COMPLETE_WITH_CONST("FROM");
 	}
 
-	/* Complete "GRANT/REVOKE * TO/FROM" with username, GROUP, or PUBLIC */
-	else if (pg_strcasecmp(prev3_wd, "GRANT") == 0 &&
-			 pg_strcasecmp(prev_wd, "TO") == 0)
-	{
-		COMPLETE_WITH_QUERY(Query_for_list_of_grant_roles);
-	}
-	else if (pg_strcasecmp(prev3_wd, "REVOKE") == 0 &&
-			 pg_strcasecmp(prev_wd, "FROM") == 0)
-	{
-		COMPLETE_WITH_QUERY(Query_for_list_of_grant_roles);
-	}
-
 /* GROUP BY */
 	else if (pg_strcasecmp(prev3_wd, "FROM") == 0 &&
 			 pg_strcasecmp(prev_wd, "GROUP") == 0)
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 1ddf4bf..e9e5325 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -270,7 +270,8 @@ extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
 extern int XLogFileInit(uint32 log, uint32 seg,
 			 bool *use_existent, bool use_lock);
 extern int	XLogFileOpen(uint32 log, uint32 seg);
-
+extern bool RestoreArchivedFile(char *path, const char *xlogfname,
+					const char *recovername, off_t expectedSize);
 
 extern void XLogGetLastRemoved(uint32 *log, uint32 *seg);
 extern void XLogSetAsyncXactLSN(XLogRecPtr record);
@@ -316,6 +317,7 @@ extern TimeLineID GetRecoveryTargetTLI(void);
 extern bool CheckPromoteSignal(void);
 extern void WakeupRecovery(void);
 extern Latch *WALWriterLatch(void);
+extern Latch *WALRestoreLatch(void);
 
 /*
  * Starting/stopping a base backup
diff --git a/src/include/bootstrap/bootstrap.h b/src/include/bootstrap/bootstrap.h
index e966a73..b90ce33 100644
--- a/src/include/bootstrap/bootstrap.h
+++ b/src/include/bootstrap/bootstrap.h
@@ -23,6 +23,7 @@ typedef enum
 	StartupProcess,
 	BgWriterProcess,
 	CheckpointerProcess,
+	WalRestoreProcess,
 	WalWriterProcess,
 	WalReceiverProcess,
 
diff --git a/src/include/postmaster/startup.h b/src/include/postmaster/startup.h
index 3ec6950..35d9665 100644
--- a/src/include/postmaster/startup.h
+++ b/src/include/postmaster/startup.h
@@ -12,10 +12,11 @@
 #ifndef _STARTUP_H
 #define _STARTUP_H
 
+extern volatile sig_atomic_t startup_shutdown_requested;
+extern volatile sig_atomic_t in_restore_command;
+
 extern void HandleStartupProcInterrupts(void);
 extern void StartupProcessMain(void);
-extern void PreRestoreCommand(void);
-extern void PostRestoreCommand(void);
 extern bool IsPromoteTriggered(void);
 extern void ResetPromoteTriggered(void);
 
diff --git a/src/include/postmaster/walrestore.h b/src/include/postmaster/walrestore.h
new file mode 100644
index 0000000..98d7830
--- /dev/null
+++ b/src/include/postmaster/walrestore.h
@@ -0,0 +1,72 @@
+/*-------------------------------------------------------------------------
+ *
+ * walrestore.h
+ *	  Exports from postmaster/walrestore.c.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ *
+ * src/include/postmaster/walrestore.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _WALRESTORE_H
+#define _WALRESTORE_H
+
+#include "access/xlog.h"
+#include "access/xlogdefs.h"
+#include "storage/spin.h"
+#include "pgtime.h"
+
+extern volatile sig_atomic_t walrestore_shutdown_requested;
+
+/* GUC options */
+
+extern void WalRestoreMain(void);
+extern bool XLogFileIsNowFullyRestored(TimeLineID tli, uint32 log, uint32 seg);
+extern void SetNextWALRestoreLogSeg(TimeLineID tli, uint32 log, uint32 seg);
+extern void SetRecoveryRestoreCommand(char *cmd);
+extern char *GetRecoveryRestoreCommand(void);
+extern Size WalRestoreShmemSize(void);
+extern void WalRestoreShmemInit(void);
+
+/* Shared memory area for management of walrestore process */
+typedef struct
+{
+	/*
+	 * The identifiers of the last WAL file restored by WALrestore
+	 */
+	TimeLineID	lastFileTli;
+	uint32		lastFileLog;
+	uint32		lastFileSeg;
+
+	/*
+	 * Time of last restore by WALrestore
+	 */
+	TimestampTz lastFileRestoreTime;
+
+	/*
+	 * The next WAL file requested for the WALrestore process to restore
+	 */
+	TimeLineID	nextFileTli;
+	uint32		nextFileLog;
+	uint32		nextFileSeg;
+
+	/*
+	 * All of the above read and set only while holding WALRestoreCommandLock
+	 */
+
+	/*
+	 * WALRestoreLatch is used to wake up the WALRestore to restore WAL files.
+	 */
+	Latch		WALRestoreLatch;
+
+	/*
+	 * recoveryRestoreCommand for use by walrestore; can remove if becomes GUC
+	 * Set once at startup and read-only after that
+	 */
+	char		recoveryRestoreCommand[MAXPGPATH];
+} WalRestoreData;
+
+extern WalRestoreData *WalRstr;
+
+#endif   /* _WALRESTORE_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index df3df29..c316dcc 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -79,6 +79,7 @@ typedef enum LWLockId
 	SerializablePredicateLockListLock,
 	OldSerXidLock,
 	SyncRepLock,
+	WALRestoreCommandLock,
 	/* Individual lock IDs end here */
 	FirstBufMappingLock,
 	FirstLockMgrLock = FirstBufMappingLock + NUM_BUFFER_PARTITIONS,
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 358d1a4..f994b67 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -204,12 +204,21 @@ extern PGPROC *PreparedXactProcs;
 /*
  * We set aside some extra PGPROC structures for auxiliary processes,
  * ie things that aren't full-fledged backends but need shmem access.
+ * Logger, archiver and stats processes don't count towards this total,
+ * nor do WALSender processes.
  *
+ * NUM_AUXILIARY_PROCS must be set to the highest of the requirements for
+ * normal running and recovery.
+ *
+ * During normal running we need slots for:
  * Background writer, checkpointer and WAL writer run during normal operation.
- * Startup process and WAL receiver also consume 2 slots, but WAL writer is
- * launched only after startup has exited, so we only need 4 slots.
+ * 3 slots
+ *
+ * During recovery we need slots for:
+ * Background writer, checkpointer, Startup process, WAL receiver, WAL restore.
+ * 5 slots
  */
-#define NUM_AUXILIARY_PROCS		4
+#define NUM_AUXILIARY_PROCS		5
 
 
 /* configurable options */

#11

Fujii Masao

masao.fujii@gmail.com

almost 14 years ago

In reply to: Simon Riggs (#10)

Re: WAL Restore process during recovery

On Tue, Jan 24, 2012 at 12:23 AM, Simon Riggs <simon@2ndquadrant.com> wrote:

On Mon, Jan 23, 2012 at 12:23 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

Why does walrestore need to be invoked even when restore_command is
not specified? It seems to be useless. We invoke walreceiver only when
primary_conninfo is specified now. Similarly we should invoke walrestore
only when restore_command is specified?

walreceiver is shutdown and restarted in case of failed connection.
That never happens with walrestore because the command is run each
time - when we issue system(3) a new process is forked to run the
command. So there is no specific cleanup to perform and so no reason
for a managed cleanup process.

So I can't see a specific reason to change that. Do you think it makes
a difference?

Yes. When restore_command is not specified in recovery.conf, walrestore
process doesn't do any useful activity and just wastes CPU cycle. Which
might be harmless for a functionality of recovery, but ISTM it's better not
to start up walrestore in that case to avoid the waste of cycle.

Cleaned up the points noted, new patch attached in case you wish to
review further.

Still has bug, so still with me to fix.

Thanks! Will review further.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#12

Simon Riggs

simon@2ndQuadrant.com

almost 14 years ago

In reply to: Fujii Masao (#11)

Re: WAL Restore process during recovery

On Tue, Jan 24, 2012 at 9:43 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Tue, Jan 24, 2012 at 12:23 AM, Simon Riggs <simon@2ndquadrant.com> wrote:

On Mon, Jan 23, 2012 at 12:23 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

Why does walrestore need to be invoked even when restore_command is
not specified? It seems to be useless. We invoke walreceiver only when
primary_conninfo is specified now. Similarly we should invoke walrestore
only when restore_command is specified?

walreceiver is shutdown and restarted in case of failed connection.
That never happens with walrestore because the command is run each
time - when we issue system(3) a new process is forked to run the
command. So there is no specific cleanup to perform and so no reason
for a managed cleanup process.

So I can't see a specific reason to change that. Do you think it makes
a difference?

Yes. When restore_command is not specified in recovery.conf, walrestore
process doesn't do any useful activity and just wastes CPU cycle. Which
might be harmless for a functionality of recovery, but ISTM it's better not
to start up walrestore in that case to avoid the waste of cycle.

It just sleeps on a latch when it has nothing to do, so no wasted cycles.

Asking the postmaster seemed the easier option, I guess I could have
chosen the other way also.

I'll look at this when this is the last thing left to resolve to see
if that improves things.

Cleaned up the points noted, new patch attached in case you wish to
review further.

Still has bug, so still with me to fix.

Thanks! Will review further.

Much appreciated.

--
Simon Riggs http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services

#13

Fujii Masao

masao.fujii@gmail.com

almost 14 years ago

In reply to: Simon Riggs (#12)

Re: WAL Restore process during recovery

On Tue, Jan 24, 2012 at 6:49 PM, Simon Riggs <simon@2ndquadrant.com> wrote:

On Tue, Jan 24, 2012 at 9:43 AM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Tue, Jan 24, 2012 at 12:23 AM, Simon Riggs <simon@2ndquadrant.com> wrote:

On Mon, Jan 23, 2012 at 12:23 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

Why does walrestore need to be invoked even when restore_command is
not specified? It seems to be useless. We invoke walreceiver only when
primary_conninfo is specified now. Similarly we should invoke walrestore
only when restore_command is specified?

walreceiver is shutdown and restarted in case of failed connection.
That never happens with walrestore because the command is run each
time - when we issue system(3) a new process is forked to run the
command. So there is no specific cleanup to perform and so no reason
for a managed cleanup process.

So I can't see a specific reason to change that. Do you think it makes
a difference?

Yes. When restore_command is not specified in recovery.conf, walrestore
process doesn't do any useful activity and just wastes CPU cycle. Which
might be harmless for a functionality of recovery, but ISTM it's better not
to start up walrestore in that case to avoid the waste of cycle.

It just sleeps on a latch when it has nothing to do, so no wasted cycles.

Right, since walrestore process wakes up just every 10 seconds, a waste of
cycle is low. But what I feel uncomfortable is that walrestore process has
nothing to do *from start to end*, when restore_command is not specified,
but it's started up. I guess that many people would get surprised at that.
Of course, if restore_command can be changed without restarting the server,
I agree with you because walrestore process might do an useful activity
later. But currently not.

Asking the postmaster seemed the easier option, I guess I could have
chosen the other way also.

I'll look at this when this is the last thing left to resolve to see
if that improves things.

Okay.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#14

Fujii Masao

masao.fujii@gmail.com

almost 14 years ago

In reply to: Fujii Masao (#11)

Re: WAL Restore process during recovery

On Tue, Jan 24, 2012 at 6:43 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

Cleaned up the points noted, new patch attached in case you wish to
review further.

Still has bug, so still with me to fix.

Thanks! Will review further.

v3 patch contains lots of unrelated code changes like the following.

-   <structfield>pid</structfield> column of the
+   <structfield>procpid</structfield> column of the

You seem to have failed to extract the patch from your repository correctly.
So I used v2 patch for the review. Sorry if I comment the things which you've
already fixed in v3 patch.

Here are the comments. They are almost not serious problems.

+/*
+ * GUC parameters
+ */
+int	WalRestoreDelay = 10000;

You forget to change guc.c to define wal_restore_delay as a GUC parameter?
Or just that source code comment is incorrect?

+ elog(FATAL, "recovery_restore_command is too long");

Typo: s/recovery_restore_command/restore_command

+ InitLatch(&walrstr->WALRestoreLatch); /* initialize latch used in main loop */

That latch is shared one. OwnLatch() should be called instead of InitLatch()?
If yes, it's better to call DisownLatch() when walrestore exits. Though skipping
DisownLatch() would be harmless because the latch is never owned by new
process after walrestore exits.

+	{
+		/* use volatile pointer to prevent code rearrangement */
+		volatile WalRestoreData *walrstr = WalRstr;
+
+		nextFileTli = walrstr->nextFileTli;

The declaration of "walrstr" is not required here because it's already done
at the head of WalRestoreNextFile().

+	if (restoredFromArchive)
+	{
+		/* use volatile pointer to prevent code rearrangement */
+		volatile WalRestoreData *walrstr = WalRstr;

Same as above.

+#define TMPRECOVERYXLOG "RECOVERYXLOG"

ISTM that it's better to move this definition to an include file and we should
use it in all the places where the fixed value "RECOVERYXLOG" is still used.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center