Include WAL in base backup

Started by Magnus Haganderalmost 15 years ago29 messages
#1Magnus Hagander
magnus@hagander.net
1 attachment(s)

Here's a cutdown version of the idea about including WAL in the base
backup. What I initially wanted was to introduce a way to guarantee
that the required WAL (with some sort of cutoff of course) would be
available for the backup, but I ran out of time for that. We can
always add that later.

For now, you need to set wal_keep_segments to make it work properly,
but if you do the idea is that the tar file/stream generated in the
base backup will include all the required WAL files. That means that
you can start a postmaster directly against the directory extracted
from the tarball, and you no longer need to set up archive logging to
get a backup.

I've got some refactoring I want to do around the
SendBackupDirectory() function after this, but a review of the
functionality first would be good. And obviously, documentation is
still necessary.

The patch to pg_basebackup applies on top of the previously posted version.

--
 Magnus Hagander
 Me: http://www.hagander.net/
 Work: http://www.redpill-linpro.com/

Attachments:

basebackup_wal.patchtext/x-patch; charset=US-ASCII; name=basebackup_wal.patchDownload
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index b4d5bbe..d8f0031 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -38,9 +38,9 @@ static void _tarWriteHeader(char *filename, char *linktarget,
 				struct stat * statbuf);
 static void send_int8_string(StringInfoData *buf, int64 intval);
 static void SendBackupHeader(List *tablespaces);
-static void SendBackupDirectory(char *location, char *spcoid);
+static void SendBackupDirectory(char *location, char *spcoid, bool closetar);
 static void base_backup_cleanup(int code, Datum arg);
-static void perform_base_backup(const char *backup_label, bool progress, DIR *tblspcdir);
+static void perform_base_backup(const char *backup_label, bool progress, DIR *tblspcdir, bool includewal);
 
 typedef struct
 {
@@ -67,9 +67,12 @@ base_backup_cleanup(int code, Datum arg)
  * clobbered by longjmp" from stupider versions of gcc.
  */
 static void
-perform_base_backup(const char *backup_label, bool progress, DIR *tblspcdir)
+perform_base_backup(const char *backup_label, bool progress, DIR *tblspcdir, bool includewal)
 {
-	do_pg_start_backup(backup_label, true);
+	XLogRecPtr	startptr;
+	XLogRecPtr	endptr;
+
+	startptr = do_pg_start_backup(backup_label, true);
 
 	PG_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0);
 	{
@@ -79,11 +82,6 @@ perform_base_backup(const char *backup_label, bool progress, DIR *tblspcdir)
 		tablespaceinfo *ti;
 
 
-		/* Add a node for the base directory */
-		ti = palloc0(sizeof(tablespaceinfo));
-		ti->size = progress ? sendDir(".", 1, true) : -1;
-		tablespaces = lappend(tablespaces, ti);
-
 		/* Collect information about all tablespaces */
 		while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
 		{
@@ -111,6 +109,10 @@ perform_base_backup(const char *backup_label, bool progress, DIR *tblspcdir)
 			tablespaces = lappend(tablespaces, ti);
 		}
 
+		/* Add a node for the base directory at the end */
+		ti = palloc0(sizeof(tablespaceinfo));
+		ti->size = progress ? sendDir(".", 1, true) : -1;
+		tablespaces = lappend(tablespaces, ti);
 
 		/* Send tablespace header */
 		SendBackupHeader(tablespaces);
@@ -120,12 +122,62 @@ perform_base_backup(const char *backup_label, bool progress, DIR *tblspcdir)
 		{
 			tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
 
-			SendBackupDirectory(ti->path, ti->oid);
+			SendBackupDirectory(ti->path, ti->oid,
+								!includewal || ti->path != NULL);
 		}
 	}
 	PG_END_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0);
 
-	do_pg_stop_backup();
+	endptr = do_pg_stop_backup();
+
+	if (includewal)
+	{
+		/*
+		 * We've left the last tar file "open", so we can now append the
+		 * required WAL files to it.
+		 */
+		int 	logid;
+		int		logseg;
+
+		elog(LOG, "Going to write wal from %i.%i to %i.%i",
+			 startptr.xlogid, startptr.xrecoff / XLogSegSize,
+			 endptr.xlogid, endptr.xrecoff / XLogSegSize);
+		logid = startptr.xlogid;
+		logseg = startptr.xrecoff / XLogSegSize;
+
+		while (true)
+		{
+			char	xlogname[64];
+			char	fn[MAXPGPATH];
+			struct stat statbuf;
+
+			/* Send the current WAL file */
+#define TIMELINE 1
+			XLogFileName(xlogname, TIMELINE, logid, logseg);
+			sprintf(fn, "./pg_xlog/%s", xlogname);
+			if (lstat(fn, &statbuf) != 0)
+				ereport(ERROR,
+						(errcode(errcode_for_file_access()),
+						 errmsg("could not stat file \"%s\": %m",
+								fn)));
+
+			if (!S_ISREG(statbuf.st_mode))
+				ereport(ERROR,
+						(errmsg("\"%s\" is not a file",
+								fn)));
+			sendFile(fn, 1, &statbuf);
+
+			/* Advance to the next WAL file */
+			NextLogSeg(logid, logseg);
+
+			/* Have we reached our stop position yet? */
+			if (logid > endptr.xlogid ||
+				(logid == endptr.xlogid && logseg >= endptr.xrecoff / XLogSegSize))
+				break;
+		}
+		/* Send CopyDone message for the last tar file*/
+		pq_putemptymessage('c');
+	}
 }
 
 /*
@@ -135,7 +187,7 @@ perform_base_backup(const char *backup_label, bool progress, DIR *tblspcdir)
  * pg_stop_backup() for the user.
  */
 void
-SendBaseBackup(const char *backup_label, bool progress)
+SendBaseBackup(const char *backup_label, bool progress, bool includewal)
 {
 	DIR		   *dir;
 	MemoryContext backup_context;
@@ -168,7 +220,7 @@ SendBaseBackup(const char *backup_label, bool progress)
 		ereport(ERROR,
 				(errmsg("unable to open directory pg_tblspc: %m")));
 
-	perform_base_backup(backup_label, progress, dir);
+	perform_base_backup(backup_label, progress, dir, includewal);
 
 	FreeDir(dir);
 
@@ -256,7 +308,7 @@ SendBackupHeader(List *tablespaces)
 }
 
 static void
-SendBackupDirectory(char *location, char *spcoid)
+SendBackupDirectory(char *location, char *spcoid, bool closetar)
 {
 	StringInfoData buf;
 
@@ -272,7 +324,8 @@ SendBackupDirectory(char *location, char *spcoid)
 			false);
 
 	/* Send CopyDone message */
-	pq_putemptymessage('c');
+	if (closetar)
+		pq_putemptymessage('c');
 }
 
 
diff --git a/src/backend/replication/repl_gram.y b/src/backend/replication/repl_gram.y
index 0ef33dd..d4bbcbe 100644
--- a/src/backend/replication/repl_gram.y
+++ b/src/backend/replication/repl_gram.y
@@ -66,11 +66,12 @@ Node *replication_parse_result;
 %token K_IDENTIFY_SYSTEM
 %token K_LABEL
 %token K_PROGRESS
+%token K_WAL
 %token K_START_REPLICATION
 
 %type <node>	command
 %type <node>	base_backup start_replication identify_system
-%type <boolval>	opt_progress
+%type <boolval>	opt_progress opt_wal
 %type <str>     opt_label
 
 %%
@@ -105,12 +106,13 @@ identify_system:
  * BASE_BACKUP [LABEL <label>] [PROGRESS]
  */
 base_backup:
-			K_BASE_BACKUP opt_label opt_progress
+			K_BASE_BACKUP opt_label opt_progress opt_wal
 				{
 					BaseBackupCmd *cmd = (BaseBackupCmd *) makeNode(BaseBackupCmd);
 
 					cmd->label = $2;
 					cmd->progress = $3;
+					cmd->includewal = $4;
 
 					$$ = (Node *) cmd;
 				}
@@ -124,6 +126,10 @@ opt_progress: K_PROGRESS		{ $$ = true; }
 			| /* EMPTY */		{ $$ = false; }
 			;
 
+opt_wal: K_WAL				{ $$ = true; }
+			| /* EMPTY */		{ $$ = false; }
+			;
+
 /*
  * START_REPLICATION %X/%X
  */
diff --git a/src/backend/replication/repl_scanner.l b/src/backend/replication/repl_scanner.l
index 014a720..9f75346 100644
--- a/src/backend/replication/repl_scanner.l
+++ b/src/backend/replication/repl_scanner.l
@@ -60,6 +60,7 @@ BASE_BACKUP			{ return K_BASE_BACKUP; }
 IDENTIFY_SYSTEM		{ return K_IDENTIFY_SYSTEM; }
 LABEL			{ return K_LABEL; }
 PROGRESS			{ return K_PROGRESS; }
+WAL			{ return K_WAL; }
 START_REPLICATION	{ return K_START_REPLICATION; }
 ","				{ return ','; }
 ";"				{ return ';'; }
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index 0ad6804..be3f8ec 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -402,7 +402,7 @@ HandleReplicationCommand(const char *cmd_string)
 			{
 				BaseBackupCmd *cmd = (BaseBackupCmd *) cmd_node;
 
-				SendBaseBackup(cmd->label, cmd->progress);
+				SendBaseBackup(cmd->label, cmd->progress, cmd->includewal);
 
 				/* Send CommandComplete and ReadyForQuery messages */
 				EndCommand("SELECT", DestRemote);
diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index 7fcb20a..eb315b9 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -33,6 +33,7 @@ char	   *label = "pg_basebackup base backup";
 bool		showprogress = false;
 int			verbose = 0;
 int			compresslevel = 0;
+bool		includewal = false;
 char	   *conninfo = NULL;
 
 /* Progress counters */
@@ -97,6 +98,7 @@ usage(void)
 	printf(_("  -d, --basedir=directory   receive base backup into directory\n"));
 	printf(_("  -t, --tardir=directory    receive base backup into tar files\n"
 			 "                            stored in specified directory\n"));
+	printf(_("  -w, --wal                 include required WAL files in backup\n"));
 	printf(_("  -Z, --compress=0-9        compress tar output\n"));
 	printf(_("  -l, --label=label         set backup label\n"));
 	printf(_("  -p, --progress            show progress information\n"));
@@ -642,9 +644,10 @@ BaseBackup()
 	conn = GetConnection();
 
 	PQescapeStringConn(conn, escaped_label, label, sizeof(escaped_label), &i);
-	snprintf(current_path, sizeof(current_path), "BASE_BACKUP LABEL '%s' %s",
+	snprintf(current_path, sizeof(current_path), "BASE_BACKUP LABEL '%s' %s %s",
 			 escaped_label,
-			 showprogress ? "PROGRESS" : "");
+			 showprogress ? "PROGRESS" : "",
+			 includewal ? "WAL" : "");
 
 	if (PQsendQuery(conn, current_path) == 0)
 	{
@@ -687,7 +690,7 @@ BaseBackup()
 		 * once since it can be relocated, and it will be checked before we do
 		 * anything anyway.
 		 */
-		if (basedir != NULL && i > 0)
+		if (basedir != NULL && !PQgetisnull(res, i, 1))
 			verify_dir_is_empty_or_create(PQgetvalue(res, i, 1));
 	}
 
@@ -747,6 +750,7 @@ main(int argc, char **argv)
 		{"conninfo", required_argument, NULL, 'c'},
 		{"basedir", required_argument, NULL, 'd'},
 		{"tardir", required_argument, NULL, 't'},
+		{"wal", no_argument, NULL, 'w'},
 		{"compress", required_argument, NULL, 'Z'},
 		{"label", required_argument, NULL, 'l'},
 		{"verbose", no_argument, NULL, 'v'},
@@ -776,7 +780,7 @@ main(int argc, char **argv)
 		}
 	}
 
-	while ((c = getopt_long(argc, argv, "c:d:t:l:Z:vp",
+	while ((c = getopt_long(argc, argv, "c:d:t:l:Z:wvp",
 							long_options, &option_index)) != -1)
 	{
 		switch (c)
@@ -790,6 +794,9 @@ main(int argc, char **argv)
 			case 't':
 				tardir = xstrdup(optarg);
 				break;
+			case 'w':
+				includewal = true;
+				break;
 			case 'l':
 				label = xstrdup(optarg);
 				break;
diff --git a/src/include/replication/basebackup.h b/src/include/replication/basebackup.h
index eb2e160..3172d4b 100644
--- a/src/include/replication/basebackup.h
+++ b/src/include/replication/basebackup.h
@@ -12,6 +12,6 @@
 #ifndef _BASEBACKUP_H
 #define _BASEBACKUP_H
 
-extern void SendBaseBackup(const char *backup_label, bool progress);
+extern void SendBaseBackup(const char *backup_label, bool progress, bool includewal);
 
 #endif   /* _BASEBACKUP_H */
diff --git a/src/include/replication/replnodes.h b/src/include/replication/replnodes.h
index 4f4a1a3..4ecd18a 100644
--- a/src/include/replication/replnodes.h
+++ b/src/include/replication/replnodes.h
@@ -47,6 +47,7 @@ typedef struct BaseBackupCmd
 	NodeTag		type;
 	char	   *label;
 	bool		progress;
+	bool		includewal;
 }	BaseBackupCmd;
 
 
#2Dimitri Fontaine
dimitri@2ndQuadrant.fr
In reply to: Magnus Hagander (#1)
Re: Include WAL in base backup

Magnus Hagander <magnus@hagander.net> writes:

Here's a cutdown version of the idea about including WAL in the base
backup. What I initially wanted was to introduce a way to guarantee
that the required WAL (with some sort of cutoff of course) would be
available for the backup, but I ran out of time for that. We can
always add that later.

What if you start a concurrent process that begins streaming the WAL
segments just before you start the backup, and you stop it after having
stopped the backup. I would think that then, the local received files
would be complete. We would only need a program able to stream the WAL
segments and build WAL files from that… do you know about one? :)

For now, you need to set wal_keep_segments to make it work properly,

That's quite a big foot gun, isn't it? You would have to at least offer
an option to check for your backup or to call it broken when you miss
some WAL files on the server.

The only other safe option I know about that's not a pg_streamrecv
subprocess would be to require archiving for the duration of the base
backup, but I think we agreed that it would be nice being able to bypass
that.

Regards,
--
Dimitri Fontaine
http://2ndQuadrant.fr PostgreSQL : Expertise, Formation et Support

#3Magnus Hagander
magnus@hagander.net
In reply to: Dimitri Fontaine (#2)
Re: Include WAL in base backup

On Sat, Jan 15, 2011 at 23:32, Dimitri Fontaine <dimitri@2ndquadrant.fr> wrote:

Magnus Hagander <magnus@hagander.net> writes:

Here's a cutdown version of the idea about including WAL in the base
backup. What I initially wanted was to introduce a way to guarantee
that the required WAL (with some sort of cutoff of course) would be
available for the backup, but I ran out of time for that. We can
always add that later.

What if you start a concurrent process that begins streaming the WAL
segments just before you start the backup, and you stop it after having
stopped the backup.  I would think that then, the local received files
would be complete.  We would only need a program able to stream the WAL
segments and build WAL files from that… do you know about one? :)

Sure, if you stream the backups "on the side", then you don't need
this feature. This is for "very simple filesystem backups", without
the need to set up streaming of archiving.

If you're streaming the WAL separately, you'd use pg_basebackup in
non-wal mode. That's why it's optional.

For now, you need to set wal_keep_segments to make it work properly,

That's quite a big foot gun, isn't it?  You would have to at least offer
an option to check for your backup or to call it broken when you miss
some WAL files on the server.

Oh, it will give you an ERROR if any of the required WAL files aren't
around anymore. So you will know that your backup is incomplete.

The only other safe option I know about that's not a pg_streamrecv
subprocess would be to require archiving for the duration of the base
backup, but I think we agreed that it would be nice being able to bypass
that.

No, the safe option I'm envisioning for this one is to just prevent
the server from recycling the logfiles until they have been sent off.
Witha cap so we don't prevent recycling forever if the client hangs.
This is the part I started working on, but didn't hav etime to finish
for the CF. I've not given up on it, it's just waiting for later.

--
 Magnus Hagander
 Me: http://www.hagander.net/
 Work: http://www.redpill-linpro.com/

#4Dimitri Fontaine
dimitri@2ndQuadrant.fr
In reply to: Magnus Hagander (#3)
Re: Include WAL in base backup

Magnus Hagander <magnus@hagander.net> writes:

What if you start a concurrent process that begins streaming the WAL
segments just before you start the backup, and you stop it after having
stopped the backup.  I would think that then, the local received files
would be complete.  We would only need a program able to stream the WAL
segments and build WAL files from that… do you know about one? :)

Sure, if you stream the backups "on the side", then you don't need
this feature. This is for "very simple filesystem backups", without
the need to set up streaming of archiving.

What I meant is: why don't we provide an option to automate just that
behavior in pg_basebackup? It looks like a fork() then calling code you
already wrote.

Regards,
--
Dimitri Fontaine
http://2ndQuadrant.fr PostgreSQL : Expertise, Formation et Support

#5Magnus Hagander
magnus@hagander.net
In reply to: Dimitri Fontaine (#4)
Re: Include WAL in base backup

On Sun, Jan 16, 2011 at 18:45, Dimitri Fontaine <dimitri@2ndquadrant.fr> wrote:

Magnus Hagander <magnus@hagander.net> writes:

What if you start a concurrent process that begins streaming the WAL
segments just before you start the backup, and you stop it after having
stopped the backup.  I would think that then, the local received files
would be complete.  We would only need a program able to stream the WAL
segments and build WAL files from that… do you know about one? :)

Sure, if you stream the backups "on the side", then you don't need
this feature. This is for "very simple filesystem backups", without
the need to set up streaming of archiving.

What I meant is: why don't we provide an option to automate just that
behavior in pg_basebackup?  It looks like a fork() then calling code you
already wrote.

Ah, I see. That's a good idea.

However, it's not quite that simple. "just adding a fork()" doesn't
work on all our platforms, and you need to deal with feedback and such
between them as well.

I think it's definitely something worth doing in the long run, but I
think we should start with the simpler way.

Oh, and this might be the use-case for integrating the streaming log
code as well :-) But if we plan to do that, perhaps we should pick a
different name for the binary? Or maybe just share code with another
one later..

--
 Magnus Hagander
 Me: http://www.hagander.net/
 Work: http://www.redpill-linpro.com/

#6Dimitri Fontaine
dimitri@2ndQuadrant.fr
In reply to: Magnus Hagander (#5)
Re: Include WAL in base backup

Magnus Hagander <magnus@hagander.net> writes:

However, it's not quite that simple. "just adding a fork()" doesn't
work on all our platforms, and you need to deal with feedback and such
between them as well.

I'd think client-side, we have an existing implementation with the
parallel pg_restore option. Don't know (yet) how easy it is to reuse
that code…

Oh, and this might be the use-case for integrating the streaming log
code as well :-) But if we plan to do that, perhaps we should pick a
different name for the binary? Or maybe just share code with another
one later..

You're talking about the pg_streamrecv binary? Then yes, my idea about
it is that it should become the default archive client we ship with
PostgreSQL. And grow into offering a way to be the default restore
command too. I'd see the current way that the standby can switch
between restoring from archive and from a live stream as a first step
into that direction.

Regards,
--
Dimitri Fontaine
http://2ndQuadrant.fr PostgreSQL : Expertise, Formation et Support

#7Magnus Hagander
magnus@hagander.net
In reply to: Dimitri Fontaine (#6)
Re: Include WAL in base backup

On Sun, Jan 16, 2011 at 20:13, Dimitri Fontaine <dimitri@2ndquadrant.fr> wrote:

Magnus Hagander <magnus@hagander.net> writes:

However, it's not quite that simple. "just adding a fork()" doesn't
work on all our platforms, and you need to deal with feedback and such
between them as well.

I'd think client-side, we have an existing implementation with the
parallel pg_restore option.  Don't know (yet) how easy it is to reuse
that code…

True.

But however we do it, it will be significantly more complex than just
including the WAL. And I want to make sure we get *something* done in
time for 9.1, and then improve upon it. If we can get the improvement
into 9.1 that's great, but if not it will have to wait until 9.2 - and
I don't want to leave us without anything for 9.1.

Oh, and this might be the use-case for integrating the streaming log
code as well :-) But if we plan to do that, perhaps we should pick a
different name for the binary? Or maybe just share code with another
one later..

You're talking about the pg_streamrecv binary?  Then yes, my idea about
it is that it should become the default archive client we ship with
PostgreSQL.  And grow into offering a way to be the default restore
command too.  I'd see the current way that the standby can switch
between restoring from archive and from a live stream as a first step
into that direction.

Right. I did put both the base backup and the wal streaming in the
same binary earlier, and the only shared code was the one to connect
to the db. So I split them apart again. This is the reason to put them
back together perhaps - or just have the ability for pg_basebackup to
fork()+exec() the wal streamer.

--
 Magnus Hagander
 Me: http://www.hagander.net/
 Work: http://www.redpill-linpro.com/

#8Dimitri Fontaine
dimitri@2ndQuadrant.fr
In reply to: Magnus Hagander (#7)
Re: Include WAL in base backup

Magnus Hagander <magnus@hagander.net> writes:

But however we do it, it will be significantly more complex than just
including the WAL. And I want to make sure we get *something* done in
time for 9.1, and then improve upon it. If we can get the improvement
into 9.1 that's great, but if not it will have to wait until 9.2 - and
I don't want to leave us without anything for 9.1.

+1. The only point I'm not clear on is the complexity, and I trust you
to cut off at the right point here… meanwhile, I'm still asking for this
little more until you say your plate's full :)

Right. I did put both the base backup and the wal streaming in the
same binary earlier, and the only shared code was the one to connect
to the db. So I split them apart again. This is the reason to put them
back together perhaps - or just have the ability for pg_basebackup to
fork()+exec() the wal streamer.

That would be awesome.

Then pg_streamrecv could somehow accept options that make it suitable
for use as an archive command, connecting to your (still?) third-party
daemon? At this point it'd be pg_walsender :)

Regards,
--
Dimitri Fontaine
http://2ndQuadrant.fr PostgreSQL : Expertise, Formation et Support

#9Stephen Frost
sfrost@snowman.net
In reply to: Magnus Hagander (#1)
Re: Include WAL in base backup

Greetings,

* Magnus Hagander (magnus@hagander.net) wrote:

For now, you need to set wal_keep_segments to make it work properly,
but if you do the idea is that the tar file/stream generated in the
base backup will include all the required WAL files.

Is there some reason to not ERROR outright if we're asked to provide WAL
and wal_keep_segments isn't set..? I'd rather do that than only ERROR
when a particular WAL is missing.. That could lead to transient backup
errors that an inexperienced sysadmin or DBA might miss or ignore.
They'll notice if it doesn't work the first time they try it and spits
out a hint about wal_keep_segments.

That means that
you can start a postmaster directly against the directory extracted
from the tarball, and you no longer need to set up archive logging to
get a backup.

Like that part.

I've got some refactoring I want to do around the
SendBackupDirectory() function after this, but a review of the
functionality first would be good. And obviously, documentation is
still necessary.

mkay, I'm not going to try to make this ready for committer, but will
provide my comments on it overall. Bit difficult to review when someone
else is reviewing the base patch too. :/

Here goes:

- I'm not a huge fan of the whole 'closetar' option, that feels really
rather wrong to me. Why not just open it and close it in
perform_base_backup(), unconditionally?

- I wonder if you're not getting to a level where you shold be using a
struct to pass the relevant information to perform_base_backup()
instead of adding more arguments on.. That's going to get unwieldy at
some point.

- Why not initialize logid and logseg like so?:

int logid = startptr.xlogid;
int logseg = startptr.xrecoff / XLogSegSize;

Then use those in your elog? Seems cleaner to me.

- A #define right in the middle of a while loop...? Really?

- The grammar changes strike me as.. odd. Typically, you would have an
'option' production that you can then have a list of and then let each
option be whatever the OR'd set of options is. Wouldn't the current
grammar require that you put the options in a specific order? That'd
blow.

@@ -687,7 +690,7 @@ BaseBackup()
 		 * once since it can be relocated, and it will be checked before we do
 		 * anything anyway.
 		 */
-		if (basedir != NULL && i > 0)
+		if (basedir != NULL && !PQgetisnull(res, i, 1))
 			verify_dir_is_empty_or_create(PQgetvalue(res, i, 1));
 	}

- Should the 'i > 0' conditional above still be there..?

So, that's my review from just reading the source code and the thread..
Hope it's useful, sorry it's not more. :/

Thanks,

Stephen

#10Stephen Frost
sfrost@snowman.net
In reply to: Stephen Frost (#9)
Re: Include WAL in base backup

Magnus,

* Stephen Frost (sfrost@snowman.net) wrote:

mkay, I'm not going to try to make this ready for committer, but will
provide my comments on it overall. Bit difficult to review when someone
else is reviewing the base patch too. :/

I went ahead and marked it as 'waiting on author', since I'd like
feedback on my comments, but I'll try to make time in the next few days
to apply the two patches and test it out, unless I hear otherwise.

Thanks,

Stephen

#11Magnus Hagander
magnus@hagander.net
In reply to: Stephen Frost (#9)
Re: Include WAL in base backup

On Thu, Jan 20, 2011 at 05:03, Stephen Frost <sfrost@snowman.net> wrote:

Greetings,

* Magnus Hagander (magnus@hagander.net) wrote:

For now, you need to set wal_keep_segments to make it work properly,
but if you do the idea is that the tar file/stream generated in the
base backup will include all the required WAL files.

Is there some reason to not ERROR outright if we're asked to provide WAL
and wal_keep_segments isn't set..?  I'd rather do that than only ERROR
when a particular WAL is missing..  That could lead to transient backup
errors that an inexperienced sysadmin or DBA might miss or ignore.
They'll notice if it doesn't work the first time they try it and spits
out a hint about wal_keep_segments.

Well, in a "smaller:ish" database you can easily do the full backup
before you run out of segments in the data directory even when you
haven't set wal_keep_segments. If we error out, we force extra work on
the user in the trivial case.

I'd rather not change that, but instead (as Fujii-san has also
mentioned is needed anyway) put some more effort into documenting in
which cases you need to set it.

I've got some refactoring I want to do around the
SendBackupDirectory() function after this, but a review of the
functionality first would be good. And obviously, documentation is
still necessary.

mkay, I'm not going to try to make this ready for committer, but will
provide my comments on it overall.  Bit difficult to review when someone
else is reviewing the base patch too. :/

Heh, yeah.

Here goes:

- I'm not a huge fan of the whole 'closetar' option, that feels really
 rather wrong to me.  Why not just open it and close it in
 perform_base_backup(), unconditionally?

Yeah, we could move the whole thing up there. Or, as I mentioned in an
IM conversation with Heikki, just get rid of SendBackupDirectory()
completely and inline it inside the loop in perform_base_backup().
Given that it's basically just 5 lines + a call to sendDir()..

- I wonder if you're not getting to a level where you shold be using a
 struct to pass the relevant information to perform_base_backup()
 instead of adding more arguments on..  That's going to get unwieldy at
 some point.

Yeah, probably.

We *could* pass the BaseBackupCmd struct from the parser all the way
in - or is that cheating too much on abstractions? It seems if we
don't, we're just going to hav ea copy of that struct without the
NodeTag member..

- Why not initialize logid and logseg like so?:

       int logid = startptr.xlogid;
       int logseg = startptr.xrecoff / XLogSegSize;

 Then use those in your elog?  Seems cleaner to me.

Hmm. Yes. Agreed.

- A #define right in the middle of a while loop...?  Really?

Haha, yeah, that was a typo. I didn't remember the name of the
variable so I stuck it there for testing and forgot it. It should be
ThisTimeLineID, and no #define at all.

- The grammar changes strike me as..  odd.  Typically, you would have an
 'option' production that you can then have a list of and then let each
 option be whatever the OR'd set of options is.  Wouldn't the current
 grammar require that you put the options in a specific order?  That'd
 blow.

It does require them in a specific order. The advantage is that it
makes the code easier. and it's not like end users are expected to run
them anyway...

Now, I'm no bison export, so it might be an easy fix. But the way I
could figure, to make them order indepdent I have to basically collect
them up together as a List instead of just as a struct, and then loop
through that list to build a struct later.

If someone who knows Bison better can tell me a neater way to do that,
I'll be happy to change :-)

@@ -687,7 +690,7 @@ BaseBackup()
                * once since it can be relocated, and it will be checked before we do
                * anything anyway.
                */
-               if (basedir != NULL && i > 0)
+               if (basedir != NULL && !PQgetisnull(res, i, 1))
                       verify_dir_is_empty_or_create(PQgetvalue(res, i, 1));
       }

- Should the 'i > 0' conditional above still be there..?

No. That's a cheat-check that assumes the base directory is always
sent first. Which is not true anymore - with this patch we always send
it *last* so we can include the WAL in it.

So, that's my review from just reading the source code and the thread..
Hope it's useful, sorry it's not more. :/

Thanks - it certainly is!

--
 Magnus Hagander
 Me: http://www.hagander.net/
 Work: http://www.redpill-linpro.com/

#12Tom Lane
tgl@sss.pgh.pa.us
In reply to: Magnus Hagander (#11)
Re: Include WAL in base backup

Magnus Hagander <magnus@hagander.net> writes:

- Why not initialize logid and logseg like so?:

� � � �int logid = startptr.xlogid;
� � � �int logseg = startptr.xrecoff / XLogSegSize;

�Then use those in your elog? �Seems cleaner to me.

Hmm. Yes. Agreed.

Marginal complaint here: int is the wrong type, I'm pretty sure.

regards, tom lane

#13Fujii Masao
masao.fujii@gmail.com
In reply to: Tom Lane (#12)
Re: Include WAL in base backup

On Fri, Jan 21, 2011 at 12:28 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Magnus Hagander <magnus@hagander.net> writes:

- Why not initialize logid and logseg like so?:

       int logid = startptr.xlogid;
       int logseg = startptr.xrecoff / XLogSegSize;

 Then use those in your elog?  Seems cleaner to me.

Hmm. Yes. Agreed.

Marginal complaint here: int is the wrong type, I'm pretty sure.

And, we should use XLByteToPrevSeg here instead of just =, I think.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#14Magnus Hagander
magnus@hagander.net
In reply to: Fujii Masao (#13)
Re: Include WAL in base backup

On Mon, Jan 24, 2011 at 08:45, Fujii Masao <masao.fujii@gmail.com> wrote:

On Fri, Jan 21, 2011 at 12:28 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Magnus Hagander <magnus@hagander.net> writes:

- Why not initialize logid and logseg like so?:

       int logid = startptr.xlogid;
       int logseg = startptr.xrecoff / XLogSegSize;

 Then use those in your elog?  Seems cleaner to me.

Hmm. Yes. Agreed.

Marginal complaint here: int is the wrong type, I'm pretty sure.

And, we should use XLByteToPrevSeg here instead of just =, I think.

Not XLByteToSeg?

(I admit I missed the existance of both those macros, though)

I plan to post a rebased version of this patch today or tomorrow, btw,
that should work against all the changes that have been applied to
git.

--
 Magnus Hagander
 Me: http://www.hagander.net/
 Work: http://www.redpill-linpro.com/

#15Fujii Masao
masao.fujii@gmail.com
In reply to: Magnus Hagander (#14)
Re: Include WAL in base backup

On Mon, Jan 24, 2011 at 4:47 PM, Magnus Hagander <magnus@hagander.net> wrote:

On Mon, Jan 24, 2011 at 08:45, Fujii Masao <masao.fujii@gmail.com> wrote:

On Fri, Jan 21, 2011 at 12:28 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Magnus Hagander <magnus@hagander.net> writes:

- Why not initialize logid and logseg like so?:

       int logid = startptr.xlogid;
       int logseg = startptr.xrecoff / XLogSegSize;

 Then use those in your elog?  Seems cleaner to me.

Hmm. Yes. Agreed.

Marginal complaint here: int is the wrong type, I'm pretty sure.

And, we should use XLByteToPrevSeg here instead of just =, I think.

Not XLByteToSeg?

Checking... yeah, you are right. We should use XLByteToSeg since
the REDO starting WAL record doesn't exist in the previous WAL
segment when the REDO starting location is a boundary byte.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#16Magnus Hagander
magnus@hagander.net
In reply to: Fujii Masao (#15)
1 attachment(s)
Re: Include WAL in base backup

On Mon, Jan 24, 2011 at 09:03, Fujii Masao <masao.fujii@gmail.com> wrote:

On Mon, Jan 24, 2011 at 4:47 PM, Magnus Hagander <magnus@hagander.net> wrote:

On Mon, Jan 24, 2011 at 08:45, Fujii Masao <masao.fujii@gmail.com> wrote:

On Fri, Jan 21, 2011 at 12:28 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote:

Magnus Hagander <magnus@hagander.net> writes:

- Why not initialize logid and logseg like so?:

       int logid = startptr.xlogid;
       int logseg = startptr.xrecoff / XLogSegSize;

 Then use those in your elog?  Seems cleaner to me.

Hmm. Yes. Agreed.

Marginal complaint here: int is the wrong type, I'm pretty sure.

And, we should use XLByteToPrevSeg here instead of just =, I think.

Not XLByteToSeg?

Checking... yeah, you are right. We should use XLByteToSeg since
the REDO starting WAL record doesn't exist in the previous WAL
segment when the REDO starting location is a boundary byte.

Here's an updated version of the patch:

* rebased on the current git master (including changing the switch
from -w to -x since -w was used now)
* includes some documentation
* use XLogByteToSeg and uint32 as mentioned here
* refactored to remove SendBackupDirectory(), removing the ugly closetar boolean

--
 Magnus Hagander
 Me: http://www.hagander.net/
 Work: http://www.redpill-linpro.com/

Attachments:

basebackup_wal.patchtext/x-patch; charset=US-ASCII; name=basebackup_wal.patchDownload
diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml
index 73f26b4..c257413 100644
--- a/doc/src/sgml/protocol.sgml
+++ b/doc/src/sgml/protocol.sgml
@@ -1460,7 +1460,7 @@ The commands accepted in walsender mode are:
   </varlistentry>
 
   <varlistentry>
-    <term>BASE_BACKUP [<literal>LABEL</literal> <replaceable>'label'</replaceable>] [<literal>PROGRESS</literal>] [<literal>FAST</literal>]</term>
+    <term>BASE_BACKUP [<literal>LABEL</literal> <replaceable>'label'</replaceable>] [<literal>PROGRESS</literal>] [<literal>FAST</literal>] [<literal>WAL</literal>]</term>
     <listitem>
      <para>
       Instructs the server to start streaming a base backup.
@@ -1505,6 +1505,18 @@ The commands accepted in walsender mode are:
          </para>
         </listitem>
        </varlistentry>
+
+       <varlistentry>
+        <term><literal>WAL</literal></term>
+        <listitem>
+         <para>
+          Include the necessary WAL segments in the backup. This will include
+          all the files between start and stop backup in the
+          <filename>pg_xlog</filename> directory of the base directory tar
+          file.
+         </para>
+        </listitem>
+       </varlistentry>
       </variablelist>
      </para>
      <para>
diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml
index 321c8ca..60ffa3c 100644
--- a/doc/src/sgml/ref/pg_basebackup.sgml
+++ b/doc/src/sgml/ref/pg_basebackup.sgml
@@ -145,6 +145,31 @@ PostgreSQL documentation
      </varlistentry>
 
      <varlistentry>
+      <term><option>-x</option></term>
+      <term><option>--xlog</option></term>
+      <listitem>
+       <para>
+        Includes the required transaction log files (WAL files) in the
+        backup. This will include all transaction logs generated during
+        the backup. If this option is specified, it is possible to start
+        a postmaster directly in the extracted directory without the need
+        to consult the log archive, thus making this a completely standalone
+        backup.
+       </para>
+       <note>
+        <para>
+         The transaction log files are collected at the end of the backup.
+         Therefore, it is necessary for the
+         <xref linkend="guc-wal-keep-segments"> parameter to be set high
+         enough that the log is not removed before the end of the backup.
+         If the log has been rotated when it's time to transfer it, the
+         backup will fail and be unusable.
+        </para>
+       </note>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
       <term><option>-Z <replaceable class="parameter">level</replaceable></option></term>
       <term><option>--compress=<replaceable class="parameter">level</replaceable></option></term>
       <listitem>
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index 73edcf2..9bffe17 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -37,6 +37,7 @@ typedef struct
 	const char *label;
 	bool		progress;
 	bool		fastcheckpoint;
+	bool		includewal;
 }	basebackup_options;
 
 
@@ -46,7 +47,6 @@ static void _tarWriteHeader(char *filename, char *linktarget,
 				struct stat * statbuf);
 static void send_int8_string(StringInfoData *buf, int64 intval);
 static void SendBackupHeader(List *tablespaces);
-static void SendBackupDirectory(char *location, char *spcoid);
 static void base_backup_cleanup(int code, Datum arg);
 static void perform_base_backup(basebackup_options *opt, DIR *tblspcdir);
 static void parse_basebackup_options(List *options, basebackup_options *opt);
@@ -78,7 +78,10 @@ base_backup_cleanup(int code, Datum arg)
 static void
 perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
 {
-	do_pg_start_backup(opt->label, opt->fastcheckpoint);
+	XLogRecPtr	startptr;
+	XLogRecPtr	endptr;
+
+	startptr = do_pg_start_backup(opt->label, opt->fastcheckpoint);
 
 	PG_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0);
 	{
@@ -87,12 +90,6 @@ perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
 		struct dirent *de;
 		tablespaceinfo *ti;
 
-
-		/* Add a node for the base directory */
-		ti = palloc0(sizeof(tablespaceinfo));
-		ti->size = opt->progress ? sendDir(".", 1, true) : -1;
-		tablespaces = lappend(tablespaces, ti);
-
 		/* Collect information about all tablespaces */
 		while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
 		{
@@ -120,6 +117,10 @@ perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
 			tablespaces = lappend(tablespaces, ti);
 		}
 
+		/* Add a node for the base directory at the end */
+		ti = palloc0(sizeof(tablespaceinfo));
+		ti->size = opt->progress ? sendDir(".", 1, true) : -1;
+		tablespaces = lappend(tablespaces, ti);
 
 		/* Send tablespace header */
 		SendBackupHeader(tablespaces);
@@ -128,13 +129,84 @@ perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
 		foreach(lc, tablespaces)
 		{
 			tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
+			StringInfoData	buf;
 
-			SendBackupDirectory(ti->path, ti->oid);
+			/* Send CopyOutResponse message */
+			pq_beginmessage(&buf, 'H');
+			pq_sendbyte(&buf, 0);		/* overall format */
+			pq_sendint(&buf, 0, 2);		/* natts */
+			pq_endmessage(&buf);
+
+			sendDir(ti->path == NULL ? "." : ti->path,
+					ti->path == NULL ? 1 : strlen(ti->path),
+					false);
+
+			/*
+			 * If we're including WAL, and this is the main data directory
+			 * we don't terminate the tar stream here. Instead, we will append
+			 * the xlog files below and terminate it then. This is safe since
+			 * the main data directory is always sent *last*.
+			 */
+			if (opt->includewal && ti->path == NULL)
+			{
+				Assert(lnext(lc) == NULL);
+			}
+			else
+				pq_putemptymessage('c'); /* CopyDone */
 		}
 	}
 	PG_END_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0);
 
-	do_pg_stop_backup();
+	endptr = do_pg_stop_backup();
+
+	if (opt->includewal)
+	{
+		/*
+		 * We've left the last tar file "open", so we can now append the
+		 * required WAL files to it.
+		 */
+		uint32 	logid, logseg;
+		uint32	endlogid, endlogseg;
+
+		XLByteToSeg(startptr, logid, logseg);
+		XLByteToSeg(endptr, endlogid, endlogseg);
+		elog(DEBUG1, "Going to send wal from %i.%i to %i.%i",
+			 logid, logseg,
+			 endlogid, endlogseg);
+
+		while (true)
+		{
+			char	xlogname[64];
+			char	fn[MAXPGPATH];
+			struct stat statbuf;
+
+			/* Send the current WAL file */
+			XLogFileName(xlogname, ThisTimeLineID, logid, logseg);
+			sprintf(fn, "./pg_xlog/%s", xlogname);
+			if (lstat(fn, &statbuf) != 0)
+				ereport(ERROR,
+						(errcode(errcode_for_file_access()),
+						 errmsg("could not stat file \"%s\": %m",
+								fn)));
+
+			if (!S_ISREG(statbuf.st_mode))
+				ereport(ERROR,
+						(errmsg("\"%s\" is not a file",
+								fn)));
+			sendFile(fn, 1, &statbuf);
+
+			/* Advance to the next WAL file */
+			NextLogSeg(logid, logseg);
+
+			/* Have we reached our stop position yet? */
+			if (logid > endptr.xlogid ||
+				(logid == endptr.xlogid && logseg >= endptr.xrecoff / XLogSegSize))
+				break;
+		}
+
+		/* Send CopyDone message for the last tar file*/
+		pq_putemptymessage('c');
+	}
 }
 
 /*
@@ -147,6 +219,7 @@ parse_basebackup_options(List *options, basebackup_options *opt)
 	bool		o_label = false;
 	bool		o_progress = false;
 	bool		o_fast = false;
+	bool		o_wal = false;
 
 	MemSet(opt, 0, sizeof(opt));
 	foreach(lopt, options)
@@ -180,6 +253,15 @@ parse_basebackup_options(List *options, basebackup_options *opt)
 			opt->fastcheckpoint = true;
 			o_fast = true;
 		}
+		else if (strcmp(defel->defname, "wal") == 0)
+		{
+			if (o_wal)
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("duplicate option \"%s\"", defel->defname)));
+			opt->includewal = true;
+			o_wal = true;
+		}
 		else
 			elog(ERROR, "option \"%s\" not recognized",
 				 defel->defname);
@@ -316,26 +398,6 @@ SendBackupHeader(List *tablespaces)
 	pq_puttextmessage('C', "SELECT");
 }
 
-static void
-SendBackupDirectory(char *location, char *spcoid)
-{
-	StringInfoData buf;
-
-	/* Send CopyOutResponse message */
-	pq_beginmessage(&buf, 'H');
-	pq_sendbyte(&buf, 0);		/* overall format */
-	pq_sendint(&buf, 0, 2);		/* natts */
-	pq_endmessage(&buf);
-
-	/* tar up the data directory if NULL, otherwise the tablespace */
-	sendDir(location == NULL ? "." : location,
-			location == NULL ? 1 : strlen(location),
-			false);
-
-	/* Send CopyDone message */
-	pq_putemptymessage('c');
-}
-
 
 static int64
 sendDir(char *path, int basepathlen, bool sizeonly)
diff --git a/src/backend/replication/repl_gram.y b/src/backend/replication/repl_gram.y
index 879a0bd..1adcbdc 100644
--- a/src/backend/replication/repl_gram.y
+++ b/src/backend/replication/repl_gram.y
@@ -71,6 +71,7 @@ Node *replication_parse_result;
 %token K_LABEL
 %token K_PROGRESS
 %token K_FAST
+%token K_WAL
 %token K_START_REPLICATION
 
 %type <node>	command
@@ -136,7 +137,12 @@ base_backup_opt:
 				  $$ = makeDefElem("fast",
 						   (Node *)makeInteger(TRUE));
 				}
-
+			| K_WAL
+				{
+				  $$ = makeDefElem("wal",
+						   (Node *)makeInteger(TRUE));
+				}
+			;
 
 /*
  * START_REPLICATION %X/%X
diff --git a/src/backend/replication/repl_scanner.l b/src/backend/replication/repl_scanner.l
index e6dfb04..87e77d9 100644
--- a/src/backend/replication/repl_scanner.l
+++ b/src/backend/replication/repl_scanner.l
@@ -61,6 +61,7 @@ FAST			{ return K_FAST; }
 IDENTIFY_SYSTEM		{ return K_IDENTIFY_SYSTEM; }
 LABEL			{ return K_LABEL; }
 PROGRESS			{ return K_PROGRESS; }
+WAL			{ return K_WAL; }
 START_REPLICATION	{ return K_START_REPLICATION; }
 ","				{ return ','; }
 ";"				{ return ';'; }
diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index 5363034..baea186 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -33,6 +33,7 @@ char	   *label = "pg_basebackup base backup";
 bool		showprogress = false;
 int			verbose = 0;
 int			compresslevel = 0;
+bool		includewal = false;
 bool		fastcheckpoint = false;
 char	   *dbhost = NULL;
 char	   *dbuser = NULL;
@@ -124,6 +125,7 @@ usage(void)
 	printf(_("\nOptions controlling the output:\n"));
 	printf(_("  -D, --pgdata=directory    receive base backup into directory\n"));
 	printf(_("  -F, --format=p|t          output format (plain, tar)\n"));
+	printf(_("  -x, --xlog                include required WAL files in backup\n"));
 	printf(_("  -Z, --compress=0-9        compress tar output\n"));
 	printf(_("\nGeneral options:\n"));
 	printf(_("  -c, --checkpoint=fast|spread\n"
@@ -746,9 +748,10 @@ BaseBackup()
 	conn = GetConnection();
 
 	PQescapeStringConn(conn, escaped_label, label, sizeof(escaped_label), &i);
-	snprintf(current_path, sizeof(current_path), "BASE_BACKUP LABEL '%s' %s %s",
+	snprintf(current_path, sizeof(current_path), "BASE_BACKUP LABEL '%s' %s %s %s",
 			 escaped_label,
 			 showprogress ? "PROGRESS" : "",
+			 includewal ? "WAL" : "",
 			 fastcheckpoint ? "FAST" : "");
 
 	if (PQsendQuery(conn, current_path) == 0)
@@ -789,7 +792,7 @@ BaseBackup()
 		 * first once since it can be relocated, and it will be checked before
 		 * we do anything anyway.
 		 */
-		if (format == 'p' && i > 0)
+		if (format == 'p' && !PQgetisnull(res, i, 1))
 			verify_dir_is_empty_or_create(PQgetvalue(res, i, 1));
 	}
 
@@ -848,6 +851,7 @@ main(int argc, char **argv)
 		{"pgdata", required_argument, NULL, 'D'},
 		{"format", required_argument, NULL, 'F'},
 		{"checkpoint", required_argument, NULL, 'c'},
+		{"xlog", no_argument, NULL, 'w'},
 		{"compress", required_argument, NULL, 'Z'},
 		{"label", required_argument, NULL, 'l'},
 		{"host", required_argument, NULL, 'h'},
@@ -881,7 +885,7 @@ main(int argc, char **argv)
 		}
 	}
 
-	while ((c = getopt_long(argc, argv, "D:F:l:Z:c:h:p:U:wWvP",
+	while ((c = getopt_long(argc, argv, "D:F:l:Z:c:h:p:U:xwWvP",
 							long_options, &option_index)) != -1)
 	{
 		switch (c)
@@ -901,6 +905,9 @@ main(int argc, char **argv)
 					exit(1);
 				}
 				break;
+			case 'x':
+				includewal = true;
+				break;
 			case 'l':
 				label = xstrdup(optarg);
 				break;
#17Fujii Masao
masao.fujii@gmail.com
In reply to: Magnus Hagander (#16)
Re: Include WAL in base backup

On Tue, Jan 25, 2011 at 12:33 AM, Magnus Hagander <magnus@hagander.net> wrote:

Here's an updated version of the patch:

* rebased on the current git master (including changing the switch
from -w to -x since -w was used now)
* includes some documentation
* use XLogByteToSeg and uint32 as mentioned here
* refactored to remove SendBackupDirectory(), removing the ugly closetar boolean

I reviewed the patch. Here are comments.

+ {"xlog", no_argument, NULL, 'w'},

Typo: s/w/x

/*
* BASE_BACKUP [LABEL <label>] [PROGRESS] [FAST]
*/

In repl_gram.y, the above comment needs to be updated.

Both this patch and the multiple-backup one removes SendBackupDirectory
in the almost same way. So, how about committing that common part first?

+ XLByteToSeg(endptr, endlogid, endlogseg);

XLByteToPrevSeg should be used for the backup end location. Because
when it's a boundary byte, all the required WAL records exist in the
previous WAL segment. This is why pg_stop_backup uses XLByteToPrevSeg
for the backup end location.

+		elog(DEBUG1, "Going to send wal from %i.%i to %i.%i",
+			 logid, logseg,
+			 endlogid, endlogseg);

%u should be used instead of %i. Or how about logging the filename?

+ char xlogname[64];

How about using MAXFNAMELEN instead of 64?

+			XLogFileName(xlogname, ThisTimeLineID, logid, logseg);
+			sprintf(fn, "./pg_xlog/%s", xlogname);

How about using XLogFilePath? instead?

+			if (logid > endptr.xlogid ||
+				(logid == endptr.xlogid && logseg >= endptr.xrecoff / XLogSegSize))

Why don't you use endlogseg?

The estimated size of $PGDATA doesn't include WAL files, but the
actual one does.
This inconsistency messes up the progress report as follows:

33708/17323 kB (194%) 1/1 tablespaces

So, what about sparating the progress report into two parts: one for $PGDATA and
tablespaces, another for WAL files?

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#18Magnus Hagander
magnus@hagander.net
In reply to: Fujii Masao (#17)
Re: Include WAL in base backup

On Tue, Jan 25, 2011 at 10:56, Fujii Masao <masao.fujii@gmail.com> wrote:

On Tue, Jan 25, 2011 at 12:33 AM, Magnus Hagander <magnus@hagander.net> wrote:

Here's an updated version of the patch:

* rebased on the current git master (including changing the switch
from -w to -x since -w was used now)
* includes some documentation
* use XLogByteToSeg and uint32 as mentioned here
* refactored to remove SendBackupDirectory(), removing the ugly closetar boolean

I reviewed the patch. Here are comments.

+               {"xlog", no_argument, NULL, 'w'},

Typo: s/w/x

Ah, oops. thanks.

/*
 * BASE_BACKUP [LABEL <label>] [PROGRESS] [FAST]
 */

In repl_gram.y, the above comment needs to be updated.

Same here - oops, thanks. It was also missing the quotes around <label>.

Both this patch and the multiple-backup one removes SendBackupDirectory
in the almost same way. So, how about committing that common part first?

I think they're small enough that we'll just commit it along with
whichever comes first and then have the other one merge with it.

+               XLByteToSeg(endptr, endlogid, endlogseg);

XLByteToPrevSeg should be used for the backup end location. Because
when it's a boundary byte, all the required WAL records exist in the
previous WAL segment. This is why pg_stop_backup uses XLByteToPrevSeg
for the backup end location.

Well, it's just for debugging output, really, but see below.

+               elog(DEBUG1, "Going to send wal from %i.%i to %i.%i",
+                        logid, logseg,
+                        endlogid, endlogseg);

%u should be used instead of %i. Or how about logging the filename?

I actually plan to remove that DEBUG output completely (sorry, forgot
to mention that), I'm not sure it's useful to have it around once we
apply. Or do you think it would be useful to keep?

+                       char    xlogname[64];

How about using MAXFNAMELEN instead of 64?

Agreed.

+                       XLogFileName(xlogname, ThisTimeLineID, logid, logseg);
+                       sprintf(fn, "./pg_xlog/%s", xlogname);

How about using XLogFilePath? instead?

Can't do that, because we need the "./" part when we call sendFile() -
it's structured around having that one, since we get it from the file
name looping.

+                       if (logid > endptr.xlogid ||
+                               (logid == endptr.xlogid && logseg >= endptr.xrecoff / XLogSegSize))

Why don't you use endlogseg?

Honestly? Because I thought I added endlogseg just for the debugging
output, didn't realize I had it down here.

But if I do, then I should not use the XLByteToPrevSeg() per your
suggestion above, right? Keep it the way it is.

The estimated size of $PGDATA doesn't include WAL files, but the
actual one does.
This inconsistency messes up the progress report as follows:

   33708/17323 kB (194%) 1/1 tablespaces

Yeah, that is definitely a potential problem. I think we're just going
to have to document it - and in a big database, it's not going to be
quite as bad...

So, what about sparating the progress report into two parts: one for $PGDATA and
tablespaces, another for WAL files?

We can't really do that. We need to send the progress report before we
begin the tar file, and we don't know how many xlog segments we will
have at that time. And we don't want to send pg_xlog as a separate tar
stream, because if we do we won't be able to get the single-tar-output
in the simple case - thus no piping etc. I actually had the xlog data
as it's own tar stream in the beginning, but Heikki convinced me of
the advantage of keeping it in the main one...

What we could do, I guess, is to code pg_basebackup to detect when it
starts receiving xlog files and then stop increasing the percentage at
that point, instead just doing a counter?

(the discussed changse above have been applied and pushed to github)

--
 Magnus Hagander
 Me: http://www.hagander.net/
 Work: http://www.redpill-linpro.com/

#19Fujii Masao
masao.fujii@gmail.com
In reply to: Magnus Hagander (#18)
Re: Include WAL in base backup

On Tue, Jan 25, 2011 at 8:21 PM, Magnus Hagander <magnus@hagander.net> wrote:

+               elog(DEBUG1, "Going to send wal from %i.%i to %i.%i",
+                        logid, logseg,
+                        endlogid, endlogseg);

%u should be used instead of %i. Or how about logging the filename?

I actually plan to remove that DEBUG output completely (sorry, forgot
to mention that), I'm not sure it's useful to have it around once we
apply. Or do you think it would be useful to keep?

Nope. I'm OK to remove that.

+                       XLogFileName(xlogname, ThisTimeLineID, logid, logseg);
+                       sprintf(fn, "./pg_xlog/%s", xlogname);

How about using XLogFilePath? instead?

Can't do that, because we need the "./" part when we call sendFile() -
it's structured around having that one, since we get it from the file
name looping.

Understood.

+                       if (logid > endptr.xlogid ||
+                               (logid == endptr.xlogid && logseg >= endptr.xrecoff / XLogSegSize))

Why don't you use endlogseg?

Honestly? Because I thought I added endlogseg just for the debugging
output, didn't realize I had it down here.

But if I do, then I should not use the XLByteToPrevSeg() per your
suggestion above, right? Keep it the way it is.

Yes. If we check "logseg >= endlogseg", we should use XLByteToSeg.

So, what about sparating the progress report into two parts: one for $PGDATA and
tablespaces, another for WAL files?

We can't really do that. We need to send the progress report before we
begin the tar file, and we don't know how many xlog segments we will
have at that time. And we don't want to send pg_xlog as a separate tar
stream, because if we do we won't be able to get the single-tar-output
in the simple case - thus no piping etc. I actually had the xlog data
as it's own tar stream in the beginning, but Heikki convinced me of
the advantage of keeping it in the main one...

What we could do, I guess, is to code pg_basebackup to detect when it
starts receiving xlog files and then stop increasing the percentage at
that point, instead just doing a counter?

Umm.. not progressing the report during receiving WAL files seems
also odd. But I don't have another good idea... For now, I'm OK if the
behavior of the progress report is well documented.

(the discussed changse above have been applied and pushed to github)

Thanks! I'll test and review that.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#20Fujii Masao
masao.fujii@gmail.com
In reply to: Fujii Masao (#19)
Re: Include WAL in base backup

On Tue, Jan 25, 2011 at 10:28 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

(the discussed changse above have been applied and pushed to github)

Thanks! I'll test and review that.

WAL file might get recycled or removed while walsender is reading it.
So the WAL file which pg_basebackup seemingly received successfully
might be actually invalid. Shouldn't we need to check that what we read
is valid as XLogRead does?

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#21Magnus Hagander
magnus@hagander.net
In reply to: Fujii Masao (#20)
Re: Include WAL in base backup

On Tue, Jan 25, 2011 at 15:04, Fujii Masao <masao.fujii@gmail.com> wrote:

On Tue, Jan 25, 2011 at 10:28 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

(the discussed changse above have been applied and pushed to github)

Thanks! I'll test and review that.

WAL file might get recycled or removed while walsender is reading it.
So the WAL file which pg_basebackup seemingly received successfully
might be actually invalid. Shouldn't we need to check that what we read
is valid as XLogRead does?

Yikes. Yes, you are correct. We do catch the case when it is removed,
but not when it's recycled.

I'll need to look at that. Not sure if I'll have time today, but I'll
do it as soon as I can :-)

--
 Magnus Hagander
 Me: http://www.hagander.net/
 Work: http://www.redpill-linpro.com/

#22Magnus Hagander
magnus@hagander.net
In reply to: Magnus Hagander (#21)
1 attachment(s)
Re: Include WAL in base backup

On Tue, Jan 25, 2011 at 16:34, Magnus Hagander <magnus@hagander.net> wrote:

On Tue, Jan 25, 2011 at 15:04, Fujii Masao <masao.fujii@gmail.com> wrote:

On Tue, Jan 25, 2011 at 10:28 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

(the discussed changse above have been applied and pushed to github)

Thanks! I'll test and review that.

WAL file might get recycled or removed while walsender is reading it.
So the WAL file which pg_basebackup seemingly received successfully
might be actually invalid. Shouldn't we need to check that what we read
is valid as XLogRead does?

We should, and the easiest way is to actually use XLogRead() since the
code is already there. How about the way in this patch?

--
 Magnus Hagander
 Me: http://www.hagander.net/
 Work: http://www.redpill-linpro.com/

Attachments:

basebackup_wal.patchtext/x-patch; charset=US-ASCII; name=basebackup_wal.patchDownload
diff --git a/doc/src/sgml/protocol.sgml b/doc/src/sgml/protocol.sgml
index 73f26b4..0775b7a 100644
--- a/doc/src/sgml/protocol.sgml
+++ b/doc/src/sgml/protocol.sgml
@@ -1460,7 +1460,7 @@ The commands accepted in walsender mode are:
   </varlistentry>
 
   <varlistentry>
-    <term>BASE_BACKUP [<literal>LABEL</literal> <replaceable>'label'</replaceable>] [<literal>PROGRESS</literal>] [<literal>FAST</literal>]</term>
+    <term>BASE_BACKUP [<literal>LABEL</literal> <replaceable>'label'</replaceable>] [<literal>PROGRESS</literal>] [<literal>FAST</literal>] [<literal>WAL</literal>]</term>
     <listitem>
      <para>
       Instructs the server to start streaming a base backup.
@@ -1505,6 +1505,18 @@ The commands accepted in walsender mode are:
          </para>
         </listitem>
        </varlistentry>
+
+       <varlistentry>
+        <term><literal>WAL</literal></term>
+        <listitem>
+         <para>
+          Include the necessary WAL segments in the backup. This will include
+          all the files between start and stop backup in the
+          <filename>pg_xlog</filename> directory of the base directory tar
+          file.
+         </para>
+        </listitem>
+       </varlistentry>
       </variablelist>
      </para>
      <para>
@@ -1561,7 +1573,10 @@ The commands accepted in walsender mode are:
        </listitem>
        <listitem>
         <para>
-         <filename>pg_xlog</> (including subdirectories)
+         <filename>pg_xlog</>, including subdirectories. If the backup is run
+         with wal files included, a synthesized version of pg_xlog will be
+         included, but it will only contain the files necessary for the
+         backup to work, not the rest of the contents.
         </para>
        </listitem>
       </itemizedlist>
diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml
index 321c8ca..60ffa3c 100644
--- a/doc/src/sgml/ref/pg_basebackup.sgml
+++ b/doc/src/sgml/ref/pg_basebackup.sgml
@@ -145,6 +145,31 @@ PostgreSQL documentation
      </varlistentry>
 
      <varlistentry>
+      <term><option>-x</option></term>
+      <term><option>--xlog</option></term>
+      <listitem>
+       <para>
+        Includes the required transaction log files (WAL files) in the
+        backup. This will include all transaction logs generated during
+        the backup. If this option is specified, it is possible to start
+        a postmaster directly in the extracted directory without the need
+        to consult the log archive, thus making this a completely standalone
+        backup.
+       </para>
+       <note>
+        <para>
+         The transaction log files are collected at the end of the backup.
+         Therefore, it is necessary for the
+         <xref linkend="guc-wal-keep-segments"> parameter to be set high
+         enough that the log is not removed before the end of the backup.
+         If the log has been rotated when it's time to transfer it, the
+         backup will fail and be unusable.
+        </para>
+       </note>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
       <term><option>-Z <replaceable class="parameter">level</replaceable></option></term>
       <term><option>--compress=<replaceable class="parameter">level</replaceable></option></term>
       <listitem>
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index 251ed8e..819419e 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -37,6 +37,7 @@ typedef struct
 	const char *label;
 	bool		progress;
 	bool		fastcheckpoint;
+	bool		includewal;
 }	basebackup_options;
 
 
@@ -46,11 +47,17 @@ static void _tarWriteHeader(char *filename, char *linktarget,
 				struct stat * statbuf);
 static void send_int8_string(StringInfoData *buf, int64 intval);
 static void SendBackupHeader(List *tablespaces);
-static void SendBackupDirectory(char *location, char *spcoid);
 static void base_backup_cleanup(int code, Datum arg);
 static void perform_base_backup(basebackup_options *opt, DIR *tblspcdir);
 static void parse_basebackup_options(List *options, basebackup_options *opt);
 
+/*
+ * Size of each block sent into the tar stream for larger files.
+ *
+ * XLogSegSize *MUST* be evenly dividable by this
+ */
+#define TAR_SEND_SIZE 32768
+
 typedef struct
 {
 	char	   *oid;
@@ -78,7 +85,10 @@ base_backup_cleanup(int code, Datum arg)
 static void
 perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
 {
-	do_pg_start_backup(opt->label, opt->fastcheckpoint);
+	XLogRecPtr	startptr;
+	XLogRecPtr	endptr;
+
+	startptr = do_pg_start_backup(opt->label, opt->fastcheckpoint);
 
 	PG_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0);
 	{
@@ -87,12 +97,6 @@ perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
 		struct dirent *de;
 		tablespaceinfo *ti;
 
-
-		/* Add a node for the base directory */
-		ti = palloc0(sizeof(tablespaceinfo));
-		ti->size = opt->progress ? sendDir(".", 1, true) : -1;
-		tablespaces = lappend(tablespaces, ti);
-
 		/* Collect information about all tablespaces */
 		while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
 		{
@@ -120,6 +124,10 @@ perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
 			tablespaces = lappend(tablespaces, ti);
 		}
 
+		/* Add a node for the base directory at the end */
+		ti = palloc0(sizeof(tablespaceinfo));
+		ti->size = opt->progress ? sendDir(".", 1, true) : -1;
+		tablespaces = lappend(tablespaces, ti);
 
 		/* Send tablespace header */
 		SendBackupHeader(tablespaces);
@@ -128,13 +136,106 @@ perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
 		foreach(lc, tablespaces)
 		{
 			tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
+			StringInfoData	buf;
+
+			/* Send CopyOutResponse message */
+			pq_beginmessage(&buf, 'H');
+			pq_sendbyte(&buf, 0);		/* overall format */
+			pq_sendint(&buf, 0, 2);		/* natts */
+			pq_endmessage(&buf);
+
+			sendDir(ti->path == NULL ? "." : ti->path,
+					ti->path == NULL ? 1 : strlen(ti->path),
+					false);
 
-			SendBackupDirectory(ti->path, ti->oid);
+			/*
+			 * If we're including WAL, and this is the main data directory
+			 * we don't terminate the tar stream here. Instead, we will append
+			 * the xlog files below and terminate it then. This is safe since
+			 * the main data directory is always sent *last*.
+			 */
+			if (opt->includewal && ti->path == NULL)
+			{
+				Assert(lnext(lc) == NULL);
+			}
+			else
+				pq_putemptymessage('c'); /* CopyDone */
 		}
 	}
 	PG_END_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0);
 
-	do_pg_stop_backup();
+	endptr = do_pg_stop_backup();
+
+	if (opt->includewal)
+	{
+		/*
+		 * We've left the last tar file "open", so we can now append the
+		 * required WAL files to it.
+		 */
+		uint32 	logid, logseg;
+		uint32	endlogid, endlogseg;
+		struct stat statbuf;
+
+		MemSet(&statbuf, 0, sizeof(statbuf));
+		statbuf.st_mode=S_IRUSR | S_IWUSR;
+#ifndef WIN32
+		statbuf.st_uid=getuid();
+		statbuf.st_gid=getgid();
+#endif
+		statbuf.st_size=XLogSegSize;
+		statbuf.st_mtime=time(NULL);
+
+		XLByteToSeg(startptr, logid, logseg);
+		XLByteToSeg(endptr, endlogid, endlogseg);
+		elog(LOG, "Going to send wal from %u.%u to %u.%u",
+			 logid, logseg,
+			 endlogid, endlogseg);
+
+		while (true)
+		{
+			/* Send another xlog segment */
+			char	xlogname[MAXFNAMELEN];
+			char	fn[MAXPGPATH];
+			int		i;
+
+			/* Send the current WAL file */
+			XLogFileName(xlogname, ThisTimeLineID, logid, logseg);
+
+			sprintf(fn, "./pg_xlog/%s", xlogname);
+			_tarWriteHeader(fn, NULL, &statbuf);
+
+			/* Send the actual WAL file contents, by reading block-by-block */
+			for (i = 0; i < XLogSegSize / TAR_SEND_SIZE; i++)
+			{
+				char		buf[TAR_SEND_SIZE];
+				XLogRecPtr	ptr;
+
+				ptr.xlogid = logid;
+				ptr.xrecoff = logseg * XLogSegSize + TAR_SEND_SIZE * i;
+
+				XLogRead(buf, ptr, TAR_SEND_SIZE);
+				if (pq_putmessage('d', buf, TAR_SEND_SIZE))
+					ereport(ERROR,
+							(errmsg("base backup could not send data, aborting backup")));
+			}
+			/*
+			 * Files are always fixed size, and always end on a 512 byte
+			 * boundary, so padding is never necessary.
+			 */
+
+
+			/* Advance to the next WAL file */
+			NextLogSeg(logid, logseg);
+
+			/* Have we reached our stop position yet? */
+			if (logid > endlogid ||
+				(logid == endlogid && logseg >= endlogseg))
+				break;
+		}
+
+		/* Send CopyDone message for the last tar file*/
+		pq_putemptymessage('c');
+	}
 }
 
 /*
@@ -147,6 +248,7 @@ parse_basebackup_options(List *options, basebackup_options *opt)
 	bool		o_label = false;
 	bool		o_progress = false;
 	bool		o_fast = false;
+	bool		o_wal = false;
 
 	MemSet(opt, 0, sizeof(*opt));
 	foreach(lopt, options)
@@ -180,6 +282,15 @@ parse_basebackup_options(List *options, basebackup_options *opt)
 			opt->fastcheckpoint = true;
 			o_fast = true;
 		}
+		else if (strcmp(defel->defname, "wal") == 0)
+		{
+			if (o_wal)
+				ereport(ERROR,
+						(errcode(ERRCODE_SYNTAX_ERROR),
+						 errmsg("duplicate option \"%s\"", defel->defname)));
+			opt->includewal = true;
+			o_wal = true;
+		}
 		else
 			elog(ERROR, "option \"%s\" not recognized",
 				 defel->defname);
@@ -316,26 +427,6 @@ SendBackupHeader(List *tablespaces)
 	pq_puttextmessage('C', "SELECT");
 }
 
-static void
-SendBackupDirectory(char *location, char *spcoid)
-{
-	StringInfoData buf;
-
-	/* Send CopyOutResponse message */
-	pq_beginmessage(&buf, 'H');
-	pq_sendbyte(&buf, 0);		/* overall format */
-	pq_sendint(&buf, 0, 2);		/* natts */
-	pq_endmessage(&buf);
-
-	/* tar up the data directory if NULL, otherwise the tablespace */
-	sendDir(location == NULL ? "." : location,
-			location == NULL ? 1 : strlen(location),
-			false);
-
-	/* Send CopyDone message */
-	pq_putemptymessage('c');
-}
-
 
 static int64
 sendDir(char *path, int basepathlen, bool sizeonly)
@@ -506,7 +597,7 @@ static void
 sendFile(char *filename, int basepathlen, struct stat * statbuf)
 {
 	FILE	   *fp;
-	char		buf[32768];
+	char		buf[TAR_SEND_SIZE];
 	size_t		cnt;
 	pgoff_t		len = 0;
 	size_t		pad;
diff --git a/src/backend/replication/repl_gram.y b/src/backend/replication/repl_gram.y
index 879a0bd..e1a4a51 100644
--- a/src/backend/replication/repl_gram.y
+++ b/src/backend/replication/repl_gram.y
@@ -71,6 +71,7 @@ Node *replication_parse_result;
 %token K_LABEL
 %token K_PROGRESS
 %token K_FAST
+%token K_WAL
 %token K_START_REPLICATION
 
 %type <node>	command
@@ -106,7 +107,7 @@ identify_system:
 			;
 
 /*
- * BASE_BACKUP [LABEL <label>] [PROGRESS] [FAST]
+ * BASE_BACKUP [LABEL '<label>'] [PROGRESS] [FAST] [WAL]
  */
 base_backup:
 			K_BASE_BACKUP base_backup_opt_list
@@ -136,7 +137,12 @@ base_backup_opt:
 				  $$ = makeDefElem("fast",
 						   (Node *)makeInteger(TRUE));
 				}
-
+			| K_WAL
+				{
+				  $$ = makeDefElem("wal",
+						   (Node *)makeInteger(TRUE));
+				}
+			;
 
 /*
  * START_REPLICATION %X/%X
diff --git a/src/backend/replication/repl_scanner.l b/src/backend/replication/repl_scanner.l
index e6dfb04..87e77d9 100644
--- a/src/backend/replication/repl_scanner.l
+++ b/src/backend/replication/repl_scanner.l
@@ -61,6 +61,7 @@ FAST			{ return K_FAST; }
 IDENTIFY_SYSTEM		{ return K_IDENTIFY_SYSTEM; }
 LABEL			{ return K_LABEL; }
 PROGRESS			{ return K_PROGRESS; }
+WAL			{ return K_WAL; }
 START_REPLICATION	{ return K_START_REPLICATION; }
 ","				{ return ','; }
 ";"				{ return ';'; }
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index 44efa9f..f70458e 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -105,7 +105,6 @@ static int	WalSndLoop(void);
 static void InitWalSnd(void);
 static void WalSndHandshake(void);
 static void WalSndKill(int code, Datum arg);
-static void XLogRead(char *buf, XLogRecPtr recptr, Size nbytes);
 static bool XLogSend(char *msgbuf, bool *caughtup);
 static void CheckClosedConnection(void);
 static void IdentifySystem(void);
@@ -649,8 +648,13 @@ WalSndKill(int code, Datum arg)
  *
  * XXX probably this should be improved to suck data directly from the
  * WAL buffers when possible.
+ *
+ * Will open, and keep open, one WAL segment stored in the global file
+ * descriptor sendFile. This means if XLogRead is used once, there will
+ * always be one descriptor left open until the process ends, but never
+ * more than one.
  */
-static void
+void
 XLogRead(char *buf, XLogRecPtr recptr, Size nbytes)
 {
 	XLogRecPtr	startRecPtr = recptr;
diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index 5363034..ef2718a 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -33,6 +33,7 @@ char	   *label = "pg_basebackup base backup";
 bool		showprogress = false;
 int			verbose = 0;
 int			compresslevel = 0;
+bool		includewal = false;
 bool		fastcheckpoint = false;
 char	   *dbhost = NULL;
 char	   *dbuser = NULL;
@@ -124,6 +125,7 @@ usage(void)
 	printf(_("\nOptions controlling the output:\n"));
 	printf(_("  -D, --pgdata=directory    receive base backup into directory\n"));
 	printf(_("  -F, --format=p|t          output format (plain, tar)\n"));
+	printf(_("  -x, --xlog                include required WAL files in backup\n"));
 	printf(_("  -Z, --compress=0-9        compress tar output\n"));
 	printf(_("\nGeneral options:\n"));
 	printf(_("  -c, --checkpoint=fast|spread\n"
@@ -200,16 +202,20 @@ verify_dir_is_empty_or_create(char *dirname)
 static void
 progress_report(int tablespacenum, char *fn)
 {
+	int percent = (int) ((totaldone / 1024) * 100 / totalsize);
+	if (percent > 100)
+		percent = 100;
+
 	if (verbose)
 		fprintf(stderr,
 				INT64_FORMAT "/" INT64_FORMAT " kB (%i%%) %i/%i tablespaces (%-30s)\r",
 				totaldone / 1024, totalsize,
-				(int) ((totaldone / 1024) * 100 / totalsize),
+				percent,
 				tablespacenum, tablespacecount, fn);
 	else
 		fprintf(stderr, INT64_FORMAT "/" INT64_FORMAT " kB (%i%%) %i/%i tablespaces\r",
 				totaldone / 1024, totalsize,
-				(int) ((totaldone / 1024) * 100 / totalsize),
+				percent,
 				tablespacenum, tablespacecount);
 }
 
@@ -746,9 +752,10 @@ BaseBackup()
 	conn = GetConnection();
 
 	PQescapeStringConn(conn, escaped_label, label, sizeof(escaped_label), &i);
-	snprintf(current_path, sizeof(current_path), "BASE_BACKUP LABEL '%s' %s %s",
+	snprintf(current_path, sizeof(current_path), "BASE_BACKUP LABEL '%s' %s %s %s",
 			 escaped_label,
 			 showprogress ? "PROGRESS" : "",
+			 includewal ? "WAL" : "",
 			 fastcheckpoint ? "FAST" : "");
 
 	if (PQsendQuery(conn, current_path) == 0)
@@ -789,7 +796,7 @@ BaseBackup()
 		 * first once since it can be relocated, and it will be checked before
 		 * we do anything anyway.
 		 */
-		if (format == 'p' && i > 0)
+		if (format == 'p' && !PQgetisnull(res, i, 1))
 			verify_dir_is_empty_or_create(PQgetvalue(res, i, 1));
 	}
 
@@ -848,6 +855,7 @@ main(int argc, char **argv)
 		{"pgdata", required_argument, NULL, 'D'},
 		{"format", required_argument, NULL, 'F'},
 		{"checkpoint", required_argument, NULL, 'c'},
+		{"xlog", no_argument, NULL, 'x'},
 		{"compress", required_argument, NULL, 'Z'},
 		{"label", required_argument, NULL, 'l'},
 		{"host", required_argument, NULL, 'h'},
@@ -881,7 +889,7 @@ main(int argc, char **argv)
 		}
 	}
 
-	while ((c = getopt_long(argc, argv, "D:F:l:Z:c:h:p:U:wWvP",
+	while ((c = getopt_long(argc, argv, "D:F:l:Z:c:h:p:U:xwWvP",
 							long_options, &option_index)) != -1)
 	{
 		switch (c)
@@ -901,6 +909,9 @@ main(int argc, char **argv)
 					exit(1);
 				}
 				break;
+			case 'x':
+				includewal = true;
+				break;
 			case 'l':
 				label = xstrdup(optarg);
 				break;
diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h
index bd9e193..8a87311 100644
--- a/src/include/replication/walsender.h
+++ b/src/include/replication/walsender.h
@@ -67,6 +67,8 @@ extern Size WalSndShmemSize(void);
 extern void WalSndShmemInit(void);
 extern void WalSndWakeup(void);
 extern void WalSndSetState(WalSndState state);
+extern void XLogRead(char *buf, XLogRecPtr recptr, Size nbytes);
+
 
 extern Datum pg_stat_get_wal_senders(PG_FUNCTION_ARGS);
 
#23Fujii Masao
masao.fujii@gmail.com
In reply to: Magnus Hagander (#22)
Re: Include WAL in base backup

On Wed, Jan 26, 2011 at 5:17 AM, Magnus Hagander <magnus@hagander.net> wrote:

We should, and the easiest way is to actually use XLogRead() since the
code is already there. How about the way in this patch?

Thanks for the update. I reread the patch.

+		MemSet(&statbuf, 0, sizeof(statbuf));
+		statbuf.st_mode=S_IRUSR | S_IWUSR;
+#ifndef WIN32
+		statbuf.st_uid=getuid();
+		statbuf.st_gid=getgid();
+#endif
+		statbuf.st_size=XLogSegSize;
+		statbuf.st_mtime=time(NULL);

Can you put a space around "="?

In the multiple-backups patch, Heikki uses geteuid and getegid for
the same purpose instead of getuid and getgid, as follows.

! /* Windows doesn't have the concept of uid and gid */
! #ifdef WIN32
! statbuf.st_uid = 0;
! statbuf.st_gid = 0;
! #else
! statbuf.st_uid = geteuid();
! statbuf.st_gid = getegid();
! #endif
! statbuf.st_mtime = time(NULL);
! statbuf.st_mode = S_IRUSR | S_IWUSR;
! statbuf.st_size = len;

Which is correct? Since we cannot start the PostgreSQL when
getuid != geteuid, I don't think that there is really difference between
getuid and geteuid. But other code always uses geteuid, so I think
that geteuid should be used here instead of getuid for the sake of
consistency.

OTOH, I'm not sure if there is really difference between getgid and
getegid in the backend (though I guess getgid == getegid because
getuid == geteuid), and which should be used here.
What is your opinion?

+			XLogFileName(xlogname, ThisTimeLineID, logid, logseg);
+
+			sprintf(fn, "./pg_xlog/%s", xlogname);
+			_tarWriteHeader(fn, NULL, &statbuf);

Can we use XLogFilePath? instead? Because sendFile has not been
used.

+		XLByteToSeg(endptr, endlogid, endlogseg);
<snip>
+			/* Have we reached our stop position yet? */
+			if (logid > endlogid ||
+				(logid == endlogid && logseg >= endlogseg))
+				break;

What I said in upthread is wrong. We should use XLByteToPrevSeg
for endptr and check "logseg > endlogseg". Otherwise, if endptr is
not a boundary byte, endlogid/endlogseg indicates the last
necessary WAL file, but it's not sent.

+ if (percent > 100)
+ percent = 100;

I'm not sure if this is good idea because the total received can go
further than the estimated size though the percentage stops at 100.
This looks more confusing than the previous behavior. Anyway,
I think that we need to document about the combination of -P and
-x in pg_basebackup.

In pg_basebackup.sgml
<term><option>--checkpoint <replaceable
class="parameter">fast|spread</replaceable></option></term>

Though this is not the problem of the patch, I found the inconsistency
of the descriptions about options of pg_basebackup. We should
s/--checkpoint/--checkpoint=

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#24Heikki Linnakangas
heikki.linnakangas@enterprisedb.com
In reply to: Fujii Masao (#23)
Re: Include WAL in base backup

On 27.01.2011 06:44, Fujii Masao wrote:

+		XLByteToSeg(endptr, endlogid, endlogseg);
<snip>
+			/* Have we reached our stop position yet? */
+			if (logid>  endlogid ||
+				(logid == endlogid&&  logseg>= endlogseg))
+				break;

What I said in upthread is wrong. We should use XLByteToPrevSeg
for endptr and check "logseg> endlogseg". Otherwise, if endptr is
not a boundary byte, endlogid/endlogseg indicates the last
necessary WAL file, but it's not sent.

We should use XLByteToPrevSeg, but I believe >= is still correct.
logid/logseg is the last WAL segment we've successfully sent, and
endlogif/endlogid is the last WAL segment we need to send. When they are
the same, we're done.

--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com

#25Fujii Masao
masao.fujii@gmail.com
In reply to: Heikki Linnakangas (#24)
Re: Include WAL in base backup

On Sat, Jan 29, 2011 at 6:02 AM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

On 27.01.2011 06:44, Fujii Masao wrote:

+               XLByteToSeg(endptr, endlogid, endlogseg);
<snip>
+                       /* Have we reached our stop position yet? */
+                       if (logid>  endlogid ||
+                               (logid == endlogid&&  logseg>= endlogseg))
+                               break;

What I said in upthread is wrong. We should use XLByteToPrevSeg
for endptr and check "logseg>  endlogseg". Otherwise, if endptr is
not a boundary byte, endlogid/endlogseg indicates the last
necessary WAL file, but it's not sent.

We should use XLByteToPrevSeg, but I believe >= is still correct.
logid/logseg is the last WAL segment we've successfully sent, and
endlogif/endlogid is the last WAL segment we need to send. When they are the
same, we're done.

Really? logid/logseg is incremented just before the check as follows.
So, when they are the same, the WAL file which logid/logseg indicates
has not been sent yet. Am I missing something?

+			/* Advance to the next WAL file */
+			NextLogSeg(logid, logseg);
+
+			/* Have we reached our stop position yet? */
+			if (logid > endlogid ||
+				(logid == endlogid && logseg >= endlogseg))
+				break;

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#26Heikki Linnakangas
heikki.linnakangas@enterprisedb.com
In reply to: Fujii Masao (#25)
Re: Include WAL in base backup

On 29.01.2011 09:10, Fujii Masao wrote:

On Sat, Jan 29, 2011 at 6:02 AM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:

On 27.01.2011 06:44, Fujii Masao wrote:

+               XLByteToSeg(endptr, endlogid, endlogseg);
<snip>
+                       /* Have we reached our stop position yet? */
+                       if (logid>    endlogid ||
+                               (logid == endlogid&&    logseg>= endlogseg))
+                               break;

What I said in upthread is wrong. We should use XLByteToPrevSeg
for endptr and check "logseg> endlogseg". Otherwise, if endptr is
not a boundary byte, endlogid/endlogseg indicates the last
necessary WAL file, but it's not sent.

We should use XLByteToPrevSeg, but I believe>= is still correct.
logid/logseg is the last WAL segment we've successfully sent, and
endlogif/endlogid is the last WAL segment we need to send. When they are the
same, we're done.

Really? logid/logseg is incremented just before the check as follows.
So, when they are the same, the WAL file which logid/logseg indicates
has not been sent yet. Am I missing something?

+			/* Advance to the next WAL file */
+			NextLogSeg(logid, logseg);
+
+			/* Have we reached our stop position yet? */
+			if (logid>  endlogid ||
+				(logid == endlogid&&  logseg>= endlogseg))
+				break;

Ah, you're right, I misread it. Never mind..

--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com

#27Magnus Hagander
magnus@hagander.net
In reply to: Fujii Masao (#23)
Re: Include WAL in base backup

On Thu, Jan 27, 2011 at 05:44, Fujii Masao <masao.fujii@gmail.com> wrote:

On Wed, Jan 26, 2011 at 5:17 AM, Magnus Hagander <magnus@hagander.net> wrote:

We should, and the easiest way is to actually use XLogRead() since the
code is already there. How about the way in this patch?

Thanks for the update. I reread the patch.

+               MemSet(&statbuf, 0, sizeof(statbuf));
+               statbuf.st_mode=S_IRUSR | S_IWUSR;
+#ifndef WIN32
+               statbuf.st_uid=getuid();
+               statbuf.st_gid=getgid();
+#endif
+               statbuf.st_size=XLogSegSize;
+               statbuf.st_mtime=time(NULL);

Can you put a space around "="?

I'll run pgindent, it'll take care of that.

Which is correct? Since we cannot start the PostgreSQL when
getuid != geteuid, I don't think that there is really difference between
getuid and geteuid. But other code always uses geteuid, so I think
that geteuid should be used here instead of getuid for the sake of
consistency.

Yeah, changed for consistency.

+                       XLogFileName(xlogname, ThisTimeLineID, logid, logseg);
+
+                       sprintf(fn, "./pg_xlog/%s", xlogname);
+                       _tarWriteHeader(fn, NULL, &statbuf);

Can we use XLogFilePath? instead? Because sendFile has not been
used.

We can now, changed.

What I said in upthread is wrong. We should use XLByteToPrevSeg
for endptr and check "logseg > endlogseg". Otherwise, if endptr is
not a boundary byte, endlogid/endlogseg indicates the last
necessary WAL file, but it's not sent.

Yeah, thanks for this - and thanks to Heikki for straightening it out for me.

+       if (percent > 100)
+               percent = 100;

I'm not sure if this is good idea because the total received can go
further than the estimated size though the percentage stops at 100.
This looks more confusing than the previous behavior. Anyway,
I think that we need to document about the combination of -P and
-x in pg_basebackup.

I found it less confusing - but still somewhat confusing. I'll add some docs.

In pg_basebackup.sgml
    <term><option>--checkpoint <replaceable
class="parameter">fast|spread</replaceable></option></term>

Though this is not the problem of the patch, I found the inconsistency
of the descriptions about options of pg_basebackup. We should
s/--checkpoint/--checkpoint=

Agreed, changed.

--
 Magnus Hagander
 Me: http://www.hagander.net/
 Work: http://www.redpill-linpro.com/

#28Fujii Masao
masao.fujii@gmail.com
In reply to: Magnus Hagander (#27)
Re: Include WAL in base backup

On Mon, Jan 31, 2011 at 4:22 AM, Magnus Hagander <magnus@hagander.net> wrote:

In pg_basebackup.sgml
    <term><option>--checkpoint <replaceable
class="parameter">fast|spread</replaceable></option></term>

Though this is not the problem of the patch, I found the inconsistency
of the descriptions about options of pg_basebackup. We should
s/--checkpoint/--checkpoint=

Agreed, changed.

Thanks for all your efforts! pg_basebackup and this new option are VERY useful.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center

#29Robert Haas
robertmhaas@gmail.com
In reply to: Fujii Masao (#28)
Re: Include WAL in base backup

On Sun, Jan 30, 2011 at 9:34 PM, Fujii Masao <masao.fujii@gmail.com> wrote:

On Mon, Jan 31, 2011 at 4:22 AM, Magnus Hagander <magnus@hagander.net> wrote:

In pg_basebackup.sgml
    <term><option>--checkpoint <replaceable
class="parameter">fast|spread</replaceable></option></term>

Though this is not the problem of the patch, I found the inconsistency
of the descriptions about options of pg_basebackup. We should
s/--checkpoint/--checkpoint=

Agreed, changed.

Thanks for all your efforts! pg_basebackup and this new option are VERY useful.

+1!

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company