Streaming a base backup from master
It's been discussed before that it would be cool if you could stream a
new base backup from the master server, via libpq. That way you would
not need low-level filesystem access to initialize a new standby.
Magnus mentioned today that he started hacking on that, and
coincidentally I just started experimenting with it yesterday as well
:-). So let's get this out on the mailing list.
Here's a WIP patch. It adds a new "TAKE_BACKUP" command to the
replication command set. Upon receiving that command, the master starts
a COPY, and streams a tarred copy of the data directory to the client.
The patch includes a simple command-line tool, pg_streambackup, to
connect to a server and request a backup that you can then redirect to a
.tar file or pipe to "tar x".
TODO:
* We need a smarter way to do pg_start/stop_backup() with this. At the
moment, you can only have one backup running at a time, but we shouldn't
have that limitation with this built-in mechanism.
* The streamed backup archive should contain all the necessary WAL files
too, so that you don't need to set up archiving to use this. You could
just point the tiny client tool to the server, and get a backup archive
containing everything that's necessary to restore correctly.
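As a rough sketch of that second item (not part of the attached patch), SendBaseBackup() could be extended along these lines, assuming the required segments are still present in pg_xlog; picking out only the start..stop segments, instead of shipping the whole directory, is left out:

/*
 * Hypothetical variant of SendBaseBackup() from the patch below: stop the
 * backup after the data directory has been streamed, then ship the WAL
 * files too, so the archive is self-contained. Selecting only the
 * segments between the start and stop locations is omitted here.
 */
void
SendBaseBackupWithWal(void)
{
    StringInfoData buf;

    DirectFunctionCall2(&pg_start_backup, CStringGetTextDatum("basebackup"),
                        BoolGetDatum(true));

    /* Send CopyOutResponse message, as in the patch */
    pq_beginmessage(&buf, 'H');
    pq_sendbyte(&buf, 0);       /* overall format */
    pq_sendint(&buf, 0, 2);     /* natts */
    pq_endmessage(&buf);

    sendDir(".");               /* data directory; top-level pg_xlog is skipped */

    DirectFunctionCall1(&pg_stop_backup, (Datum) 0);

    /*
     * Now that the end-of-backup location is known, ship the WAL files.
     * The patch's sendDir() only skips pg_xlog at the top level, so calling
     * it directly on the directory streams its contents.
     */
    sendDir("./pg_xlog");

    /* Send CopyDone message */
    pq_putemptymessage('c');
}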
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
Attachments:
basebackup-1.patch (text/x-diff)
diff --git a/src/backend/replication/Makefile b/src/backend/replication/Makefile
index 113dc3f..5bb4159 100644
--- a/src/backend/replication/Makefile
+++ b/src/backend/replication/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/replication
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
-OBJS = walsender.o walreceiverfuncs.o walreceiver.o
+OBJS = walsender.o walreceiverfuncs.o walreceiver.o basebackup.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
new file mode 100644
index 0000000..9be9bdf
--- /dev/null
+++ b/src/backend/replication/basebackup.c
@@ -0,0 +1,276 @@
+/*-------------------------------------------------------------------------
+ *
+ * basebackup.c
+ * code for taking a base backup and streaming it to a standby
+ *
+ * Portions Copyright (c) 2010-2010, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <time.h>
+
+#include "access/xlog_internal.h" /* for pg_start/stop_backup */
+#include "utils/builtins.h"
+#include "lib/stringinfo.h"
+#include "libpq/libpq.h"
+#include "libpq/pqformat.h"
+#include "replication/basebackup.h"
+#include "storage/fd.h"
+
+static void sendDir(char *path);
+static void sendFile(char *path);
+static void _tarWriteHeader(char *filename, uint64 fileLen);
+
+void
+SendBaseBackup(void)
+{
+ StringInfoData buf;
+
+ DirectFunctionCall2(&pg_start_backup, CStringGetTextDatum("basebackup"),
+ BoolGetDatum(true));
+
+ /* Send CopyOutResponse message */
+ pq_beginmessage(&buf, 'H');
+ pq_sendbyte(&buf, 0); /* overall format */
+ pq_sendint(&buf, 0, 2); /* natts */
+ pq_endmessage(&buf);
+
+ /* tar up the data directory */
+ sendDir(".");
+
+ /* Send CopyDone message */
+ pq_putemptymessage('c');
+
+ /* XXX: Is there no DirectFunctionCall0? */
+ DirectFunctionCall1(&pg_stop_backup, (Datum) 0);
+}
+
+static void
+sendDir(char *path)
+{
+ DIR *dir;
+ struct dirent *de;
+ char pathbuf[MAXPGPATH];
+ struct stat statbuf;
+
+ dir = AllocateDir(path);
+ while ((de = ReadDir(dir, path)) != NULL)
+ {
+ /* Skip special stuff */
+ if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+ continue;
+
+ snprintf(pathbuf, MAXPGPATH, "%s/%s", path, de->d_name);
+
+ /* Skip pg_xlog and postmaster.pid */
+ if (strcmp(pathbuf, "./pg_xlog") == 0)
+ continue;
+ if (strcmp(pathbuf, "./postmaster.pid") == 0)
+ continue;
+
+ if (lstat(pathbuf, &statbuf) != 0)
+ {
+ if (errno != ENOENT)
+ {
+ elog(WARNING, "could not stat file or directory \"%s\": %m",
+ pathbuf);
+ }
+ continue;
+ }
+
+ if (S_ISDIR(statbuf.st_mode))
+ {
+ /* call ourselves recursively for a directory */
+ sendDir(pathbuf);
+ }
+ else if (S_ISREG(statbuf.st_mode))
+ {
+ sendFile(pathbuf);
+ }
+ else
+ elog(WARNING, "skipping special file \"%s\"", pathbuf);
+ }
+ FreeDir(dir);
+}
+
+
+/***** Functions for handling tar file format, copied from pg_dump ******/
+
+/*
+ * Utility routine to print possibly larger than 32 bit integers in a
+ * portable fashion. Filled with zeros.
+ */
+static void
+print_val(char *s, uint64 val, unsigned int base, size_t len)
+{
+ int i;
+
+ for (i = len; i > 0; i--)
+ {
+ int digit = val % base;
+
+ s[i - 1] = '0' + digit;
+ val = val / base;
+ }
+}
+
+/*
+ * Maximum file size for a tar member: The limit inherent in the
+ * format is 2^33-1 bytes (nearly 8 GB). But we don't want to exceed
+ * what we can represent in pgoff_t.
+ */
+#define MAX_TAR_MEMBER_FILELEN (((int64) 1 << Min(33, sizeof(pgoff_t)*8 - 1)) - 1)
+
+static int
+_tarChecksum(char *header)
+{
+ int i,
+ sum;
+
+ sum = 0;
+ for (i = 0; i < 512; i++)
+ if (i < 148 || i >= 156)
+ sum += 0xFF & header[i];
+ return sum + 256; /* Assume 8 blanks in checksum field */
+}
+
+/* Given the member, write the TAR header & copy the file */
+static void
+sendFile(char *filename)
+{
+ FILE *fp;
+ char buf[32768];
+ size_t cnt;
+ pgoff_t len = 0;
+ size_t pad;
+ pgoff_t fileLen;
+
+ fp = AllocateFile(filename, "rb");
+ if (fp == NULL)
+ elog(ERROR, "could not open file \"%s\": %m", filename);
+
+ /*
+ * Find file len & go back to start.
+ */
+ fseeko(fp, 0, SEEK_END);
+ fileLen = ftello(fp);
+ fseeko(fp, 0, SEEK_SET);
+
+ /*
+ * Some compilers will throw a warning knowing this test can never be true
+ * because pgoff_t can't exceed the compared maximum on their platform.
+ */
+ if (fileLen > MAX_TAR_MEMBER_FILELEN)
+ elog(ERROR, "archive member too large for tar format");
+
+ _tarWriteHeader(filename, fileLen);
+
+ while ((cnt = fread(buf, 1, Min(sizeof(buf), fileLen - len), fp)) > 0)
+ {
+ /* Send the chunk as a CopyData message */
+ pq_putmessage('d', buf, cnt);
+ len += cnt;
+
+ if (len >= fileLen)
+ {
+ /*
+ * Reached end of file. The file could be longer, if it was
+ * extended while we were sending it, but for a base backup we
+ * can ignore such extended data. It will be restored from WAL.
+ */
+ break;
+ }
+ }
+ /* If the file was truncated while we were sending it, pad it with zeros */
+ if (len < fileLen)
+ {
+ MemSet(buf, 0, sizeof(buf));
+ while(len < fileLen)
+ {
+ cnt = Min(sizeof(buf), fileLen - len);
+ pq_putmessage('d', buf, cnt);
+ len += cnt;
+ }
+ }
+
+ /* Pad to 512 byte boundary */
+ pad = ((len + 511) & ~511) - len;
+ MemSet(buf, 0, pad);
+ pq_putmessage('d', buf, pad);
+
+ FreeFile(fp);
+}
+
+
+static void
+_tarWriteHeader(char *filename, uint64 fileLen)
+{
+ char h[512];
+ int lastSum = 0;
+ int sum;
+
+ memset(h, 0, sizeof(h));
+
+ /* Name 100 */
+ sprintf(&h[0], "%.99s", filename);
+
+ /* Mode 8 */
+ sprintf(&h[100], "100600 ");
+
+ /* User ID 8 */
+ sprintf(&h[108], "004000 ");
+
+ /* Group 8 */
+ sprintf(&h[116], "002000 ");
+
+ /* File size 12 - 11 digits, 1 space, no NUL */
+ print_val(&h[124], fileLen, 8, 11);
+ sprintf(&h[135], " ");
+
+ /* Mod Time 12 */
+ sprintf(&h[136], "%011o ", (int) time(NULL));
+
+ /* Checksum 8 */
+ sprintf(&h[148], "%06o ", lastSum);
+
+ /* Type - regular file */
+ sprintf(&h[156], "0");
+
+ /* Link tag 100 (NULL) */
+
+ /* Magic 6 + Version 2 */
+ sprintf(&h[257], "ustar00");
+
+#if 0
+ /* User 32 */
+ sprintf(&h[265], "%.31s", ""); /* How do I get username reliably? Do
+ * I need to? */
+
+ /* Group 32 */
+ sprintf(&h[297], "%.31s", ""); /* How do I get group reliably? Do I
+ * need to? */
+
+ /* Maj Dev 8 */
+ sprintf(&h[329], "%6o ", 0);
+
+ /* Min Dev 8 */
+ sprintf(&h[337], "%6o ", 0);
+#endif
+
+ while ((sum = _tarChecksum(h)) != lastSum)
+ {
+ sprintf(&h[148], "%06o ", sum);
+ lastSum = sum;
+ }
+
+ pq_putmessage('d', h, 512);
+}
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index 53c2581..291ec0b 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -42,6 +42,7 @@
#include "libpq/pqformat.h"
#include "libpq/pqsignal.h"
#include "miscadmin.h"
+#include "replication/basebackup.h"
#include "replication/walprotocol.h"
#include "replication/walsender.h"
#include "storage/fd.h"
@@ -294,6 +295,14 @@ WalSndHandshake(void)
/* break out of the loop */
replication_started = true;
}
+ else if (strcmp(query_string, "TAKE_BACKUP") == 0)
+ {
+ SendBaseBackup();
+ /* Send CommandComplete and ReadyForQuery messages */
+ EndCommand("SELECT", DestRemote);
+ ReadyForQuery(DestRemote);
+ /* ReadyForQuery did pq_flush for us */
+ }
else
{
ereport(FATAL,
diff --git a/src/bin/scripts/Makefile b/src/bin/scripts/Makefile
index d82c067..99f62a2 100644
--- a/src/bin/scripts/Makefile
+++ b/src/bin/scripts/Makefile
@@ -16,7 +16,7 @@ subdir = src/bin/scripts
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global
-PROGRAMS = createdb createlang createuser dropdb droplang dropuser clusterdb vacuumdb reindexdb
+PROGRAMS = createdb createlang createuser dropdb droplang dropuser clusterdb vacuumdb reindexdb streambackup
override CPPFLAGS := -I$(top_srcdir)/src/bin/pg_dump -I$(top_srcdir)/src/bin/psql -I$(libpq_srcdir) $(CPPFLAGS)
@@ -34,6 +34,7 @@ dropuser: dropuser.o common.o dumputils.o kwlookup.o keywords.o
clusterdb: clusterdb.o common.o dumputils.o kwlookup.o keywords.o
vacuumdb: vacuumdb.o common.o
reindexdb: reindexdb.o common.o dumputils.o kwlookup.o keywords.o
+streambackup: streambackup.o common.o
dumputils.c keywords.c: % : $(top_srcdir)/src/bin/pg_dump/%
rm -f $@ && $(LN_S) $< .
@@ -54,6 +55,7 @@ install: all installdirs
$(INSTALL_PROGRAM) clusterdb$(X) '$(DESTDIR)$(bindir)'/clusterdb$(X)
$(INSTALL_PROGRAM) vacuumdb$(X) '$(DESTDIR)$(bindir)'/vacuumdb$(X)
$(INSTALL_PROGRAM) reindexdb$(X) '$(DESTDIR)$(bindir)'/reindexdb$(X)
+ $(INSTALL_PROGRAM) streambackup$(X) '$(DESTDIR)$(bindir)'/streambackup$(X)
installdirs:
$(MKDIR_P) '$(DESTDIR)$(bindir)'
diff --git a/src/bin/scripts/streambackup.c b/src/bin/scripts/streambackup.c
new file mode 100644
index 0000000..28255fd
--- /dev/null
+++ b/src/bin/scripts/streambackup.c
@@ -0,0 +1,182 @@
+/*-------------------------------------------------------------------------
+ *
+ * streambackup
+ *
+ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres_fe.h"
+#include "common.h"
+#include "dumputils.h"
+
+
+static void help(const char *progname);
+
+int
+main(int argc, char *argv[])
+{
+ static struct option long_options[] = {
+ {"host", required_argument, NULL, 'h'},
+ {"port", required_argument, NULL, 'p'},
+ {"username", required_argument, NULL, 'U'},
+ {"no-password", no_argument, NULL, 'w'},
+ {"password", no_argument, NULL, 'W'},
+ {NULL, 0, NULL, 0}
+ };
+
+ const char *progname;
+ int optindex;
+ int c;
+
+ const char *host = NULL;
+ const char *port = NULL;
+ const char *username = NULL;
+ enum trivalue prompt_password = TRI_DEFAULT;
+ PGconn *conn;
+ PGresult *res;
+
+ progname = get_progname(argv[0]);
+ set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pgscripts"));
+
+ handle_help_version_opts(argc, argv, "streambackup", help);
+
+ /* process command-line options */
+ while ((c = getopt_long(argc, argv, "h:p:U:wW", long_options, &optindex)) != -1)
+ {
+ switch (c)
+ {
+ case 'h':
+ host = optarg;
+ break;
+ case 'p':
+ port = optarg;
+ break;
+ case 'U':
+ username = optarg;
+ break;
+ case 'w':
+ prompt_password = TRI_NO;
+ break;
+ case 'W':
+ prompt_password = TRI_YES;
+ break;
+ default:
+ fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
+ exit(1);
+ }
+ }
+
+ switch (argc - optind)
+ {
+ case 0:
+ break;
+ default:
+ fprintf(stderr, _("%s: too many command-line arguments (first is \"%s\")\n"), progname, argv[optind + 1]);
+ fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
+ exit(1);
+ }
+
+ {
+#define PARAMS_ARRAY_SIZE 7
+ const char **keywords = malloc(PARAMS_ARRAY_SIZE * sizeof(*keywords));
+ const char **values = malloc(PARAMS_ARRAY_SIZE * sizeof(*values));
+
+ if (!keywords || !values)
+ {
+ fprintf(stderr, _("%s: out of memory\n"), progname);
+ exit(1);
+ }
+ keywords[0] = "host";
+ values[0] = host;
+ keywords[1] = "port";
+ values[1] = port;
+ keywords[2] = "user";
+ values[2] = username;
+ keywords[3] = "password";
+ values[3] = NULL;
+ keywords[4] = "replication";
+ values[4] = "true";
+ keywords[5] = "fallback_application_name";
+ values[5] = progname;
+ keywords[6] = NULL;
+ values[6] = NULL;
+
+ conn = PQconnectdbParams(keywords, values, true);
+ }
+
+ /* check to see that the backend connection was successfully made */
+ if (PQstatus(conn) == CONNECTION_BAD)
+ {
+ fprintf(stderr, _("%s: could not open a replication connection: %s"),
+ progname, PQerrorMessage(conn));
+ exit(1);
+ }
+
+ /* Main workhorse */
+ res = PQexec(conn, "TAKE_BACKUP");
+ if (PQresultStatus(res) != PGRES_COPY_OUT)
+ {
+ PQclear(res);
+ fprintf(stderr, _("%s: could not start streaming backup: %s"),
+ progname, PQerrorMessage(conn));
+ }
+
+ for (;;)
+ {
+ static char *recvBuf = NULL;
+ int rawlen = PQgetCopyData(conn, &recvBuf, 0);
+ if (rawlen == -1) /* end-of-streaming or error */
+ {
+ PGresult *res;
+
+ res = PQgetResult(conn);
+ if (PQresultStatus(res) != PGRES_COMMAND_OK)
+ {
+ PQclear(res);
+ fprintf(stderr, _("%s: backup terminated by server: %s"),
+ progname, PQerrorMessage(conn));
+ exit(1);
+ }
+
+ /* Success! */
+ break;
+ }
+ if (rawlen < -1)
+ {
+ fprintf(stderr, _("%s: backup terminated by server: %s"),
+ progname, PQerrorMessage(conn));
+ exit(1);
+ }
+ fwrite(recvBuf, rawlen, 1, stdout);
+ PQfreemem(recvBuf);
+ }
+
+ PQfinish(conn);
+
+ exit(0);
+}
+
+static void
+help(const char *progname)
+{
+ printf(_("%s streams a physical backup of a PostgreSQL cluster.\n\n"), progname);
+ printf(_("Usage:\n"));
+ printf(_(" %s [OPTION]...\n"), progname);
+ printf(_("\nOptions:\n"));
+ printf(_(" --help show this help, then exit\n"));
+ printf(_(" --version output version information, then exit\n"));
+ printf(_("\nConnection options:\n"));
+ printf(_(" -h, --host=HOSTNAME database server host or socket directory\n"));
+ printf(_(" -p, --port=PORT database server port\n"));
+ printf(_(" -U, --username=USERNAME user name to connect as\n"));
+ printf(_(" -w, --no-password never prompt for password\n"));
+ printf(_(" -W, --password force password prompt\n"));
+ printf(_("\n"));
+ printf(_("\nReport bugs to <pgsql-bugs@postgresql.org>.\n"));
+}
diff --git a/src/include/replication/basebackup.h b/src/include/replication/basebackup.h
new file mode 100644
index 0000000..2714663
--- /dev/null
+++ b/src/include/replication/basebackup.h
@@ -0,0 +1,17 @@
+/*-------------------------------------------------------------------------
+ *
+ * basebackup.h
+ * Exports from replication/basebackup.c.
+ *
+ * Portions Copyright (c) 2010-2010, PostgreSQL Global Development Group
+ *
+ * $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef _BASEBACKUP_H
+#define _BASEBACKUP_H
+
+extern void SendBaseBackup(void);
+
+#endif /* _BASEBACKUP_H */
On 3 September 2010 12:19, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:
TODO:
* We need a smarter way to do pg_start/stop_backup() with this. At the
moment, you can only have one backup running at a time, but we shouldn't
have that limitation with this built-in mechanism.
Would it be possible to not require pg_start/stop_backup() for this
new feature? (yes, I'm probably missing something obvious here)
--
Thom Brown
Twitter: @darkixion
IRC (freenode): dark_ixion
Registered Linux user: #516935
On Fri, Sep 3, 2010 at 12:19 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:
Here's a WIP patch. It adds a new "TAKE_BACKUP" command to the replication
command set. Upon receiving that command, the master starts a COPY, and
streams a tarred copy of the data directory to the client. The patch
includes a simple command-line tool, pg_streambackup, to connect to a server
and request a backup that you can then redirect to a .tar file or pipe to
"tar x".
Cool. Can you add a TODO to build in code to un-tar the archive? tar
is not usually found on Windows systems, and as we already have tar
extraction code in pg_restore it could presumably be added relatively
painlessly.
--
Dave Page
Blog: http://pgsnake.blogspot.com
Twitter: @pgsnake
EnterpriseDB UK: http://www.enterprisedb.com
The Enterprise Postgres Company
On Fri, Sep 3, 2010 at 13:19, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:
It's been discussed before that it would be cool if you could stream a new
base backup from the master server, via libpq. That way you would not need
low-level filesystem access to initialize a new standby.
Magnus mentioned today that he started hacking on that, and coincidentally I
just started experimenting with it yesterday as well :-). So let's get this
out on the mailing list.
Here's a WIP patch. It adds a new "TAKE_BACKUP" command to the replication
command set. Upon receiving that command, the master starts a COPY, and
streams a tarred copy of the data directory to the client. The patch
includes a simple command-line tool, pg_streambackup, to connect to a server
and request a backup that you can then redirect to a .tar file or pipe to
"tar x".
TODO:
* We need a smarter way to do pg_start/stop_backup() with this. At the
moment, you can only have one backup running at a time, but we shouldn't
have that limitation with this built-in mechanism.
* The streamed backup archive should contain all the necessary WAL files
too, so that you don't need to set up archiving to use this. You could just
point the tiny client tool to the server, and get a backup archive
containing everything that's necessary to restore correctly.
For this last point, this should of course be *optional*, but it would
be very good to have that option (and probably on by default).
Couple of quick comments on things I noticed that differ from the code
I have :-) We chatted some about it already, but it should be
included for others...
* It should be possible to pass the backup label through, not just
hardcode it to basebackup
* Needs support for tablespaces. We should either follow the symlinks
and pick up the files, or throw an error if there are any (see the sketch
after this list). Silently delivering an incomplete backup is not a good
thing :-)
* Is there a point in adapting the chunk size to the size of the libpq buffers?
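For the tablespace point, the "throw an error" option is only a few lines; a minimal sketch, with checkTablespaces() being a made-up name that SendBaseBackup() would call before sendDir():

/*
 * Refuse to take a streamed base backup if any tablespaces exist, rather
 * than silently producing an incomplete archive. Following the symlinks
 * and including the files would be the better long-term behaviour.
 */
static void
checkTablespaces(void)
{
    DIR        *dir;
    struct dirent *de;

    dir = AllocateDir("pg_tblspc");
    while ((de = ReadDir(dir, "pg_tblspc")) != NULL)
    {
        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
            continue;

        ereport(ERROR,
                (errmsg("streamed base backups do not support tablespaces yet")));
    }
    FreeDir(dir);
}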
FWIW, my implementation was as a user-defined function, which has the
advantage it can run on 9.0. But most likely this code can be ripped
out and provided as a separate backport project for 9.0 if necessary -
no need to have separate codebases.
Other than that, our code is remarkably similar.
--
Magnus Hagander
Me: http://www.hagander.net/
Work: http://www.redpill-linpro.com/
On Fri, Sep 3, 2010 at 13:25, Thom Brown <thom@linux.com> wrote:
On 3 September 2010 12:19, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:
TODO:
* We need a smarter way to do pg_start/stop_backup() with this. At the
moment, you can only have one backup running at a time, but we shouldn't
have that limitation with this built-in mechanism.
Would it be possible to not require pg_start/stop_backup() for this
new feature? (yes, I'm probably missing something obvious here)
You don't need to run it *manually*, but the process needs to run it
automatically in the background for you. Which it does already in the
suggested patch.
--
Magnus Hagander
Me: http://www.hagander.net/
Work: http://www.redpill-linpro.com/
On 3 September 2010 12:30, Magnus Hagander <magnus@hagander.net> wrote:
On Fri, Sep 3, 2010 at 13:25, Thom Brown <thom@linux.com> wrote:
On 3 September 2010 12:19, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:
TODO:
* We need a smarter way to do pg_start/stop_backup() with this. At the
moment, you can only have one backup running at a time, but we shouldn't
have that limitation with this built-in mechanism.
Would it be possible to not require pg_start/stop_backup() for this
new feature? (yes, I'm probably missing something obvious here)
You don't need to run it *manually*, but the process needs to run it
automatically in the background for you. Which it does already in the
suggested patch.
Ah, clearly I didn't read the patch in any detail. Thanks :)
--
Thom Brown
Twitter: @darkixion
IRC (freenode): dark_ixion
Registered Linux user: #516935
On 03/09/10 14:25, Thom Brown wrote:
On 3 September 2010 12:19, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:
TODO:
* We need a smarter way to do pg_start/stop_backup() with this. At the
moment, you can only have one backup running at a time, but we shouldn't
have that limitation with this built-in mechanism.
Would it be possible to not require pg_start/stop_backup() for this
new feature? (yes, I'm probably missing something obvious here)
Well, pg_start_backup() does several things:
1. It sets the forceFullPageWrites flag, so that we don't get partial
pages in the restored database.
2. It performs a checkpoint
3. It creates a backup label file
We certainly need 1 and 2. We don't necessarily need to write the backup
label file to the data directory when we're streaming the backup
directly to the client; we can just include it in the streamed archive.
pg_stop_backup() also does several things:
1. It clears the forceFullPageWrites flag.
2. It writes an end-of-backup WAL record
3. It switches to new WAL segment, to get the final WAL segment archived.
4. It writes a backup history file
5. It removes the backup label file.
6. It waits for all the required WAL files to be archived.
We need 1, but the rest we could do in a smarter way. When we have more
control of the backup process, I don't think we need the end-of-backup
WAL record or the backup label anymore. We can add the pg_control file
as the last file in the archive, and set minRecoveryPoint in it to the
last WAL record needed to recover.
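To illustrate that, sending an in-memory buffer (a generated backup label, or a private copy of pg_control with minRecoveryPoint already set and its CRC recomputed) as a tar member only needs a small helper on top of the patch's _tarWriteHeader(); a sketch, with a made-up name:

/*
 * Stream a buffer held in memory as a member of the tar archive, so the
 * file never has to be written into the data directory on the master.
 * Padding mirrors sendFile() in the patch.
 */
static void
sendBufferAsFile(char *filename, char *content, int len)
{
    char    zerobuf[512];
    int     pad;

    _tarWriteHeader(filename, len);

    /* The content itself, as a CopyData message */
    pq_putmessage('d', content, len);

    /* Pad to the next 512-byte boundary */
    pad = ((len + 511) & ~511) - len;
    if (pad > 0)
    {
        MemSet(zerobuf, 0, pad);
        pq_putmessage('d', zerobuf, pad);
    }
}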
So no, we don't really need pg_start/stop_backup() per se, but we'll
need something similar...
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
On 03/09/10 14:28, Dave Page wrote:
On Fri, Sep 3, 2010 at 12:19 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:
Here's a WIP patch. It adds a new "TAKE_BACKUP" command to the replication
command set. Upon receiving that command, the master starts a COPY, and
streams a tarred copy of the data directory to the client. The patch
includes a simple command-line tool, pg_streambackup, to connect to a server
and request a backup that you can then redirect to a .tar file or pipe to
"tar x".
Cool. Can you add a TODO to build in code to un-tar the archive? tar
is not usually found on Windows systems, and as we already have tar
extraction code in pg_restore it could presumably be added relatively
painlessly.
Ok. Another obvious thing that people will want is to gzip the tar file
while sending it, to reduce network traffic.
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
On Fri, Sep 3, 2010 at 13:48, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:
On 03/09/10 14:28, Dave Page wrote:
On Fri, Sep 3, 2010 at 12:19 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:
Here's a WIP patch. It adds a new "TAKE_BACKUP" command to the replication
command set. Upon receiving that command, the master starts a COPY, and
streams a tarred copy of the data directory to the client. The patch
includes a simple command-line tool, pg_streambackup, to connect to a server
and request a backup that you can then redirect to a .tar file or pipe to
"tar x".
Cool. Can you add a TODO to build in code to un-tar the archive? tar
is not usually found on Windows systems, and as we already have tar
extraction code in pg_restore it could presumably be added relatively
painlessly.
Ok. Another obvious thing that people will want is to gzip the tar file
while sending it, to reduce network traffic.
Not necessarily obvious, needs to be configurable. There are a lot of
cases where you might not want it.
--
Magnus Hagander
Me: http://www.hagander.net/
Work: http://www.redpill-linpro.com/
On Fri, Sep 3, 2010 at 12:19 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:
* We need a smarter way to do pg_start/stop_backup() with this. At the
moment, you can only have one backup running at a time, but we shouldn't
have that limitation with this built-in mechanism.
Well there's no particular reason we couldn't support having multiple
pg_start_backup() pending either. It's just not usually something
people have need so far.
--
greg
On 03/09/10 15:16, Greg Stark wrote:
On Fri, Sep 3, 2010 at 12:19 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:
* We need a smarter way to do pg_start/stop_backup() with this. At the
moment, you can only have one backup running at a time, but we shouldn't
have that limitation with this built-in mechanism.
Well there's no particular reason we couldn't support having multiple
pg_start_backup() pending either. It's just not usually something
people have need so far.
The backup label file makes that hard. There can be only one at a time.
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
On Fri, Sep 3, 2010 at 7:28 AM, Dave Page <dpage@pgadmin.org> wrote:
On Fri, Sep 3, 2010 at 12:19 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:
Here's a WIP patch. It adds a new "TAKE_BACKUP" command to the replication
command set. Upon receiving that command, the master starts a COPY, and
streams a tarred copy of the data directory to the client. The patch
includes a simple command-line tool, pg_streambackup, to connect to a server
and request a backup that you can then redirect to a .tar file or pipe to
"tar x".
Cool. Can you add a TODO to build in code to un-tar the archive? tar
is not usually found on Windows systems, and as we already have tar
extraction code in pg_restore it could presumably be added relatively
painlessly.
It seems like the elephant in the room here is updating an existing
backup without recopying the entire data directory. Perhaps that's
phase two, but worth keeping in mind...
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise Postgres Company
On Fri, Sep 3, 2010 at 15:24, Robert Haas <robertmhaas@gmail.com> wrote:
On Fri, Sep 3, 2010 at 7:28 AM, Dave Page <dpage@pgadmin.org> wrote:
On Fri, Sep 3, 2010 at 12:19 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:
Here's a WIP patch. It adds a new "TAKE_BACKUP" command to the replication
command set. Upon receiving that command, the master starts a COPY, and
streams a tarred copy of the data directory to the client. The patch
includes a simple command-line tool, pg_streambackup, to connect to a server
and request a backup that you can then redirect to a .tar file or pipe to
"tar x".
Cool. Can you add a TODO to build in code to un-tar the archive? tar
is not usually found on Windows systems, and as we already have tar
extraction code in pg_restore it could presumably be added relatively
painlessly.
It seems like the elephant in the room here is updating an existing
backup without recopying the entire data directory. Perhaps that's
phase two, but worth keeping in mind...
I'd say that's a very different use-case, but still a very useful one
of course. It's probably going to be a lot more complex (it would
require bi-directional traffic, I think)...
--
Magnus Hagander
Me: http://www.hagander.net/
Work: http://www.redpill-linpro.com/
On Fri, Sep 3, 2010 at 2:24 PM, Robert Haas <robertmhaas@gmail.com> wrote:
On Fri, Sep 3, 2010 at 7:28 AM, Dave Page <dpage@pgadmin.org> wrote:
On Fri, Sep 3, 2010 at 12:19 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:
Here's a WIP patch. It adds a new "TAKE_BACKUP" command to the replication
command set. Upon receiving that command, the master starts a COPY, and
streams a tarred copy of the data directory to the client. The patch
includes a simple command-line tool, pg_streambackup, to connect to a server
and request a backup that you can then redirect to a .tar file or pipe to
"tar x".
Cool. Can you add a TODO to build in code to un-tar the archive? tar
is not usually found on Windows systems, and as we already have tar
extraction code in pg_restore it could presumably be added relatively
painlessly.
It seems like the elephant in the room here is updating an existing
backup without recopying the entire data directory. Perhaps that's
phase two, but worth keeping in mind...
rsync? Might be easier to use that from day 1 (well, day 2) than to
retrofit later.
--
Dave Page
Blog: http://pgsnake.blogspot.com
Twitter: @pgsnake
EnterpriseDB UK: http://www.enterprisedb.com
The Enterprise Postgres Company
On Fri, Sep 3, 2010 at 9:26 AM, Dave Page <dpage@pgadmin.org> wrote:
rsync? Might be easier to use that from day 1 (well, day 2) than to
retrofit later.
I'm not sure we want to depend on an external utility like that,
particularly one that users may not have installed. And I'm not sure
if that can be made to work over a libpq channel, either. But
certainly something with that functionality would be nice to have,
whether it ends up sharing code or not.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise Postgres Company
On Fri, Sep 3, 2010 at 2:29 PM, Robert Haas <robertmhaas@gmail.com> wrote:
On Fri, Sep 3, 2010 at 9:26 AM, Dave Page <dpage@pgadmin.org> wrote:
rsync? Might be easier to use that from day 1 (well, day 2) than to
retrofit later.
I'm not sure we want to depend on an external utility like that,
particularly one that users may not have installed. And I'm not sure
if that can be made to work over a libpq channel, either. But
certainly something with that functionality would be nice to have,
whether it ends up sharing code or not.
No, I agree we don't want an external dependency (I was just bleating
about needing tar on Windows). I was assuming/hoping there's a
librsync somewhere...
--
Dave Page
Blog: http://pgsnake.blogspot.com
Twitter: @pgsnake
EnterpriseDB UK: http://www.enterprisedb.com
The Enterprise Postgres Company
On Fri, Sep 3, 2010 at 9:32 AM, Dave Page <dpage@pgadmin.org> wrote:
No, I agree we don't want an external dependency (I was just bleating
about needing tar on Windows). I was assuming/hoping there's a
librsync somewhere...
The rsync code itself is not modular, I believe. I think the author
thereof kind of took the approach of placing efficiency before all.
See:
http://www.samba.org/rsync/how-rsync-works.html ... especially the
section on "The Rsync Protocol"
I Googled librsync and got a hit, but that code is a rewrite of the
source base and seems to have little or no activity since 2004.
http://librsync.sourceforge.net/
That page writes: "librsync is not wire-compatible with rsync 2.x, and
is not likely to be in the future." The current version of rsync is
3.0.7.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise Postgres Company
* Robert Haas (robertmhaas@gmail.com) wrote:
The rsync code itself is not modular, I believe. I think the author
thereof kind of took the approach of placing efficiency before all.
Yeah, I looked into this when discussing this same concept at PGCon with
folks. There doesn't appear to be a good librsync and, even if there
was, there's a heck of a lot of complexity there that we *don't* need.
rsync is a great tool, don't get me wrong, but let's not try to go over
our heads here.
We don't need permissions handling, as an example. I also don't think
we need the binary diff/partial file transfer capability - we already
break relations into 1G chunks (when/if they reach that size), so you
won't necessarily be copying the entire relation if you're just doing
mtime based or per-file-checksum based detection. We don't need device
node handling, we don't need auto-ignoring files, or pattern
exclusion/inclusion, we don't really need a progress bar (though it'd be
nice.. :), etc, etc, etc.
Thanks,
Stephen
Heikki Linnakangas <heikki.linnakangas@enterprisedb.com> writes:
On 03/09/10 15:16, Greg Stark wrote:
On Fri, Sep 3, 2010 at 12:19 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:
* We need a smarter way to do pg_start/stop_backup() with this. At the
moment, you can only have one backup running at a time, but we shouldn't
have that limitation with this built-in mechanism.
Well there's no particular reason we couldn't support having multiple
pg_start_backup() pending either. It's just not usually something
people have need so far.
The backup label file makes that hard. There can be only one at a time.
I don't actually see a use-case for streaming multiple concurrent
backups. How many people are going to be able to afford that kind of
load on the master's I/O bandwidth?
Certainly for version 1, it would be sufficient to throw an error if
someone tries to start a backup while another one is in progress.
*Maybe*, down the road, we'd want to relax it.
regards, tom lane
Stephen Frost <sfrost@snowman.net> wrote:
there's a heck of a lot of complexity there that we *don't* need.
rsync is a great tool, don't get me wrong, but let's not try to go
over our heads here.
Right -- among other things, it checks for portions of a new file
which match the old file at a different location. For example, if
you have a very large text file, and insert a line or two at the
start, it will wind up only sending the new lines. (Well, that and
all the checksums which help it determine that the rest of the file
matches at a shifted location.) I would think that PostgreSQL could
just check whether *corresponding* portions of a file matched, which
is much simpler.
we already break relations into 1G chunks (when/if they reach that
size), so you won't necessarily be copying the entire relation if
you're just doing mtime based or per-file-checksum based
detection.
While 1GB granularity would be OK, I doubt it's optimal; I think CRC
checks for smaller chunks might be worthwhile. My gut feel is that
somewhere in the 64kB to 1MB range would probably be optimal for us,
although the "sweet spot" will depend on how the database is used.
A configurable or self-adjusting size would be cool.
-Kevin
Kevin,
* Kevin Grittner (Kevin.Grittner@wicourts.gov) wrote:
While 1GB granularity would be OK, I doubt it's optimal; I think CRC
checks for smaller chunks might be worthwhile. My gut feel is that
somewhere in the 64kB to 1MB range would probably be optimal for us,
although the "sweet spot" will depend on how the database is used.
A configurable or self-adjusting size would be cool.
We have something much better, called WAL. If people want to keep their
backup current, they should use that after getting the base backup up
and working. We don't need to support this for the base backup, imv.
In any case, it's certainly not something required for an initial
implementation..
Thanks,
Stephen
On 3 September 2010 16:01, Tom Lane <tgl@sss.pgh.pa.us> wrote:
Heikki Linnakangas <heikki.linnakangas@enterprisedb.com> writes:
On 03/09/10 15:16, Greg Stark wrote:
On Fri, Sep 3, 2010 at 12:19 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:
* We need a smarter way to do pg_start/stop_backup() with this. At the
moment, you can only have one backup running at a time, but we shouldn't
have that limitation with this built-in mechanism.
Well there's no particular reason we couldn't support having multiple
pg_start_backup() pending either. It's just not usually something
people have need so far.
The backup label file makes that hard. There can be only one at a time.
I don't actually see a use-case for streaming multiple concurrent
backups. How many people are going to be able to afford that kind of
load on the master's I/O bandwidth?
To make it affordable, could functionality be added to allow slaves to
become chainable? (i.e. master streams to standby 1, which streams to
standby 2 etc) This would help reduce bandwidth for normal streaming
replication too, which would be useful on particularly busy databases.
Obviously in synchronous replication this would be horribly slow so
not feasible for that.
--
Thom Brown
Twitter: @darkixion
IRC (freenode): dark_ixion
Registered Linux user: #516935
Stephen Frost <sfrost@snowman.net> wrote:
We have something much better, called WAL. If people want to keep
their backup current, they should use that after getting the base
backup up and working.
Unless you want to provide support for Point In Time Recovery
without excessive recovery times.
We don't need to support this for the base backup, imv.
We found that making a hard-link copy of the previous base backup
and using rsync to bring it up to date used 1% the WAN bandwidth as
sending a complete, compressed base backup. Just sending modified
files in their entirety would have bought the first order of
magnitude; recognizing the unchanged portions buys the second order
of magnitude.
In any case, it's certainly not something required for an initial
implementation..
No disagreement there; but sometimes it pays to know where you might
want to go, so you don't do something to make further development in
that direction unnecessarily difficult.
-Kevin
On 03/09/10 18:01, Tom Lane wrote:
Heikki Linnakangas<heikki.linnakangas@enterprisedb.com> writes:
On 03/09/10 15:16, Greg Stark wrote:
On Fri, Sep 3, 2010 at 12:19 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:
* We need a smarter way to do pg_start/stop_backup() with this. At the
moment, you can only have one backup running at a time, but we shouldn't
have that limitation with this built-in mechanism.
Well there's no particular reason we couldn't support having multiple
pg_start_backup() pending either. It's just not usually something
people have need so far.
The backup label file makes that hard. There can be only one at a time.
I don't actually see a use-case for streaming multiple concurrent
backups. How many people are going to be able to afford that kind of
load on the master's I/O bandwidth?
It's more a matter of convenience when you're setting up test
environments with small databases or something like that. I don't see
many people regularly using the streaming backup for anything larger
than a few hundred gigabytes anyway. At that point you'll most likely
want to use something more efficient.
Certainly for version 1, it would be sufficient to throw an error if
someone tries to start a backup while another one is in progress.
*Maybe*, down the road, we'd want to relax it.
Yeah, it's OK for 1st version.
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
On Fri, Sep 3, 2010 at 11:20 AM, Stephen Frost <sfrost@snowman.net> wrote:
Kevin,
* Kevin Grittner (Kevin.Grittner@wicourts.gov) wrote:
While 1GB granularity would be OK, I doubt it's optimal; I think CRC
checks for smaller chunks might be worthwhile. My gut feel is that
somewhere in the 64kB to 1MB range would probably be optimal for us,
although the "sweet spot" will depend on how the database is used.
A configurable or self-adjusting size would be cool.
We have something much better, called WAL. If people want to keep their
backup current, they should use that after getting the base backup up
and working. We don't need to support this for the base backup, imv.
In any case, it's certainly not something required for an initial
implementation..
While I'm certainly not knocking WAL, it's not difficult to think of
cases where being able to incrementally update a backup saves you an
awful lot of bandwidth.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise Postgres Company
"Kevin Grittner" <Kevin.Grittner@wicourts.gov> writes:
Stephen Frost <sfrost@snowman.net> wrote:
In any case, it's certainly not something required for an initial
implementation..
No disagreement there; but sometimes it pays to know where you might
want to go, so you don't do something to make further development in
that direction unnecessarily difficult.
I think that setting out to reimplement rsync, or to go down a design
path where we're likely to do a lot of that eventually, is the height
of folly. We should be standing on the shoulders of other projects,
not rolling our own because of misguided ideas about people not having
those projects installed.
IOW, what I'd like to see is protocol extensions that allow an external
copy of rsync to be invoked; not build in rsync, or tar, or anything
else that we could get off-the-shelf.
regards, tom lane
On Fri, Sep 3, 2010 at 11:47 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote:
"Kevin Grittner" <Kevin.Grittner@wicourts.gov> writes:
Stephen Frost <sfrost@snowman.net> wrote:
In any case, it's certainly not something required for an initial
implementation..
No disagreement there; but sometimes it pays to know where you might
want to go, so you don't do something to make further development in
that direction unnecessarily difficult.
I think that setting out to reimplement rsync, or to go down a design
path where we're likely to do a lot of that eventually, is the height
of folly. We should be standing on the shoulders of other projects,
not rolling our own because of misguided ideas about people not having
those projects installed.
IOW, what I'd like to see is protocol extensions that allow an external
copy of rsync to be invoked; not build in rsync, or tar, or anything
else that we could get off-the-shelf.
We used to use "cp" to create databases. Should we go back to that system?
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise Postgres Company
On Fri, Sep 3, 2010 at 11:47 AM, Tom Lane <tgl@sss.pgh.pa.us> wrote:
IOW, what I'd like to see is protocol extensions that allow an external
copy of rsync to be invoked; not build in rsync, or tar, or anything
else that we could get off-the-shelf.
Personally, I would love to see protocol-level compression added.
(Yes, going over a compressed SSH tunnel works well, but in general
isn't user-friendly.)
Josh: we talked on IRC awhile back and you mentioned that CMD had
added this in Mammoth? Would you be interested in having someone get
that integrated back into the community?
David Blewett
* Tom Lane (tgl@sss.pgh.pa.us) wrote:
IOW, what I'd like to see is protocol extensions that allow an external
copy of rsync to be invoked; not build in rsync, or tar, or anything
else that we could get off-the-shelf.
I'd much rather use an existing library to implement it than call out to
some external utility. That said, I'm about as thrilled with libtar as
librsync after a bit of googling around. :/
Thanks,
Stephen
Tom Lane <tgl@sss.pgh.pa.us> wrote:
what I'd like to see is protocol extensions that allow an external
copy of rsync to be invoked; not build in rsync, or tar, or
anything else that we could get off-the-shelf.
The complexities of dealing with properly invoking rsync externally
could well require more code and be considerably more fragile than
passing the data through the existing SR connection; particularly
since to get the full benefits of rsync you need to be dealing with
a daemon which has the appropriate modules configured -- the
location of which you wouldn't easily know.
If we were talking about re-implementing rsync, or doing more than a
rough approximation, kinda, of 5% of what rsync does, I'd be with
you.
-Kevin
On 03/09/10 19:09, Stephen Frost wrote:
* Tom Lane (tgl@sss.pgh.pa.us) wrote:
IOW, what I'd like to see is protocol extensions that allow an external
copy of rsync to be invoked; not build in rsync, or tar, or anything
else that we could get off-the-shelf.
I'd much rather use an existing library to implement it than call out to
some external utility. That said, I'm about as thrilled with libtar as
librsync after a bit of googling around. :/
The code to build a tar archive is about 200 lines of code. The amount
of code for untar is about the same. That's about the amount of effort
involved. We could add zlib compression since we already link with that, but
that's about it. I'm not interested in adding more infrastructure for
more tools. For more complicated scenarios, you can still use
pg_start/stop_backup() as usual, there's nothing wrong with that.
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
On 03/09/10 18:53, David Blewett wrote:
On Fri, Sep 3, 2010 at 11:47 AM, Tom Lane<tgl@sss.pgh.pa.us> wrote:
IOW, what I'd like to see is protocol extensions that allow an external
copy of rsync to be invoked; not build in rsync, or tar, or anything
else that we could get off-the-shelf.
Personally, I would love to see protocol-level compression added.
(Yes, going over a compressed SSH tunnel works well, but in general
isn't user-friendly.)
Josh: we talked on IRC awhile back and you mentioned that CMD had
added this in Mammoth? Would you be interested in having someone get
that integrated back into the community?
There's a recent thread on pgsql-general about just that:
http://archives.postgresql.org/pgsql-general/2010-08/msg00003.php
I agree with Tom's comments there, I'd like to have something to
enable/disable SSL compression rather than implement our own. There was
some discussion that it might not be available on JDBC SSL
implementations, but if it's done in our protocol, you'll need changes
to the client to make it work anyway.
--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com
On Fri, Sep 03, 2010 at 09:56:12AM -0400, Stephen Frost wrote:
* Robert Haas (robertmhaas@gmail.com) wrote:
The rsync code itself is not modular, I believe. I think the author
thereof kind of took the approach of placing efficiency before all.
Yeah, I looked into this when discussing this same concept at PGCon with
folks. There doesn't appear to be a good librsync and, even if there
was, there's a heck of a lot of complexity there that we *don't* need.
rsync is a great tool, don't get me wrong, but let's not try to go over
our heads here.
rsync is not rocket science. All you need is for the receiving end to
send a checksum for each block it has. The server side does the same
checksum and for each block sends back "same" or "new data".
The client and the server don't need to synchronise at all. If the
client sends nothing, the server sends everything.
The tricky part of rsync (finding blocks that have moved) is not needed
here.
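A sketch of the server-side half of that, assuming the client has already sent one checksum per BLCKSZ block; compute_checksum(), send_block() and send_unchanged() are stand-ins for whatever checksum and protocol messages would actually be used:

/*
 * For each block of a segment, compare our checksum against the one the
 * client sent for the same offset and ship the block only if they differ.
 * Blocks beyond what the client has are always sent. No rolling checksums,
 * no detection of moved blocks.
 */
static void
sendChangedBlocks(FILE *fp, uint32 *client_sums, int client_nblocks)
{
    char    buf[BLCKSZ];
    int     blkno = 0;
    size_t  cnt;

    while ((cnt = fread(buf, 1, BLCKSZ, fp)) > 0)
    {
        uint32  sum = compute_checksum(buf, cnt);   /* stand-in */

        if (blkno >= client_nblocks || sum != client_sums[blkno])
            send_block(blkno, buf, cnt);    /* "new data" */
        else
            send_unchanged(blkno);          /* "same" */
        blkno++;
    }
}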
Have a nice day,
--
Martijn van Oosterhout <kleptog@svana.org> http://svana.org/kleptog/
Patriotism is when love of your own people comes first; nationalism,
when hate for people other than your own comes first.
- Charles de Gaulle
On Fri, Sep 3, 2010 at 12:23 PM, Heikki Linnakangas
<heikki.linnakangas@enterprisedb.com> wrote:
On 03/09/10 18:53, David Blewett wrote:
On Fri, Sep 3, 2010 at 11:47 AM, Tom Lane<tgl@sss.pgh.pa.us> wrote:
IOW, what I'd like to see is protocol extensions that allow an external
copy of rsync to be invoked; not build in rsync, or tar, or anything
else that we could get off-the-shelf.
Personally, I would love to see protocol-level compression added.
(Yes, going over a compressed SSH tunnel works well, but in general
isn't user-friendly.)
Josh: we talked on IRC awhile back and you mentioned that CMD had
added this in Mammoth? Would you be interested in having someone get
that integrated back into the community?
There's a recent thread on pgsql-general about just that:
http://archives.postgresql.org/pgsql-general/2010-08/msg00003.php
I agree with Tom's comments there, I'd like to have something to
enable/disable SSL compression rather than implement our own. There was some
discussion that it might not be available on JDBC SSL implementations, but
if it's done in our protocol, you'll need changes to the client to make it
work anyway.
While I agree that combining SSL with compression is a great win, I'm
not sold on Tom's argument that compression is only needed in WAN
situations. I've seen great benefit to using an SSH tunnel with
compression over LAN connections (100 and 1000 mbps). At work, we do
have a private WAN that it would be nice to be able to use compression
with no encryption on. I think it's a general-use thing. While I know
it's not the best argument, MySQL does provide compression at the
connection level.
David Blewett
On Fri, Sep 3, 2010 at 8:30 PM, Martijn van Oosterhout
<kleptog@svana.org> wrote:
rsync is not rocket science. All you need is for the receiving end to
send a checksum for each block it has. The server side does the same
checksum and for each block sends back "same" or "new data".
Well rsync is closer to rocket science than that. It does rolling
checksums and can handle data being moved around, which vacuum does do
so it's probably worthwhile.
*However* I think you're all headed in the wrong direction here. I
don't think rsync is what anyone should be doing with their backups at
all. It still requires scanning through *all* your data even if you've
only changed a small percentage (which it seems is the use case you're
concerned about) and it results in corrupting your backup while the
rsync is in progress and having a window with no usable backup. You
could address that with rsync --compare-dest but then you're back to
needing space and i/o for whole backups every time even if you're only
changing small parts of the database.
The industry standard solution that we're missing that we *should* be
figuring out how to implement is incremental backups.
I've actually been thinking about this recently and I think we could
do it fairly easily with our existing infrastructure. I was planning
on doing it as an external utility but it would be tempting to be able
to request an external backup via the streaming protocol so maybe it
would be better a bit more integrated.
The way I see it there are two alternatives. You need to start by
figuring out which blocks have been modified since the last backup (or
selected reference point). You can do this either by scanning every
data file and picking every block with an LSN > the reference LSN. Or
you can do it by scanning the WAL since that point and accumulating a
list of block numbers.
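A sketch of the first alternative, scanning a segment file and picking out the blocks whose page LSN is newer than the reference point; PageGetLSN() and XLByteLT() are the existing backend macros, collect_block() is a placeholder for however the block then gets archived:

/*
 * Read a relation segment block by block and hand every block whose LSN
 * is newer than the reference LSN to collect_block(). As with a normal
 * base backup, torn or concurrently-written pages are assumed to be fixed
 * up by replaying WAL on top of the result.
 */
static void
scanSegment(FILE *fp, XLogRecPtr reference_lsn)
{
    char        buf[BLCKSZ];
    BlockNumber blkno = 0;

    while (fread(buf, 1, BLCKSZ, fp) == BLCKSZ)
    {
        XLogRecPtr  page_lsn = PageGetLSN((Page) buf);

        if (XLByteLT(reference_lsn, page_lsn))
            collect_block(blkno, buf);
        blkno++;
    }
}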
Either way you then need to archive all those blocks into a special
file format which includes meta-information to dictate which file and
what block number each block represents. Also it would be useful to
include the reference LSN and the beginning and ending LSN of the
backup so that we can verify when restoring it that we're starting
with a recent enough database and that we've replayed the right range
of WAL to bring it to a consistent state.
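Just to make that concrete, the metadata could be as simple as this (the names are made up; each block header would be followed by the block image):

/* One per incremental backup file. */
typedef struct IncrBackupFileHeader
{
    uint32      magic;          /* identifies the format */
    XLogRecPtr  reference_lsn;  /* backup is a delta relative to this */
    XLogRecPtr  start_lsn;      /* WAL position when the scan started */
    XLogRecPtr  stop_lsn;       /* WAL position when the scan finished */
} IncrBackupFileHeader;

/* One per archived block, followed by the block contents. */
typedef struct IncrBackupBlockHeader
{
    RelFileNode rnode;          /* which relation */
    ForkNumber  forknum;        /* main fork, fsm, vm, ... */
    BlockNumber blkno;          /* block number within the fork */
} IncrBackupBlockHeader;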
It's tempting to make the incremental backup file format just a
regular WAL file with a series of special WAL records which just
contain a backup block. That might be a bit confusing since it would
be a second unrelated LSN series but I like the idea of being able to
use the same bits of code to handle the "holes" and maybe other code.
On the whole I think it would be just a little too weird though.
--
greg
On 4 September 2010 14:42, Greg Stark <gsstark@mit.edu> wrote:
The industry standard solution that we're missing that we *should* be
figuring out how to implement is incremental backups.
I'll buy you a crate of beer if this gets implemented... although
you're in Dublin so would be like buying Willy Wonka a Mars bar.
--
Thom Brown
Twitter: @darkixion
IRC (freenode): dark_ixion
Registered Linux user: #516935
On Sat, Sep 4, 2010 at 9:42 AM, Greg Stark <gsstark@mit.edu> wrote:
*However* I think you're all headed in the wrong direction here. I
don't think rsync is what anyone should be doing with their backups at
all. It still requires scanning through *all* your data even if you've
only changed a small percentage (which it seems is the use case you're
concerned about) and it results in corrupting your backup while the
rsync is in progress and having a window with no usable backup. You
could address that with rsync --compare-dest but then you're back to
needing space and i/o for whole backups every time even if you're only
changing small parts of the database.
It depends. If the use case is "I accidentally (or purposefully but
temporarily) started up my slave as a master, and now I want it to go
back to having it be the master" or "I lost the WAL files I need to
roll this base backup forward (perhaps because wal_keep_segments
wasn't set high enough)", rsync is what you need.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise Postgres Company
On Sat, Sep 04, 2010 at 02:42:40PM +0100, Greg Stark wrote:
On Fri, Sep 3, 2010 at 8:30 PM, Martijn van Oosterhout
<kleptog@svana.org> wrote:
rsync is not rocket science. All you need is for the receiving end to
send a checksum for each block it has. The server side does the same
checksum and for each block sends back "same" or "new data".
Well rsync is closer to rocket science than that. It does rolling
checksums and can handle data being moved around, which vacuum does do
so it's probably worthwhile.
Not sure. When vacuum moves rows around the chance that it will move
rows as a block and that the line pointers will be the same is
practically nil. I don't think rsync will pick up on blocks the size of
a typical row. Vacuum changes the headers so you never have a copied
block.
*However* I think you're all headed in the wrong direction here. I
don't think rsync is what anyone should be doing with their backups at
all. It still requires scanning through *all* your data even if you've
only changed a small percentage (which it seems is the use case you're
concerned about) and it results in corrupting your backup while the
rsync is in progress and having a window with no usable backup. You
could address that with rsync --compare-dest but then you're back to
needing space and i/o for whole backups every time even if you're only
changing small parts of the database.
If you're working from a known good version of the database at some
point, yes you are right you have more interesting options. If you
don't, you want something that will fix it.
Have a nice day,
--
Martijn van Oosterhout <kleptog@svana.org> http://svana.org/kleptog/
Patriotism is when love of your own people comes first; nationalism,
when hate for people other than your own comes first.
- Charles de Gaulle
On Sun, Sep 5, 2010 at 4:51 PM, Martijn van Oosterhout
<kleptog@svana.org> wrote:
If you're working from a known good version of the database at some
point, yes you are right you have more interesting options. If you
don't you want something that will fix it.
Sure, in that case you want to restore from backup. Whatever you use
to do that is the same net result. I'm not sure rsync is actually
going to be much faster though since it still has to read all of the
existing database which a normal restore doesn't have to. If the
database has changed significantly that's a lot of extra I/O and
you're probably on a local network with a lot of bandwidth available.
What I'm talking about is how you *take* backups. Currently you have
to take a full backup which if you have a large data warehouse could
be a big job. If only a small percentage of the database is changing
then you could use rsync to reduce the network bandwidth to transfer
your backup but you still have to read the entire database and write
out the entire backup.
Incremental backups mean being able to read just the data blocks that
have been modified and write out a backup file with just those blocks.
When it comes time to restore then you restore the last full backup,
then any incremental backups since then, then replay any logs needed
to bring it to a consistent state.
I think that description pretty much settles the question in my mind.
The implementation choice of scanning the WAL to find all the changed
blocks is more relevant to the use cases where incremental backups are
useful. If you still have to read the entire database then there's not
all that much to be gained except storage space. If you scan the WAL
then you can avoid reading most of your large data warehouse to
generate the incremental and only read the busy portion.
In the use case where the database is extremely busy but writing and
rewriting the same small number of blocks over and over even scanning
the WAL might not be ideal. For that use case it might be more useful
to generate a kind of wal-summary which lists all the blocks touched
since the last checkpoint every checkpoint. But that could be a later
optimization.
--
greg
On Mon, Sep 6, 2010 at 10:07 AM, Greg Stark <gsstark@mit.edu> wrote:
I think that description pretty much settles the question in my mind.
The implementation choice of scanning the WAL to find all the changed
blocks is more relevant to the use cases where incremental backups are
useful. If you still have to read the entire database then there's not
all that much to be gained except storage space. If you scan the WAL
then you can avoid reading most of your large data warehouse to
generate the incremental and only read the busy portion.
If you can scan the WAL, why wouldn't you just replay it?
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise Postgres Company
Greg Stark wrote:
The industry standard solution that we're missing that we *should* be
figuring out how to implement is incremental backups.
I've actually been thinking about this recently and I think we could
do it fairly easily with our existing infrastructure. I was planning
on doing it as an external utility but it would be tempting to be able
to request an external backup via the streaming protocol so maybe it
would be better a bit more integrated.
The way I see it there are two alternatives. You need to start by
figuring out which blocks have been modified since the last backup (or
selected reference point). You can do this either by scanning every
data file and picking every block with an LSN > the reference LSN. Or
you can do it by scanning the WAL since that point and accumulating a
list of block numbers.
That's what pgrman does already:
http://code.google.com/p/pg-rman/
Are you saying you want to do that over the libpq connection?
--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com
+ It's impossible for everything to be true. +