[PATCH] pg_upgrade: support for btrfs copy-on-write clones
Add file cloning as an alternative data transfer method to pg_upgrade.
Currently only btrfs is supported, but copy-on-write cloning is also
available on at least ZFS. Cloning must be requested explicitly and if
it isn't supported by the operating system or filesystem a fatal error
is thrown.
This provides upgrade performance similar to link mode while allowing
the old cluster to be used even after the new one has been started.
Signed-off-by: Oskari Saarenmaa <os@ohmu.fi>
---
configure | 5 +-
configure.in | 7 ++-
contrib/pg_upgrade/check.c | 3 +
contrib/pg_upgrade/file.c | 125 +++++++++++++++++++++++++++++----------
contrib/pg_upgrade/option.c | 7 +++
contrib/pg_upgrade/pg_upgrade.h | 13 ++--
contrib/pg_upgrade/relfilenode.c | 31 ++++------
doc/src/sgml/pgupgrade.sgml | 7 +++
src/include/pg_config.h.in | 3 +
9 files changed, 141 insertions(+), 60 deletions(-)
diff --git a/configure b/configure
index c685ca3..5087463 100755
--- a/configure
+++ b/configure
@@ -10351,7 +10351,10 @@ done
-for ac_header in crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
+for ac_header in crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h \
+ linux/btrfs.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h \
+ sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h \
+ sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
do
as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then
diff --git a/configure.in b/configure.in
index 82771bd..609aa73 100644
--- a/configure.in
+++ b/configure.in
@@ -982,7 +982,12 @@ AC_SUBST(OSSP_UUID_LIBS)
##
dnl sys/socket.h is required by AC_FUNC_ACCEPT_ARGTYPES
-AC_CHECK_HEADERS([crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
+AC_CHECK_HEADERS([crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h \
+ langinfo.h linux/btrfs.h poll.h pwd.h sys/ioctl.h \
+ sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h \
+ sys/select.h sys/sem.h sys/shm.h sys/socket.h \
+ sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h \
+ ucred.h utime.h wchar.h wctype.h])
# On BSD, test for net/if.h will fail unless sys/socket.h
# is included first.
diff --git a/contrib/pg_upgrade/check.c b/contrib/pg_upgrade/check.c
index 0376fcb..2a52dd8 100644
--- a/contrib/pg_upgrade/check.c
+++ b/contrib/pg_upgrade/check.c
@@ -151,6 +151,9 @@ check_new_cluster(void)
if (user_opts.transfer_mode == TRANSFER_MODE_LINK)
check_hard_link();
+ if (user_opts.transfer_mode == TRANSFER_MODE_CLONE)
+ check_clone_file();
+
check_is_super_user(&new_cluster);
/*
diff --git a/contrib/pg_upgrade/file.c b/contrib/pg_upgrade/file.c
index dfeb79f..fc935b7 100644
--- a/contrib/pg_upgrade/file.c
+++ b/contrib/pg_upgrade/file.c
@@ -8,11 +8,16 @@
*/
#include "postgres_fe.h"
+#include "pg_config.h"
#include "pg_upgrade.h"
#include <fcntl.h>
+#ifdef HAVE_LINUX_BTRFS_H
+# include <sys/ioctl.h>
+# include <linux/btrfs.h>
+#endif
#ifndef WIN32
@@ -23,21 +28,42 @@ static int win32_pghardlink(const char *src, const char *dst);
/*
- * copyAndUpdateFile()
+ * upgradeFile()
*
- * Copies a relation file from src to dst. If pageConverter is non-NULL, this function
- * uses that pageConverter to do a page-by-page conversion.
+ * Transfer a relation file from src to dst using one of the supported
+ * methods. If the on-disk format of the new cluster is bit-for-bit
+ * compatible with the on-disk format of the old cluster we can simply link
+ * each relation to perform a true in-place upgrade. Otherwise we must copy
+ * (either block-by-block or using a copy-on-write clone) the data from old
+ * cluster to new cluster and then perform the conversion.
*/
const char *
-copyAndUpdateFile(pageCnvCtx *pageConverter,
- const char *src, const char *dst, bool force)
+upgradeFile(transferMode transfer_mode, const char *src,
+ const char *dst, pageCnvCtx *pageConverter)
{
if (pageConverter == NULL)
{
- if (pg_copy_file(src, dst, force) == -1)
- return getErrorText(errno);
- else
- return NULL;
+ int rc = -1;
+
+ switch (transfer_mode)
+ {
+ case TRANSFER_MODE_COPY:
+ rc = pg_copy_file(src, dst, true);
+ break;
+ case TRANSFER_MODE_CLONE:
+ rc = upg_clone_file(src, dst);
+ break;
+ case TRANSFER_MODE_LINK:
+ rc = pg_link_file(src, dst);
+ break;
+ }
+
+ return (rc < 0) ? getErrorText(errno) : NULL;
+ }
+ else if (transfer_mode != TRANSFER_MODE_COPY)
+ {
+ return "Cannot in-place update this cluster, "
+ "page-by-page (copy-mode) conversion is required";
}
else
{
@@ -100,29 +126,6 @@ copyAndUpdateFile(pageCnvCtx *pageConverter,
}
-/*
- * linkAndUpdateFile()
- *
- * Creates a hard link between the given relation files. We use
- * this function to perform a true in-place update. If the on-disk
- * format of the new cluster is bit-for-bit compatible with the on-disk
- * format of the old cluster, we can simply link each relation
- * instead of copying the data from the old cluster to the new cluster.
- */
-const char *
-linkAndUpdateFile(pageCnvCtx *pageConverter,
- const char *src, const char *dst)
-{
- if (pageConverter != NULL)
- return "Cannot in-place update this cluster, page-by-page conversion is required";
-
- if (pg_link_file(src, dst) == -1)
- return getErrorText(errno);
- else
- return NULL;
-}
-
-
#ifndef WIN32
static int
copy_file(const char *srcfile, const char *dstfile, bool force)
@@ -228,6 +231,64 @@ win32_pghardlink(const char *src, const char *dst)
#endif
+int
+upg_clone_file(const char *existing_file, const char *new_file)
+{
+#ifdef BTRFS_IOC_CLONE
+ int rc, res_errno = 0, src_fd = -1, dest_fd = -1;
+
+ src_fd = open(existing_file, O_RDONLY);
+ if (src_fd < 0)
+ return -1;
+
+ dest_fd = open(new_file, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
+ if (dest_fd < 0)
+ {
+ close(src_fd);
+ return -1;
+ }
+
+ rc = ioctl(dest_fd, BTRFS_IOC_CLONE, src_fd);
+ if (rc < 0)
+ {
+ pg_log(PG_REPORT, "btrfs clone: %s\n", strerror(errno));
+ res_errno = errno; /* save errno for caller */
+ unlink(new_file);
+ }
+
+ close(dest_fd);
+ close(src_fd);
+
+ errno = res_errno; /* restore errno after close() calls */
+ return rc;
+#else
+ /* TODO: add support for zfs clones */
+ pg_log(PG_REPORT, "system does not support file cloning\n");
+ errno = ENOSYS;
+ return -1;
+#endif
+}
+
+void
+check_clone_file(void)
+{
+ char existing_file[MAXPGPATH];
+ char cloned_file[MAXPGPATH];
+
+ snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata);
+ snprintf(cloned_file, sizeof(cloned_file), "%s/PG_VERSION.linktest", new_cluster.pgdata);
+ unlink(cloned_file); /* might fail */
+
+ if (upg_clone_file(existing_file, cloned_file) == -1)
+ {
+ pg_log(PG_FATAL,
+ "Could not clone a file between old and new data directories: %s\n"
+ "File cloning is currently only supported on btrfs.\n",
+ getErrorText(errno));
+ }
+ unlink(cloned_file);
+}
+
/* fopen() file with no group/other permissions */
FILE *
fopen_priv(const char *path, const char *mode)
diff --git a/contrib/pg_upgrade/option.c b/contrib/pg_upgrade/option.c
index 2774b1e..fdf9f5c 100644
--- a/contrib/pg_upgrade/option.c
+++ b/contrib/pg_upgrade/option.c
@@ -54,6 +54,7 @@ parseCommandLine(int argc, char *argv[])
{"retain", no_argument, NULL, 'r'},
{"jobs", required_argument, NULL, 'j'},
{"verbose", no_argument, NULL, 'v'},
+ {"clone", no_argument, NULL, 1},
{NULL, 0, NULL, 0}
};
int option; /* Command line option */
@@ -186,6 +187,10 @@ parseCommandLine(int argc, char *argv[])
log_opts.verbose = true;
break;
+ case 1:
+ user_opts.transfer_mode = TRANSFER_MODE_CLONE;
+ break;
+
default:
pg_log(PG_FATAL,
"Try \"%s --help\" for more information.\n",
@@ -236,6 +241,8 @@ Options:\n\
-D, --new-datadir=DATADIR new cluster data directory\n\
-j, --jobs number of simultaneous processes or threads to use\n\
-k, --link link instead of copying files to new cluster\n\
+ --clone use copy-on-write cloning instead of copying\n\
+ files to new cluster (only supported on btrfs)\n\
-o, --old-options=OPTIONS old cluster options to pass to the server\n\
-O, --new-options=OPTIONS new cluster options to pass to the server\n\
-p, --old-port=PORT old cluster port number (default %d)\n\
diff --git a/contrib/pg_upgrade/pg_upgrade.h b/contrib/pg_upgrade/pg_upgrade.h
index 0b3ad20..6932bd6 100644
--- a/contrib/pg_upgrade/pg_upgrade.h
+++ b/contrib/pg_upgrade/pg_upgrade.h
@@ -209,12 +209,13 @@ typedef struct
} ControlData;
/*
- * Enumeration to denote link modes
+ * Enumeration to denote transfer mode
*/
typedef enum
{
TRANSFER_MODE_COPY,
- TRANSFER_MODE_LINK
+ TRANSFER_MODE_LINK,
+ TRANSFER_MODE_CLONE
} transferMode;
/*
@@ -381,12 +382,12 @@ const pageCnvCtx *setupPageConverter(void);
typedef void *pageCnvCtx;
#endif
-const char *copyAndUpdateFile(pageCnvCtx *pageConverter, const char *src,
- const char *dst, bool force);
-const char *linkAndUpdateFile(pageCnvCtx *pageConverter, const char *src,
- const char *dst);
+const char *upgradeFile(transferMode transfer_mode, const char *src,
+ const char *dst, pageCnvCtx *pageConverter);
void check_hard_link(void);
+void check_clone_file(void);
+int upg_clone_file(const char *old_file, const char *new_file);
FILE *fopen_priv(const char *path, const char *mode);
/* function.c */
diff --git a/contrib/pg_upgrade/relfilenode.c b/contrib/pg_upgrade/relfilenode.c
index a951fc9..c808313 100644
--- a/contrib/pg_upgrade/relfilenode.c
+++ b/contrib/pg_upgrade/relfilenode.c
@@ -32,7 +32,10 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr,
char *old_pgdata, char *new_pgdata)
{
pg_log(PG_REPORT, "%s user relation files\n",
- user_opts.transfer_mode == TRANSFER_MODE_LINK ? "Linking" : "Copying");
+ user_opts.transfer_mode == TRANSFER_MODE_COPY ? "Copying" :
+ user_opts.transfer_mode == TRANSFER_MODE_LINK ? "Linking" :
+ user_opts.transfer_mode == TRANSFER_MODE_CLONE ? "Cloning" :
+ "FAIL");
/*
* Transfering files by tablespace is tricky because a single database can
@@ -270,26 +273,14 @@ transfer_relfile(pageCnvCtx *pageConverter, FileNameMap *map,
/* Copying files might take some time, so give feedback. */
pg_log(PG_STATUS, "%s", old_file);
- if ((user_opts.transfer_mode == TRANSFER_MODE_LINK) && (pageConverter != NULL))
- pg_log(PG_FATAL, "This upgrade requires page-by-page conversion, "
- "you must use copy mode instead of link mode.\n");
-
- if (user_opts.transfer_mode == TRANSFER_MODE_COPY)
- {
- pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", old_file, new_file);
-
- if ((msg = copyAndUpdateFile(pageConverter, old_file, new_file, true)) != NULL)
- pg_log(PG_FATAL, "error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
- map->nspname, map->relname, old_file, new_file, msg);
- }
- else
+ msg = upgradeFile(user_opts.transfer_mode, old_file, new_file, pageConverter);
+ if (msg != NULL)
{
- pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"\n", old_file, new_file);
-
- if ((msg = linkAndUpdateFile(pageConverter, old_file, new_file)) != NULL)
- pg_log(PG_FATAL,
- "error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
- map->nspname, map->relname, old_file, new_file, msg);
+ pg_log(PG_FATAL, "error while %s relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
+ (user_opts.transfer_mode == TRANSFER_MODE_COPY ? "copying" :
+ user_opts.transfer_mode == TRANSFER_MODE_LINK ? "linking" :
+ user_opts.transfer_mode == TRANSFER_MODE_CLONE ? "cloning" :
+ "FAIL"), map->nspname, map->relname, old_file, new_file, msg);
}
}
diff --git a/doc/src/sgml/pgupgrade.sgml b/doc/src/sgml/pgupgrade.sgml
index f6cd9f0..fc9b89d 100644
--- a/doc/src/sgml/pgupgrade.sgml
+++ b/doc/src/sgml/pgupgrade.sgml
@@ -126,6 +126,13 @@
</varlistentry>
<varlistentry>
+ <term><option>--clone</option></term>
+ <listitem><para>use copy-on-write clones instead of copying or hard linking files to
+ the new cluster; this option can only be used when the old and new clusters reside
+ on the same btrfs filesystem</para></listitem>
+ </varlistentry>
+
+ <varlistentry>
<term><option>-o</option> <replaceable class="parameter">options</replaceable></term>
<term><option>--old-options</option> <replaceable class="parameter">options</replaceable></term>
<listitem><para>options to be passed directly to the
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 8aabf3c..7cd5a8d 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -317,6 +317,9 @@
/* Define to 1 if you have the `z' library (-lz). */
#undef HAVE_LIBZ
+/* Define to 1 if you have the <linux/btrfs.h> header file. */
+#undef HAVE_LINUX_BTRFS_H
+
/* Define to 1 if constants of type 'long long int' should have the suffix LL.
*/
#undef HAVE_LL_CONSTANTS
--
1.8.3.1
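
For illustration, the new mode would be invoked roughly like this once the
patch is applied (the bindir and datadir paths below are placeholders; both
data directories must reside on the same btrfs filesystem):

  pg_upgrade --clone \
    --old-bindir=/usr/lib/postgresql/9.2/bin \
    --new-bindir=/usr/lib/postgresql/9.3/bin \
    --old-datadir=/srv/btrfs/pg92/data \
    --new-datadir=/srv/btrfs/pg93/data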
Oskari Saarenmaa <os@ohmu.fi> wrote:
Add file cloning as an alternative data transfer method to pg_upgrade.
Currently only btrfs is supported, but copy-on-write cloning is also
available on at least ZFS. Cloning must be requested explicitly and if
it isn't supported by the operating system or filesystem a fatal error
is thrown.
This provides upgrade performance similar to link mode while allowing
the old cluster to be used even after the new one has been started.
Please add the patch here to make sure it gets reviewed:
https://commitfest.postgresql.org/action/commitfest_view/open
For more information on the process, see:
http://wiki.postgresql.org/wiki/CommitFest
--
Kevin Grittner
EDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On 10/01/2013 06:31 PM, Oskari Saarenmaa wrote:
Add file cloning as an alternative data transfer method to pg_upgrade.
Currently only btrfs is supported, but copy-on-write cloning is also
available on at least ZFS. Cloning must be requested explicitly and if
it isn't supported by the operating system or filesystem a fatal error
is thrown.
So, just curious, why isn't ZFS supported? It's what I am more
interested in, at least.
cheers
andrew
On 02/10/13 17:18, Andrew Dunstan wrote:
On 10/01/2013 06:31 PM, Oskari Saarenmaa wrote:
Add file cloning as an alternative data transfer method to pg_upgrade.
Currently only btrfs is supported, but copy-on-write cloning is also
available on at least ZFS. Cloning must be requested explicitly and if
it isn't supported by the operating system or filesystem a fatal error
is thrown.
So, just curious, why isn't ZFS supported? It's what I am more
interested in, at least.
No fundamental reason; I'm hoping ZFS will be supported in addition to
btrfs, but I don't have any systems with ZFS filesystems at the moment
so I haven't been able to test it or find out the mechanisms ZFS uses
for cloning. On btrfs cloning is implemented with a custom
btrfs-specific ioctl, ZFS probably has something similar which would be
pretty easy to add on top of this patch.
Added this patch to commitfest as suggested,
https://commitfest.postgresql.org/action/patch_view?id=1251
/ Oskari
On Wed, Oct 2, 2013 at 05:23:31PM +0300, Oskari Saarenmaa wrote:
On 02/10/13 17:18, Andrew Dunstan wrote:
On 10/01/2013 06:31 PM, Oskari Saarenmaa wrote:
Add file cloning as an alternative data transfer method to pg_upgrade.
Currently only btrfs is supported, but copy-on-write cloning is also
available on at least ZFS. Cloning must be requested explicitly and if
it isn't supported by the operating system or filesystem a fatal error
is thrown.
So, just curious, why isn't ZFS supported? It's what I am more
interested in, at least.
No fundamental reason; I'm hoping ZFS will be supported in addition
to btrfs, but I don't have any systems with ZFS filesystems at the
moment so I haven't been able to test it or find out the mechanisms
ZFS uses for cloning. On btrfs cloning is implemented with a custom
btrfs-specific ioctl, ZFS probably has something similar which would
be pretty easy to add on top of this patch.
Added this patch to commitfest as suggested,
https://commitfest.postgresql.org/action/patch_view?id=1251
What is the performance overhead of using a cloned data directory for a
cluster?
--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com
+ It's impossible for everything to be true. +
No fundamental reason; I'm hoping ZFS will be supported in addition to
btrfs, but I don't have any systems with ZFS filesystems at the moment
so I haven't been able to test it or find out the mechanisms ZFS uses
for cloning. On btrfs cloning is implemented with a custom
btrfs-specific ioctl, ZFS probably has something similar which would be
pretty easy to add on top of this patch.
Would you like a VM with ZFS on it? I'm pretty sure I can supply one.
--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com
On 2013-10-02 17:32, Josh Berkus wrote:
No fundamental reason; I'm hoping ZFS will be supported in addition to
btrfs, but I don't have any systems with ZFS filesystems at the moment
so I haven't been able to test it or find out the mechanisms ZFS uses
for cloning. On btrfs cloning is implemented with a custom
btrfs-specific ioctl, ZFS probably has something similar which would be
pretty easy to add on top of this patch.
Would you like a VM with ZFS on it? I'm pretty sure I can supply one.
--
Josh Berkus
PostgreSQL Experts Inc.
http://pgexperts.com
I can also supply SSH access to a FreeBSD 10 system that is totally ZFS.
--
Larry Rosenman http://www.lerctr.org/~ler
Phone: +1 214-642-9640 (c) E-Mail: ler@lerctr.org
US Mail: 108 Turvey Cove, Hutto, TX 78634-5688
03.10.2013 01:35, Larry Rosenman kirjoitti:
On 2013-10-02 17:32, Josh Berkus wrote:
No fundamental reason; I'm hoping ZFS will be supported in addition to
btrfs, but I don't have any systems with ZFS filesystems at the moment
so I haven't been able to test it or find out the mechanisms ZFS uses
for cloning. On btrfs cloning is implemented with a custom
btrfs-specific ioctl, ZFS probably has something similar which would be
pretty easy to add on top of this patch.
Would you like a VM with ZFS on it? I'm pretty sure I can supply one.
I can also supply SSH access to a FreeBSD 10 system that is totally ZFS.
Thanks for the offers, but it looks like ZFS doesn't actually implement
a similar file level clone operation. See
https://github.com/zfsonlinux/zfs/issues/405 for discussion on a feature
request for it.
ZFS does support cloning entire datasets which seem to be similar to
btrfs subvolume snapshots and could be used to set up a new data
directory for a new $PGDATA. This would require the original $PGDATA
to be a dataset/subvolume of its own and quite a bit different logic
(than just another file copy method in pg_upgrade) to initialize the new
version's $PGDATA as a snapshot/clone of the original. The way this
would work is that the original $PGDATA dataset/subvolume gets cloned to
a new location after which we move the files out of the way of the new
PG installation and run pg_upgrade in link mode. I'm not sure if
there's a good way to integrate this into pg_upgrade or if it's just
something that could be documented as a fast way to run pg_upgrade
without touching original files.
With btrfs tooling the sequence would be something like this:
btrfs subvolume snapshot /srv/pg92 /srv/pg93
mv /srv/pg93/data /srv/pg93/data92
initdb /srv/pg93/data
pg_upgrade --link \
--old-datadir=/srv/pg93/data92 \
--new-datadir=/srv/pg93/data
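
A hypothetical ZFS analogue of the same sequence might look like this
(untested sketch; the dataset names and mountpoints are made up, and it
assumes the old $PGDATA lives in its own dataset, since ZFS clones whole
datasets rather than individual files):

  zfs snapshot tank/pg92@upgrade
  zfs clone tank/pg92@upgrade tank/pg93
  mv /tank/pg93/data /tank/pg93/data92
  initdb /tank/pg93/data
  pg_upgrade --link \
    --old-datadir=/tank/pg93/data92 \
    --new-datadir=/tank/pg93/data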
/ Oskari
On Fri, Oct 4, 2013 at 10:42:46PM +0300, Oskari Saarenmaa wrote:
Thanks for the offers, but it looks like ZFS doesn't actually implement
a similar file level clone operation. See
https://github.com/zfsonlinux/zfs/issues/405 for discussion on a feature
request for it.
ZFS does support cloning entire datasets which seem to be similar to
btrfs subvolume snapshots and could be used to set up a new data
directory for a new $PGDATA. This would require the original $PGDATA
to be a dataset/subvolume of its own and quite a bit different logic
(than just another file copy method in pg_upgrade) to initialize the new
version's $PGDATA as a snapshot/clone of the original. The way this
would work is that the original $PGDATA dataset/subvolume gets cloned to
a new location after which we move the files out of the way of the new
PG installation and run pg_upgrade in link mode. I'm not sure if
there's a good way to integrate this into pg_upgrade or if it's just
something that could be documented as a fast way to run pg_upgrade
without touching original files.
With btrfs tooling the sequence would be something like this:
btrfs subvolume snapshot /srv/pg92 /srv/pg93
mv /srv/pg93/data /srv/pg93/data92
initdb /srv/pg93/data
pg_upgrade --link \
--old-datadir=/srv/pg93/data92 \
--new-datadir=/srv/pg93/data
Does btrfs support file system snapshots? If so, shouldn't people just
take a snapshot of the old data directory before the upgrade, rather
than using cloning?
--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com
+ It's impossible for everything to be true. +
05.10.2013 16:38, Bruce Momjian kirjoitti:
On Fri, Oct 4, 2013 at 10:42:46PM +0300, Oskari Saarenmaa wrote:
Thanks for the offers, but it looks like ZFS doesn't actually implement
a similar file level clone operation. See
https://github.com/zfsonlinux/zfs/issues/405 for discussion on a feature
request for it.
ZFS does support cloning entire datasets which seem to be similar to
btrfs subvolume snapshots and could be used to set up a new data
directory for a new $PGDATA. This would require the original $PGDATA
to be a dataset/subvolume of its own and quite a bit different logic
(than just another file copy method in pg_upgrade) to initialize the new
version's $PGDATA as a snapshot/clone of the original. The way this
would work is that the original $PGDATA dataset/subvolume gets cloned to
a new location after which we move the files out of the way of the new
PG installation and run pg_upgrade in link mode. I'm not sure if
there's a good way to integrate this into pg_upgrade or if it's just
something that could be documented as a fast way to run pg_upgrade
without touching original files.
With btrfs tooling the sequence would be something like this:
btrfs subvolume snapshot /srv/pg92 /srv/pg93
mv /srv/pg93/data /srv/pg93/data92
initdb /srv/pg93/data
pg_upgrade --link \
--old-datadir=/srv/pg93/data92 \
--new-datadir=/srv/pg93/data
Does btrfs support file system snapshots? If so, shouldn't people just
take a snapshot of the old data directory before the upgrade, rather
than using cloning?
Yeah, it's possible to clone an existing subvolume, but this requires
that $PGDATA is a subvolume of its own and would be a bit difficult to
integrate into existing pg_upgrade scripts.
The BTRFS_IOC_CLONE ioctl operates on file level and can be used to
clone files anywhere in a btrfs filesystem.
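
As an aside, the same file-level clone can be exercised from the shell with
GNU coreutils, which uses this ioctl for reflink copies on btrfs (example
paths only; source and destination must be on the same btrfs filesystem):

  cp --reflink=always /srv/btrfs/pg92/data/PG_VERSION /srv/btrfs/PG_VERSION.clone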
/ Oskari
On 05.10.2013 16:57, Oskari Saarenmaa wrote:
05.10.2013 16:38, Bruce Momjian kirjoitti:
On Fri, Oct 4, 2013 at 10:42:46PM +0300, Oskari Saarenmaa wrote:
Thanks for the offers, but it looks like ZFS doesn't actually implement
a similar file level clone operation. See
https://github.com/zfsonlinux/zfs/issues/405 for discussion on a feature
request for it.
ZFS does support cloning entire datasets which seem to be similar to
btrfs subvolume snapshots and could be used to set up a new data
directory for a new $PGDATA. This would require the original $PGDATA
to be a dataset/subvolume of its own and quite a bit different logic
(than just another file copy method in pg_upgrade) to initialize the new
version's $PGDATA as a snapshot/clone of the original. The way this
would work is that the original $PGDATA dataset/subvolume gets cloned to
a new location after which we move the files out of the way of the new
PG installation and run pg_upgrade in link mode. I'm not sure if
there's a good way to integrate this into pg_upgrade or if it's just
something that could be documented as a fast way to run pg_upgrade
without touching original files.
With btrfs tooling the sequence would be something like this:
btrfs subvolume snapshot /srv/pg92 /srv/pg93
mv /srv/pg93/data /srv/pg93/data92
initdb /srv/pg93/data
pg_upgrade --link \
--old-datadir=/srv/pg93/data92 \
--new-datadir=/srv/pg93/data
Does btrfs support file system snapshots? If so, shouldn't people just
take a snapshot of the old data directory before the upgrade, rather
than using cloning?
Yeah, it's possible to clone an existing subvolume, but this requires
that $PGDATA is a subvolume of its own and would be a bit difficult to
integrate into existing pg_upgrade scripts.
The BTRFS_IOC_CLONE ioctl operates on file level and can be used to
clone files anywhere in a btrfs filesystem.
Hmm, you can also do
cp --reflog -r data92 data-tmp
pg_upgrade --link --old-datadir=data92-copy --new-datadir=data-tmp
rm -rf data-tmp
That BTRFS_IOC_CLONE ioctl seems so hacky that I'd rather not get that
in our source tree. cp --reflog is much more likely to get that magic
incantation right, since it gets a lot more attention and testing than
pg_upgrade.
I'm not in favor of adding filesystem-specific tricks into pg_upgrade.
It would be nice to list these tricks in the docs, though.
- Heikki
On Fri, Nov 15, 2013 at 10:40:20AM +0200, Heikki Linnakangas wrote:
The BTRFS_IOC_CLONE ioctl operates on file level and can be used to
clone files anywhere in a btrfs filesystem.
Hmm, you can also do
cp --reflog -r data92 data-tmp
I think you mean --reflink here.
pg_upgrade --link --old-datadir=data92-copy --new-datadir=data-tmp
rm -rf data-tmp
That BTRFS_IOC_CLONE ioctl seems so hacky that I'd rather not get
that in our source tree. cp --reflog is much more likely to get that
magic incantation right, since it gets a lot more attention and
testing than pg_upgrade.
I'm not in favor of adding filesystem-specific tricks into
pg_upgrade. It would be nice to list these tricks in the docs,
though.
I have applied the attached patch which suggests the use of file system
snapshots and copy-on-write file copies.
--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com
+ Everyone has their own god. +
Attachment: pg_upgrade.diff (text/x-diff)
diff --git a/doc/src/sgml/pgupgrade.sgml b/doc/src/sgml/pgupgrade.sgml
new file mode 100644
index 3d529b2..bb3b6a0
*** a/doc/src/sgml/pgupgrade.sgml
--- b/doc/src/sgml/pgupgrade.sgml
*************** psql --username postgres --file script.s
*** 569,575 ****
the old server and run <command>rsync</> again to update the copy with any
changes to make it consistent. You might want to exclude some
files, e.g. <filename>postmaster.pid</>, as documented in <xref
! linkend="backup-lowlevel-base-backup">.
</para>
<refsect2>
--- 569,578 ----
the old server and run <command>rsync</> again to update the copy with any
changes to make it consistent. You might want to exclude some
files, e.g. <filename>postmaster.pid</>, as documented in <xref
! linkend="backup-lowlevel-base-backup">. If your file system supports
! file system snapshots or copy-on-write file copying, you can use that
! to make a backup of the old cluster, though the snapshot and copies
! must be created simultaneously or while the database server is down.
</para>
<refsect2>