pg_upgrade --copy-file-range
Hello,
I was just in a pg_upgrade unconference session at PGCon where the
lack of $SUBJECT came up. This system call gives the kernel the
option to use fast block cloning on XFS, ZFS (as of very recently),
etc, and works on Linux and FreeBSD. It's probably much the same as
--clone mode on COW file systems, except that is Linux-only. On
overwrite file systems (ie not copy-on-write, like ext4), it may also
be able to push copies down to storage hardware/network file systems.
There was something like this in the nearby large files patch set, but
in that version it just magically did it when available in --copy
mode. Now I think the user should have to opt in with
--copy-file-range, and simply to error out if it fails. It may not
work in some cases -- for example, the man page says that older Linux
systems can fail with EXDEV when you try to copy across file systems,
while newer systems will do something less efficient but still
sensible internally; also I saw a claim that some older versions had
weird bugs. Better to just expose the raw functionality and let users
say when they want it and read the error if it fails, I think.
Attachments:
0001-Add-copy-file-range-option-to-pg_upgrade.patchtext/x-patch; charset=US-ASCII; name=0001-Add-copy-file-range-option-to-pg_upgrade.patchDownload
From 571e68a2948c5bff9fa1d66f382c859fc6606829 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 2 Jun 2023 13:35:54 -0400
Subject: [PATCH] Add --copy-file-range option to pg_upgrade.
The copy_file_range() system call is available on at least Linux and
FreeBSD, and asks the kernel to use efficient ways to copy ranges of a
file. Options available to the kernel include sharing block ranges
(similar to --clone mode), and pushing down block copies to the storage
layer.
---
configure | 2 +-
configure.ac | 1 +
doc/src/sgml/ref/pgupgrade.sgml | 13 +++++++++
meson.build | 1 +
src/bin/pg_upgrade/check.c | 1 +
src/bin/pg_upgrade/file.c | 43 ++++++++++++++++++++++++++++++
src/bin/pg_upgrade/option.c | 10 +++++++
src/bin/pg_upgrade/pg_upgrade.h | 3 +++
src/bin/pg_upgrade/relfilenumber.c | 8 ++++++
src/include/pg_config.h.in | 3 +++
src/tools/msvc/Solution.pm | 1 +
11 files changed, 85 insertions(+), 1 deletion(-)
diff --git a/configure b/configure
index 1b415142d1..a620e049fa 100755
--- a/configure
+++ b/configure
@@ -15700,7 +15700,7 @@ fi
LIBS_including_readline="$LIBS"
LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
-for ac_func in backtrace_symbols copyfile getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l
+for ac_func in backtrace_symbols copyfile copy_file_range getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l
do :
as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
diff --git a/configure.ac b/configure.ac
index 09558ada0f..69b9256037 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1794,6 +1794,7 @@ LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
AC_CHECK_FUNCS(m4_normalize([
backtrace_symbols
copyfile
+ copy_file_range
getifaddrs
getpeerucred
inet_pton
diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml
index 7816b4c685..9180513307 100644
--- a/doc/src/sgml/ref/pgupgrade.sgml
+++ b/doc/src/sgml/ref/pgupgrade.sgml
@@ -240,6 +240,19 @@ PostgreSQL documentation
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><option>--copy-file-range</option></term>
+ <listitem>
+ <para>
+ Use the <function>copy_file_range</function> system call for efficient
+ copying. On some file systems this gives results similar to
+ <option>--clone</option>, sharing physical disk blocks, while on others
+ it may still copy blocks, but do so via an optimized path. At present,
+ it is supported on Linux and FreeBSD.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry>
<term><option>-?</option></term>
<term><option>--help</option></term>
diff --git a/meson.build b/meson.build
index 16b2e86646..322d8f822d 100644
--- a/meson.build
+++ b/meson.build
@@ -2404,6 +2404,7 @@ func_checks = [
['backtrace_symbols', {'dependencies': [execinfo_dep]}],
['clock_gettime', {'dependencies': [rt_dep, posix4_dep], 'define': false}],
['copyfile'],
+ ['copy_file_range'],
# gcc/clang's sanitizer helper library provides dlopen but not dlsym, thus
# when enabling asan the dlopen check doesn't notice that -ldl is actually
# required. Just checking for dlsym() ought to suffice.
diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c
index 64024e3b9e..8c4e56a568 100644
--- a/src/bin/pg_upgrade/check.c
+++ b/src/bin/pg_upgrade/check.c
@@ -199,6 +199,7 @@ check_new_cluster(void)
check_file_clone();
break;
case TRANSFER_MODE_COPY:
+ case TRANSFER_MODE_COPY_FILE_RANGE:
break;
case TRANSFER_MODE_LINK:
check_hard_link();
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index d173602882..d8f123bba6 100644
--- a/src/bin/pg_upgrade/file.c
+++ b/src/bin/pg_upgrade/file.c
@@ -10,6 +10,7 @@
#include "postgres_fe.h"
#include <sys/stat.h>
+#include <limits.h>
#include <fcntl.h>
#ifdef HAVE_COPYFILE_H
#include <copyfile.h>
@@ -140,6 +141,48 @@ copyFile(const char *src, const char *dst,
}
+/*
+ * copyFileByRange()
+ *
+ * Copies a relation file from src to dst.
+ * schemaName/relName are relation's SQL name (used for error messages only).
+ */
+void
+copyFileByRange(const char *src, const char *dst,
+ const char *schemaName, const char *relName)
+{
+#ifdef HAVE_COPY_FILE_RANGE
+ int src_fd;
+ int dest_fd;
+ ssize_t nbytes;
+
+ if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
+ pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s",
+ schemaName, relName, src, strerror(errno));
+
+ if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+ pg_file_create_mode)) < 0)
+ pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s",
+ schemaName, relName, dst, strerror(errno));
+
+ for (;;)
+ {
+ nbytes = copy_file_range(src_fd, NULL, dest_fd, NULL, SSIZE_MAX, 0);
+ if (nbytes < 0 && errno != EINTR)
+ pg_fatal("error while copying relation \"%s.%s\": could not copy file range from \"%s\" to \"%s\": %s",
+ schemaName, relName, src, dst, strerror(errno));
+ if (nbytes == 0)
+ break;
+ }
+
+ close(src_fd);
+ close(dest_fd);
+#else
+ pg_fatal("copy_file_range not supported on this platform");
+#endif
+}
+
+
/*
* linkFile()
*
diff --git a/src/bin/pg_upgrade/option.c b/src/bin/pg_upgrade/option.c
index 640361009e..0734508a2b 100644
--- a/src/bin/pg_upgrade/option.c
+++ b/src/bin/pg_upgrade/option.c
@@ -57,6 +57,7 @@ parseCommandLine(int argc, char *argv[])
{"verbose", no_argument, NULL, 'v'},
{"clone", no_argument, NULL, 1},
{"copy", no_argument, NULL, 2},
+ {"copy-file-range", no_argument, NULL, 3},
{NULL, 0, NULL, 0}
};
@@ -199,6 +200,14 @@ parseCommandLine(int argc, char *argv[])
user_opts.transfer_mode = TRANSFER_MODE_COPY;
break;
+ case 3:
+#ifdef HAVE_COPY_FILE_RANGE
+ user_opts.transfer_mode = TRANSFER_MODE_COPY_FILE_RANGE;
+#else
+ pg_fatal("copy_file_range not available on this platform");
+#endif
+ break;
+
default:
fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
os_info.progname);
@@ -289,6 +298,7 @@ usage(void)
printf(_(" -V, --version display version information, then exit\n"));
printf(_(" --clone clone instead of copying files to new cluster\n"));
printf(_(" --copy copy files to new cluster (default)\n"));
+ printf(_(" --copy-file-range copy files to new cluster with copy_file_range\n"));
printf(_(" -?, --help show this help, then exit\n"));
printf(_("\n"
"Before running pg_upgrade you must:\n"
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index 3eea0139c7..a4cb14a49f 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -234,6 +234,7 @@ typedef enum
{
TRANSFER_MODE_CLONE,
TRANSFER_MODE_COPY,
+ TRANSFER_MODE_COPY_FILE_RANGE,
TRANSFER_MODE_LINK
} transferMode;
@@ -379,6 +380,8 @@ void cloneFile(const char *src, const char *dst,
const char *schemaName, const char *relName);
void copyFile(const char *src, const char *dst,
const char *schemaName, const char *relName);
+void copyFileByRange(const char *src, const char *dst,
+ const char *schemaName, const char *relName);
void linkFile(const char *src, const char *dst,
const char *schemaName, const char *relName);
void rewriteVisibilityMap(const char *fromfile, const char *tofile,
diff --git a/src/bin/pg_upgrade/relfilenumber.c b/src/bin/pg_upgrade/relfilenumber.c
index 34bc9c1504..094a4db936 100644
--- a/src/bin/pg_upgrade/relfilenumber.c
+++ b/src/bin/pg_upgrade/relfilenumber.c
@@ -37,6 +37,9 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr,
case TRANSFER_MODE_COPY:
prep_status_progress("Copying user relation files");
break;
+ case TRANSFER_MODE_COPY_FILE_RANGE:
+ prep_status_progress("Copying user relation files with copy_file_range");
+ break;
case TRANSFER_MODE_LINK:
prep_status_progress("Linking user relation files");
break;
@@ -250,6 +253,11 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro
old_file, new_file);
copyFile(old_file, new_file, map->nspname, map->relname);
break;
+ case TRANSFER_MODE_COPY_FILE_RANGE:
+ pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\" with copy_file_range",
+ old_file, new_file);
+ copyFileByRange(old_file, new_file, map->nspname, map->relname);
+ break;
case TRANSFER_MODE_LINK:
pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"",
old_file, new_file);
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 6d572c3820..0b26836f68 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -85,6 +85,9 @@
/* Define to 1 if you have the <copyfile.h> header file. */
#undef HAVE_COPYFILE_H
+/* Define to 1 if you have the `copy_file_range' function. */
+#undef HAVE_COPY_FILE_RANGE
+
/* Define to 1 if you have the <crtdefs.h> header file. */
#undef HAVE_CRTDEFS_H
diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm
index b6d31c3583..733376a87e 100644
--- a/src/tools/msvc/Solution.pm
+++ b/src/tools/msvc/Solution.pm
@@ -230,6 +230,7 @@ sub GenerateFiles
HAVE_COMPUTED_GOTO => undef,
HAVE_COPYFILE => undef,
HAVE_COPYFILE_H => undef,
+ HAVE_COPY_FILE_RANGE => undef,
HAVE_CRTDEFS_H => undef,
HAVE_CRYPTO_LOCK => undef,
HAVE_DECL_FDATASYNC => 0,
--
2.39.2
On 02.06.23 21:30, Thomas Munro wrote:
I was just in a pg_upgrade unconference session at PGCon where the
lack of $SUBJECT came up. This system call gives the kernel the
option to use fast block cloning on XFS, ZFS (as of very recently),
etc, and works on Linux and FreeBSD. It's probably much the same as
--clone mode on COW file systems, except that is Linux-only. On
overwrite file systems (ie not copy-on-write, like ext4), it may also
be able to push copies down to storage hardware/network file systems. There was something like this in the nearby large files patch set, but
in that version it just magically did it when available in --copy
mode. Now I think the user should have to opt in with
--copy-file-range, and simply to error out if it fails. It may not
work in some cases -- for example, the man page says that older Linux
systems can fail with EXDEV when you try to copy across file systems,
while newer systems will do something less efficient but still
sensible internally; also I saw a claim that some older versions had
weird bugs. Better to just expose the raw functionality and let users
say when they want it and read the error if it fails, I think.
When we added --clone, copy_file_range() was available, but the problem
was that it was hard for the user to predict whether you'd get the fast
clone behavior or the slow copy behavior. That's the kind of thing you
want to know when planning and testing your upgrade. At the time, there
were patches passed around in Linux kernel circles that would have been
able to enforce cloning via the flags argument of copy_file_range(), but
that didn't make it to the mainline.
So, yes, being able to specify exactly which copy mechanism to use makes
sense, so that users can choose the tradeoffs.
About your patch:
I think you should have a "check" function called from
check_new_cluster(). That check function can then also handle the "not
supported" case, and you don't need to handle that in
parseCommandLine(). I suggest following the clone example for these,
since the issues there are very similar.
On Mon, Jul 3, 2023 at 7:47 PM Peter Eisentraut <peter@eisentraut.org> wrote:
When we added --clone, copy_file_range() was available, but the problem
was that it was hard for the user to predict whether you'd get the fast
clone behavior or the slow copy behavior. That's the kind of thing you
want to know when planning and testing your upgrade. At the time, there
were patches passed around in Linux kernel circles that would have been
able to enforce cloning via the flags argument of copy_file_range(), but
that didn't make it to the mainline. So, yes, being able to specify exactly which copy mechanism to use makes
sense, so that users can choose the tradeoffs.
Thanks for looking. Yeah, it is quite inconvenient for planning
purposes that it is hard for a user to know which internal strategy it
uses, but that's the interface we have (and clearly "flags" is
reserved for future usage so that might still evolve..).
About your patch:
I think you should have a "check" function called from
check_new_cluster(). That check function can then also handle the "not
supported" case, and you don't need to handle that in
parseCommandLine(). I suggest following the clone example for these,
since the issues there are very similar.
Done.
Attachments:
v2-0001-Add-copy-file-range-option-to-pg_upgrade.patchtext/x-patch; charset=US-ASCII; name=v2-0001-Add-copy-file-range-option-to-pg_upgrade.patchDownload
From 9ea1c3fc39a47f634a4fffd1ff1c9b9cf0299d65 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 2 Jun 2023 13:35:54 -0400
Subject: [PATCH v2] Add --copy-file-range option to pg_upgrade.
The copy_file_range() system call is available on at least Linux and
FreeBSD, and asks the kernel to use efficient ways to copy ranges of a
file. Options available to the kernel include sharing block ranges
(similar to --clone mode), and pushing down block copies to the storage
layer.
Reviewed-by: Peter Eisentraut <peter@eisentraut.org>
Discussion: https://postgr.es/m/CA%2BhUKGKe7Hb0-UNih8VD5UNZy5-ojxFb3Pr3xSBBL8qj2M2%3DdQ%40mail.gmail.com
---
configure | 2 +-
configure.ac | 1 +
doc/src/sgml/ref/pgupgrade.sgml | 13 +++++
meson.build | 1 +
src/bin/pg_upgrade/check.c | 3 ++
src/bin/pg_upgrade/file.c | 78 ++++++++++++++++++++++++++++++
src/bin/pg_upgrade/option.c | 7 ++-
src/bin/pg_upgrade/pg_upgrade.h | 4 ++
src/bin/pg_upgrade/relfilenumber.c | 8 +++
src/include/pg_config.h.in | 3 ++
src/tools/msvc/Solution.pm | 1 +
11 files changed, 119 insertions(+), 2 deletions(-)
diff --git a/configure b/configure
index d47e0f8b26..2076b19a1b 100755
--- a/configure
+++ b/configure
@@ -15578,7 +15578,7 @@ fi
LIBS_including_readline="$LIBS"
LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
-for ac_func in backtrace_symbols copyfile getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l
+for ac_func in backtrace_symbols copyfile copy_file_range getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l
do :
as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
diff --git a/configure.ac b/configure.ac
index 440b08d113..d0d31dd91e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1767,6 +1767,7 @@ LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
AC_CHECK_FUNCS(m4_normalize([
backtrace_symbols
copyfile
+ copy_file_range
getifaddrs
getpeerucred
inet_pton
diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml
index f17fdb1ba5..8275cc067b 100644
--- a/doc/src/sgml/ref/pgupgrade.sgml
+++ b/doc/src/sgml/ref/pgupgrade.sgml
@@ -263,6 +263,19 @@ PostgreSQL documentation
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><option>--copy-file-range</option></term>
+ <listitem>
+ <para>
+ Use the <function>copy_file_range</function> system call for efficient
+ copying. On some file systems this gives results similar to
+ <option>--clone</option>, sharing physical disk blocks, while on others
+ it may still copy blocks, but do so via an optimized path. At present,
+ it is supported on Linux and FreeBSD.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry>
<term><option>-?</option></term>
<term><option>--help</option></term>
diff --git a/meson.build b/meson.build
index 862c955453..20e7327e9e 100644
--- a/meson.build
+++ b/meson.build
@@ -2415,6 +2415,7 @@ func_checks = [
['backtrace_symbols', {'dependencies': [execinfo_dep]}],
['clock_gettime', {'dependencies': [rt_dep], 'define': false}],
['copyfile'],
+ ['copy_file_range'],
# gcc/clang's sanitizer helper library provides dlopen but not dlsym, thus
# when enabling asan the dlopen check doesn't notice that -ldl is actually
# required. Just checking for dlsym() ought to suffice.
diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c
index 21a0ff9e42..4a615edb62 100644
--- a/src/bin/pg_upgrade/check.c
+++ b/src/bin/pg_upgrade/check.c
@@ -213,6 +213,9 @@ check_new_cluster(void)
break;
case TRANSFER_MODE_COPY:
break;
+ case TRANSFER_MODE_COPY_FILE_RANGE:
+ check_copy_file_range();
+ break;
case TRANSFER_MODE_LINK:
check_hard_link();
break;
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index d173602882..e30d944be3 100644
--- a/src/bin/pg_upgrade/file.c
+++ b/src/bin/pg_upgrade/file.c
@@ -10,6 +10,7 @@
#include "postgres_fe.h"
#include <sys/stat.h>
+#include <limits.h>
#include <fcntl.h>
#ifdef HAVE_COPYFILE_H
#include <copyfile.h>
@@ -140,6 +141,45 @@ copyFile(const char *src, const char *dst,
}
+/*
+ * copyFileByRange()
+ *
+ * Copies a relation file from src to dst.
+ * schemaName/relName are relation's SQL name (used for error messages only).
+ */
+void
+copyFileByRange(const char *src, const char *dst,
+ const char *schemaName, const char *relName)
+{
+#ifdef HAVE_COPY_FILE_RANGE
+ int src_fd;
+ int dest_fd;
+ ssize_t nbytes;
+
+ if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
+ pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s",
+ schemaName, relName, src, strerror(errno));
+
+ if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+ pg_file_create_mode)) < 0)
+ pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s",
+ schemaName, relName, dst, strerror(errno));
+
+ do
+ {
+ nbytes = copy_file_range(src_fd, NULL, dest_fd, NULL, SSIZE_MAX, 0);
+ if (nbytes < 0 && errno != EINTR)
+ pg_fatal("error while copying relation \"%s.%s\": could not copy file range from \"%s\" to \"%s\": %s",
+ schemaName, relName, src, dst, strerror(errno));
+ }
+ while (nbytes > 0);
+
+ close(src_fd);
+ close(dest_fd);
+#endif
+}
+
+
/*
* linkFile()
*
@@ -358,6 +398,44 @@ check_file_clone(void)
unlink(new_link_file);
}
+void
+check_copy_file_range(void)
+{
+ char existing_file[MAXPGPATH];
+ char new_link_file[MAXPGPATH];
+
+ snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata);
+ snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.clonetest", new_cluster.pgdata);
+ unlink(new_link_file); /* might fail */
+
+#if defined(HAVE_COPY_FILE_RANGE)
+ {
+ int src_fd;
+ int dest_fd;
+
+ if ((src_fd = open(existing_file, O_RDONLY | PG_BINARY, 0)) < 0)
+ pg_fatal("could not open file \"%s\": %s",
+ existing_file, strerror(errno));
+
+ if ((dest_fd = open(new_link_file, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+ pg_file_create_mode)) < 0)
+ pg_fatal("could not create file \"%s\": %s",
+ new_link_file, strerror(errno));
+
+ if (copy_file_range(src_fd, NULL, dest_fd, NULL, SSIZE_MAX, 0) < 0)
+ pg_fatal("could not copy file range between old and new data directories: %s",
+ strerror(errno));
+
+ close(src_fd);
+ close(dest_fd);
+ }
+#else
+ pg_fatal("copy_file_range not supported on this platform");
+#endif
+
+ unlink(new_link_file);
+}
+
void
check_hard_link(void)
{
diff --git a/src/bin/pg_upgrade/option.c b/src/bin/pg_upgrade/option.c
index b9d900d0db..600ba7e8eb 100644
--- a/src/bin/pg_upgrade/option.c
+++ b/src/bin/pg_upgrade/option.c
@@ -58,7 +58,8 @@ parseCommandLine(int argc, char *argv[])
{"verbose", no_argument, NULL, 'v'},
{"clone", no_argument, NULL, 1},
{"copy", no_argument, NULL, 2},
- {"sync-method", required_argument, NULL, 3},
+ {"copy-file-range", no_argument, NULL, 3},
+ {"sync-method", required_argument, NULL, 4},
{NULL, 0, NULL, 0}
};
@@ -203,6 +204,9 @@ parseCommandLine(int argc, char *argv[])
break;
case 3:
+ user_opts.transfer_mode = TRANSFER_MODE_COPY_FILE_RANGE;
+ break;
+ case 4:
if (!parse_sync_method(optarg, &unused))
exit(1);
user_opts.sync_method = pg_strdup(optarg);
@@ -301,6 +305,7 @@ usage(void)
printf(_(" -V, --version display version information, then exit\n"));
printf(_(" --clone clone instead of copying files to new cluster\n"));
printf(_(" --copy copy files to new cluster (default)\n"));
+ printf(_(" --copy-file-range copy files to new cluster with copy_file_range\n"));
printf(_(" --sync-method=METHOD set method for syncing files to disk\n"));
printf(_(" -?, --help show this help, then exit\n"));
printf(_("\n"
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index 842f3b6cd3..25fb7dc7ad 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -234,6 +234,7 @@ typedef enum
{
TRANSFER_MODE_CLONE,
TRANSFER_MODE_COPY,
+ TRANSFER_MODE_COPY_FILE_RANGE,
TRANSFER_MODE_LINK
} transferMode;
@@ -380,11 +381,14 @@ void cloneFile(const char *src, const char *dst,
const char *schemaName, const char *relName);
void copyFile(const char *src, const char *dst,
const char *schemaName, const char *relName);
+void copyFileByRange(const char *src, const char *dst,
+ const char *schemaName, const char *relName);
void linkFile(const char *src, const char *dst,
const char *schemaName, const char *relName);
void rewriteVisibilityMap(const char *fromfile, const char *tofile,
const char *schemaName, const char *relName);
void check_file_clone(void);
+void check_copy_file_range(void);
void check_hard_link(void);
/* fopen_priv() is no longer different from fopen() */
diff --git a/src/bin/pg_upgrade/relfilenumber.c b/src/bin/pg_upgrade/relfilenumber.c
index 34bc9c1504..094a4db936 100644
--- a/src/bin/pg_upgrade/relfilenumber.c
+++ b/src/bin/pg_upgrade/relfilenumber.c
@@ -37,6 +37,9 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr,
case TRANSFER_MODE_COPY:
prep_status_progress("Copying user relation files");
break;
+ case TRANSFER_MODE_COPY_FILE_RANGE:
+ prep_status_progress("Copying user relation files with copy_file_range");
+ break;
case TRANSFER_MODE_LINK:
prep_status_progress("Linking user relation files");
break;
@@ -250,6 +253,11 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro
old_file, new_file);
copyFile(old_file, new_file, map->nspname, map->relname);
break;
+ case TRANSFER_MODE_COPY_FILE_RANGE:
+ pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\" with copy_file_range",
+ old_file, new_file);
+ copyFileByRange(old_file, new_file, map->nspname, map->relname);
+ break;
case TRANSFER_MODE_LINK:
pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"",
old_file, new_file);
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index d8a2985567..d787484259 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -81,6 +81,9 @@
/* Define to 1 if you have the <copyfile.h> header file. */
#undef HAVE_COPYFILE_H
+/* Define to 1 if you have the `copy_file_range' function. */
+#undef HAVE_COPY_FILE_RANGE
+
/* Define to 1 if you have the <crtdefs.h> header file. */
#undef HAVE_CRTDEFS_H
diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm
index a50f730260..3d72a6e4aa 100644
--- a/src/tools/msvc/Solution.pm
+++ b/src/tools/msvc/Solution.pm
@@ -229,6 +229,7 @@ sub GenerateFiles
HAVE_COMPUTED_GOTO => undef,
HAVE_COPYFILE => undef,
HAVE_COPYFILE_H => undef,
+ HAVE_COPY_FILE_RANGE => undef,
HAVE_CRTDEFS_H => undef,
HAVE_CRYPTO_LOCK => undef,
HAVE_DECL_FDATASYNC => 0,
--
2.42.0
On 08.10.23 07:15, Thomas Munro wrote:
About your patch:
I think you should have a "check" function called from
check_new_cluster(). That check function can then also handle the "not
supported" case, and you don't need to handle that in
parseCommandLine(). I suggest following the clone example for these,
since the issues there are very similar.Done.
This version looks good to me.
Tiny nit: You copy-and-pasted "%s/PG_VERSION.clonetest"; maybe choose a
different suffix.
On 13.11.23 08:15, Peter Eisentraut wrote:
On 08.10.23 07:15, Thomas Munro wrote:
About your patch:
I think you should have a "check" function called from
check_new_cluster(). That check function can then also handle the "not
supported" case, and you don't need to handle that in
parseCommandLine(). I suggest following the clone example for these,
since the issues there are very similar.Done.
This version looks good to me.
Tiny nit: You copy-and-pasted "%s/PG_VERSION.clonetest"; maybe choose a
different suffix.
Thomas, are you planning to proceed with this patch?
On Sat, Dec 23, 2023 at 9:40 AM Peter Eisentraut <peter@eisentraut.org> wrote:
On 13.11.23 08:15, Peter Eisentraut wrote:
On 08.10.23 07:15, Thomas Munro wrote:
About your patch:
I think you should have a "check" function called from
check_new_cluster(). That check function can then also handle the "not
supported" case, and you don't need to handle that in
parseCommandLine(). I suggest following the clone example for these,
since the issues there are very similar.Done.
This version looks good to me.
Tiny nit: You copy-and-pasted "%s/PG_VERSION.clonetest"; maybe choose a
different suffix. Thomas, are you planning to proceed with this patch?
Yes. Sorry for being slow... got stuck working on an imminent new
version of streaming read. I will be defrosting my commit bit and
committing this one and a few things shortly.
As it happens I was just thinking about this particular patch because
I suddenly had a strong urge to teach pg_combinebackup to use
copy_file_range. I wonder if you had the same idea...
On Sat, Dec 23, 2023 at 09:52:59AM +1300, Thomas Munro wrote:
As it happens I was just thinking about this particular patch because
I suddenly had a strong urge to teach pg_combinebackup to use
copy_file_range. I wonder if you had the same idea...
Yeah, +1. That would make copy_file_blocks() more efficient where the
code is copying 50 blocks in batches because it needs to reassign
checksums to the blocks copied.
--
Michael
Hi Thomas, Michael, Peter and -hackers,
On Sun, Dec 24, 2023 at 3:57 AM Michael Paquier <michael@paquier.xyz> wrote:
On Sat, Dec 23, 2023 at 09:52:59AM +1300, Thomas Munro wrote:
As it happens I was just thinking about this particular patch because
I suddenly had a strong urge to teach pg_combinebackup to use
copy_file_range. I wonder if you had the same idea... Yeah, +1. That would make copy_file_blocks() more efficient where the
code is copying 50 blocks in batches because it needs to reassign
checksums to the blocks copied.
I've tried to achieve what you were discussing. Actually this was my
first thought when using pg_combinebackup with larger (realistic)
backup sizes back in December. Attached is a set of very DIRTY (!)
patches that provide CoW options (--clone/--copy-range-file) to
pg_combinebackup (just like pg_upgrade to keep it in sync), while also
refactoring some related bits of code to avoid duplication.
With XFS (with reflink=1 which is default) on Linux with kernel 5.10
and ~210GB backups, I'm getting:
root@jw-test-1:/xfs# du -sm *
210229 full
250 incr.1
Today in master, the old classic read()/while() loop without
CoW/reflink optimization :
root@jw-test-1:/xfs# rm -rf outtest; sync; sync ; sync; echo 3 | sudo
tee /proc/sys/vm/drop_caches ; time /usr/pgsql17/bin/pg_combinebackup
--manifest-checksums=NONE -o outtest full incr.1
3
real 49m43.963s
user 0m0.887s
sys 2m52.697s
VS patch with "--clone" :
root@jw-test-1:/xfs# rm -rf outtest; sync; sync ; sync; echo 3 | sudo
tee /proc/sys/vm/drop_caches ; time /usr/pgsql17/bin/pg_combinebackup
--manifest-checksums=NONE --clone -o outtest full incr.1
3
real 0m39.812s
user 0m0.325s
sys 0m2.401s
So it is 49mins down to 40 seconds(!) +/-10s (3 tries) if the FS
supports CoW/reflinks (XFS, BTRFS, upcoming bcachefs?). It looks to me
that this might mean that if one actually wants to use incremental
backups (to get minimal RTO), it would be wise to only use CoW
filesystems from the start so that RTO is as low as possible.
Random patch notes:
- main meat is in v3-0002*, I hope i did not screw something seriously
- in worst case: it is opt-in through switch, so the user always can
stick to the classic copy
- no docs so far
- pg_copyfile_offload_supported() should actually be fixed if it is a
good path forward
- pgindent actually indents larger areas of code than I would like to,
any ideas or is it ok?
- not tested on Win32/MacOS/FreeBSD
- i've tested pg_upgrade manually and it seems to work and issue
correct syscalls, however some tests are failing(?). I haven't
investigated why yet due to lack of time.
Any help is appreciated.
-J.
Attachments:
v3-0001-Add-copy_file_range-3-system-call-detection.-Futu.patchapplication/octet-stream; name=v3-0001-Add-copy_file_range-3-system-call-detection.-Futu.patchDownload
From 7216586c13d9a470e1c3da7349cb7f19e318d8a3 Mon Sep 17 00:00:00 2001
From: Jakub Wartak <jakub.wartak@enterprisedb.com>
Date: Thu, 4 Jan 2024 14:43:37 +0100
Subject: [PATCH v3 1/4] Add copy_file_range(3) system call detection. Future
patches may use it to optimize copying/cloning.
Discussion: https://www.postgresql.org/message-id/flat/CA%2BhUKGJvLLNQtzb%3DZWcTsYF8kv8cR_%3DH17CX-eL8qNixeC4DAw%40mail.gmail.com#ce606227e39df74c6b2abf80b8eab04a
---
configure | 2 +-
configure.ac | 1 +
meson.build | 1 +
src/include/pg_config.h.in | 3 +++
4 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/configure b/configure
index 217704e9ca..ca30f4f96a 100755
--- a/configure
+++ b/configure
@@ -15539,7 +15539,7 @@ fi
LIBS_including_readline="$LIBS"
LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
-for ac_func in backtrace_symbols copyfile getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l
+for ac_func in backtrace_symbols copyfile copy_file_range getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l
do :
as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
diff --git a/configure.ac b/configure.ac
index e49de9e4f0..a80c83dd45 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1769,6 +1769,7 @@ LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
AC_CHECK_FUNCS(m4_normalize([
backtrace_symbols
copyfile
+ copy_file_range
getifaddrs
getpeerucred
inet_pton
diff --git a/meson.build b/meson.build
index 21abd7da85..cd69391d60 100644
--- a/meson.build
+++ b/meson.build
@@ -2419,6 +2419,7 @@ func_checks = [
['backtrace_symbols', {'dependencies': [execinfo_dep]}],
['clock_gettime', {'dependencies': [rt_dep], 'define': false}],
['copyfile'],
+ ['copy_file_range'],
# gcc/clang's sanitizer helper library provides dlopen but not dlsym, thus
# when enabling asan the dlopen check doesn't notice that -ldl is actually
# required. Just checking for dlsym() ought to suffice.
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 5f16918243..c848af34cb 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -78,6 +78,9 @@
/* Define to 1 if you have the <copyfile.h> header file. */
#undef HAVE_COPYFILE_H
+/* Define to 1 if you have the `copy_file_range' function. */
+#undef HAVE_COPY_FILE_RANGE
+
/* Define to 1 if you have the <crtdefs.h> header file. */
#undef HAVE_CRTDEFS_H
--
2.30.2
v3-0002-Confine-various-OS-copy-on-write-and-other-copy-a.patchapplication/octet-stream; name=v3-0002-Confine-various-OS-copy-on-write-and-other-copy-a.patchDownload
From 2f58f6af27c94d782925b7f38ea8845cdfb3e0d2 Mon Sep 17 00:00:00 2001
From: Jakub Wartak <jakub.wartak@enterprisedb.com>
Date: Thu, 4 Jan 2024 19:45:30 +0100
Subject: [PATCH v3 2/4] Confine various OS copy-on-write (and other copy
acceleration) methods via new internal pg_copyfile_* APIs in libpqcommon.
Later refactor pg_upgrade to use those APIs for ioctl(FICLONE).
Co-authored-by: Thomas Munro <thomas.munro@gmail.com>
Discussion: https://www.postgresql.org/message-id/flat/CA%2BhUKGJvLLNQtzb%3DZWcTsYF8kv8cR_%3DH17CX-eL8qNixeC4DAw%40mail.gmail.com#ce606227e39df74c6b2abf80b8eab04a
---
src/bin/pg_upgrade/file.c | 252 ++++++++-----------------
src/bin/pg_upgrade/relfilenumber.c | 81 ++++----
src/common/file_utils.c | 288 +++++++++++++++++++++++------
src/include/common/file_utils.h | 37 +++-
4 files changed, 380 insertions(+), 278 deletions(-)
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index d173602882..f91cc548ce 100644
--- a/src/bin/pg_upgrade/file.c
+++ b/src/bin/pg_upgrade/file.c
@@ -1,31 +1,23 @@
/*
- * file.c
+ * file.c
*
- * file system operations
+ * file system operations
*
- * Copyright (c) 2010-2023, PostgreSQL Global Development Group
- * src/bin/pg_upgrade/file.c
+ * Copyright (c) 2010-2023, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/file.c
*/
#include "postgres_fe.h"
-#include <sys/stat.h>
-#include <fcntl.h>
-#ifdef HAVE_COPYFILE_H
-#include <copyfile.h>
-#endif
-#ifdef __linux__
-#include <sys/ioctl.h>
-#include <linux/fs.h>
-#endif
-
#include "access/visibilitymapdefs.h"
#include "common/file_perm.h"
+#include "common/file_utils.h"
#include "pg_upgrade.h"
#include "storage/bufpage.h"
#include "storage/checksum.h"
#include "storage/checksum_impl.h"
-
+#include <fcntl.h>
+#include <sys/stat.h>
/*
* cloneFile()
@@ -35,127 +27,49 @@
* schemaName/relName are relation's SQL name (used for error messages only).
*/
void
-cloneFile(const char *src, const char *dst,
- const char *schemaName, const char *relName)
+cloneFile(const char *src, const char *dst, const char *schemaName,
+ const char *relName)
{
-#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
- if (copyfile(src, dst, NULL, COPYFILE_CLONE_FORCE) < 0)
- pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s",
- schemaName, relName, src, dst, strerror(errno));
-#elif defined(__linux__) && defined(FICLONE)
- int src_fd;
- int dest_fd;
-
- if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
- pg_fatal("error while cloning relation \"%s.%s\": could not open file \"%s\": %s",
- schemaName, relName, src, strerror(errno));
-
- if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
- pg_file_create_mode)) < 0)
- pg_fatal("error while cloning relation \"%s.%s\": could not create file \"%s\": %s",
- schemaName, relName, dst, strerror(errno));
-
- if (ioctl(dest_fd, FICLONE, src_fd) < 0)
- {
- int save_errno = errno;
-
- unlink(dst);
+ char action[1024];
- pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s",
- schemaName, relName, src, dst, strerror(save_errno));
- }
-
- close(src_fd);
- close(dest_fd);
-#endif
+	snprintf(action, sizeof(action), "relation \"%s.%s\"", schemaName,
+			 relName);
+ pg_copyfile_offload(src, dst, action, PG_COPYFILE_IOCTL_FICLONE);
}
-
/*
* copyFile()
*
- * Copies a relation file from src to dst.
- * schemaName/relName are relation's SQL name (used for error messages only).
+ * Copies a relation file from src to dst. schemaName/relName are relation's
+ * SQL name (used for error messages only).
*/
void
-copyFile(const char *src, const char *dst,
- const char *schemaName, const char *relName)
+copyFile(const char *src, const char *dst, const char *schemaName,
+ const char *relName)
{
-#ifndef WIN32
- int src_fd;
- int dest_fd;
- char *buffer;
-
- if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
- pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s",
- schemaName, relName, src, strerror(errno));
-
- if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
- pg_file_create_mode)) < 0)
- pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s",
- schemaName, relName, dst, strerror(errno));
-
- /* copy in fairly large chunks for best efficiency */
-#define COPY_BUF_SIZE (50 * BLCKSZ)
-
- buffer = (char *) pg_malloc(COPY_BUF_SIZE);
-
- /* perform data copying i.e read src source, write to destination */
- while (true)
- {
- ssize_t nbytes = read(src_fd, buffer, COPY_BUF_SIZE);
+ char action[128];
- if (nbytes < 0)
- pg_fatal("error while copying relation \"%s.%s\": could not read file \"%s\": %s",
- schemaName, relName, src, strerror(errno));
-
- if (nbytes == 0)
- break;
-
- errno = 0;
- if (write(dest_fd, buffer, nbytes) != nbytes)
- {
- /* if write didn't set errno, assume problem is no disk space */
- if (errno == 0)
- errno = ENOSPC;
- pg_fatal("error while copying relation \"%s.%s\": could not write file \"%s\": %s",
- schemaName, relName, dst, strerror(errno));
- }
- }
-
- pg_free(buffer);
- close(src_fd);
- close(dest_fd);
-
-#else /* WIN32 */
-
- if (CopyFile(src, dst, true) == 0)
- {
- _dosmaperr(GetLastError());
- pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s",
- schemaName, relName, src, dst, strerror(errno));
- }
-
-#endif /* WIN32 */
+	snprintf(action, sizeof(action), "relation \"%s.%s\"", schemaName,
+			 relName);
+ pg_copyfile(src, dst, action, NULL);
}
-
/*
* linkFile()
*
- * Hard-links a relation file from src to dst.
- * schemaName/relName are relation's SQL name (used for error messages only).
+ * Hard-links a relation file from src to dst. schemaName/relName are
+ * relation's SQL name (used for error messages only).
*/
void
-linkFile(const char *src, const char *dst,
- const char *schemaName, const char *relName)
+linkFile(const char *src, const char *dst, const char *schemaName,
+ const char *relName)
{
if (link(src, dst) < 0)
- pg_fatal("error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s",
+ pg_fatal("error while creating link for relation \"%s.%s\" (\"%s\" to "
+ "\"%s\"): %s",
schemaName, relName, src, dst, strerror(errno));
}
-
/*
* rewriteVisibilityMap()
*
@@ -163,14 +77,14 @@ linkFile(const char *src, const char *dst,
* schemaName/relName are relation's SQL name (used for error messages only).
*
* In versions of PostgreSQL prior to catversion 201603011, PostgreSQL's
- * visibility map included one bit per heap page; it now includes two.
- * When upgrading a cluster from before that time to a current PostgreSQL
- * version, we could refuse to copy visibility maps from the old cluster
- * to the new cluster; the next VACUUM would recreate them, but at the
- * price of scanning the entire table. So, instead, we rewrite the old
- * visibility maps in the new format. That way, the all-visible bits
- * remain set for the pages for which they were set previously. The
- * all-frozen bits are never set by this conversion; we leave that to VACUUM.
+ * visibility map included one bit per heap page; it now includes two. When
+ * upgrading a cluster from before that time to a current PostgreSQL version,
+ * we could refuse to copy visibility maps from the old cluster to the new
+ * cluster; the next VACUUM would recreate them, but at the price of scanning
+ * the entire table. So, instead, we rewrite the old visibility maps in the
+ * new format. That way, the all-visible bits remain set for the pages for
+ * which they were set previously. The all-frozen bits are never set by this
+ * conversion; we leave that to VACUUM.
*/
void
rewriteVisibilityMap(const char *fromfile, const char *tofile,
@@ -190,16 +104,19 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile,
rewriteVmBytesPerPage = (BLCKSZ - SizeOfPageHeaderData) / 2;
if ((src_fd = open(fromfile, O_RDONLY | PG_BINARY, 0)) < 0)
- pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s",
+ pg_fatal("error while copying relation \"%s.%s\": could not open file "
+ "\"%s\": %s",
schemaName, relName, fromfile, strerror(errno));
if (fstat(src_fd, &statbuf) != 0)
- pg_fatal("error while copying relation \"%s.%s\": could not stat file \"%s\": %s",
+ pg_fatal("error while copying relation \"%s.%s\": could not stat file "
+ "\"%s\": %s",
schemaName, relName, fromfile, strerror(errno));
if ((dst_fd = open(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
pg_file_create_mode)) < 0)
- pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s",
+ pg_fatal("error while copying relation \"%s.%s\": could not create file "
+ "\"%s\": %s",
schemaName, relName, tofile, strerror(errno));
/* Save old file size */
@@ -223,10 +140,12 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile,
if ((bytesRead = read(src_fd, buffer.data, BLCKSZ)) != BLCKSZ)
{
if (bytesRead < 0)
- pg_fatal("error while copying relation \"%s.%s\": could not read file \"%s\": %s",
+ pg_fatal("error while copying relation \"%s.%s\": could not read file "
+ "\"%s\": %s",
schemaName, relName, fromfile, strerror(errno));
else
- pg_fatal("error while copying relation \"%s.%s\": partial page found in file \"%s\"",
+ pg_fatal("error while copying relation \"%s.%s\": partial page found "
+ "in file \"%s\"",
schemaName, relName, fromfile);
}
@@ -260,25 +179,30 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile,
new_cur = new_vmbuf.data + SizeOfPageHeaderData;
- /* Process old page bytes one by one, and turn it into new page. */
+ /*
+ * Process old page bytes one by one, and turn it into new page.
+ */
while (old_cur < old_break)
{
uint8 byte = *(uint8 *) old_cur;
uint16 new_vmbits = 0;
int i;
- /* Generate new format bits while keeping old information */
+ /*
+ * Generate new format bits while keeping old information
+ */
for (i = 0; i < BITS_PER_BYTE; i++)
{
if (byte & (1 << i))
{
empty = false;
- new_vmbits |=
- VISIBILITYMAP_ALL_VISIBLE << (BITS_PER_HEAPBLOCK * i);
+ new_vmbits |= VISIBILITYMAP_ALL_VISIBLE << (BITS_PER_HEAPBLOCK * i);
}
}
- /* Copy new visibility map bytes to new-format page */
+ /*
+ * Copy new visibility map bytes to new-format page
+ */
new_cur[0] = (char) (new_vmbits & 0xFF);
new_cur[1] = (char) (new_vmbits >> 8);
@@ -286,11 +210,15 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile,
new_cur += BITS_PER_HEAPBLOCK;
}
- /* If the last part of the last page is empty, skip writing it */
+ /*
+ * If the last part of the last page is empty, skip writing it
+ */
if (old_lastpart && empty)
break;
- /* Set new checksum for visibility map page, if enabled */
+ /*
+ * Set new checksum for visibility map page, if enabled
+ */
if (new_cluster.controldata.data_checksum_version != 0)
((PageHeader) new_vmbuf.data)->pd_checksum =
pg_checksum_page(new_vmbuf.data, new_blkno);
@@ -298,10 +226,13 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile,
errno = 0;
if (write(dst_fd, new_vmbuf.data, BLCKSZ) != BLCKSZ)
{
- /* if write didn't set errno, assume problem is no disk space */
+ /*
+ * if write didn't set errno, assume problem is no disk space
+ */
if (errno == 0)
errno = ENOSPC;
- pg_fatal("error while copying relation \"%s.%s\": could not write file \"%s\": %s",
+ pg_fatal("error while copying relation \"%s.%s\": could not write file "
+ "\"%s\": %s",
schemaName, relName, tofile, strerror(errno));
}
@@ -322,40 +253,15 @@ check_file_clone(void)
char existing_file[MAXPGPATH];
char new_link_file[MAXPGPATH];
- snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata);
- snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.clonetest", new_cluster.pgdata);
+ snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION",
+ old_cluster.pgdata);
+ snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.clonetest",
+ new_cluster.pgdata);
unlink(new_link_file); /* might fail */
-#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
- if (copyfile(existing_file, new_link_file, NULL, COPYFILE_CLONE_FORCE) < 0)
- pg_fatal("could not clone file between old and new data directories: %s",
- strerror(errno));
-#elif defined(__linux__) && defined(FICLONE)
- {
- int src_fd;
- int dest_fd;
-
- if ((src_fd = open(existing_file, O_RDONLY | PG_BINARY, 0)) < 0)
- pg_fatal("could not open file \"%s\": %s",
- existing_file, strerror(errno));
-
- if ((dest_fd = open(new_link_file, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
- pg_file_create_mode)) < 0)
- pg_fatal("could not create file \"%s\": %s",
- new_link_file, strerror(errno));
-
- if (ioctl(dest_fd, FICLONE, src_fd) < 0)
- pg_fatal("could not clone file between old and new data directories: %s",
- strerror(errno));
-
- close(src_fd);
- close(dest_fd);
- }
-#else
- pg_fatal("file cloning not supported on this platform");
-#endif
-
- unlink(new_link_file);
+ /* will throw error in case it is not supported */
+ pg_copyfile_offload_supported(existing_file, new_link_file, NULL,
+ PG_COPYFILE_IOCTL_FICLONE);
}
void
@@ -364,13 +270,17 @@ check_hard_link(void)
char existing_file[MAXPGPATH];
char new_link_file[MAXPGPATH];
- snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata);
- snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.linktest", new_cluster.pgdata);
+ snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION",
+ old_cluster.pgdata);
+ snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.linktest",
+ new_cluster.pgdata);
unlink(new_link_file); /* might fail */
if (link(existing_file, new_link_file) < 0)
- pg_fatal("could not create hard link between old and new data directories: %s\n"
- "In link mode the old and new data directories must be on the same file system.",
+ pg_fatal(
+ "could not create hard link between old and new data directories: %s\n"
+ "In link mode the old and new data directories must be on the same "
+ "file system.",
strerror(errno));
unlink(new_link_file);
diff --git a/src/bin/pg_upgrade/relfilenumber.c b/src/bin/pg_upgrade/relfilenumber.c
index 34bc9c1504..d61fb77bdf 100644
--- a/src/bin/pg_upgrade/relfilenumber.c
+++ b/src/bin/pg_upgrade/relfilenumber.c
@@ -15,15 +15,16 @@
#include "catalog/pg_class_d.h"
#include "pg_upgrade.h"
-static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace);
-static void transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit);
-
+static void transfer_single_new_db(FileNameMap *maps, int size,
+ char *old_tablespace);
+static void transfer_relfile(FileNameMap *map, const char *type_suffix,
+ bool vm_must_add_frozenbit);
/*
* transfer_all_new_tablespaces()
*
- * Responsible for upgrading all database. invokes routines to generate mappings and then
- * physically link the databases.
+ * Responsible for upgrading all databases. Invokes routines to generate
+ * mappings and then physically link the databases.
*/
void
transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr,
@@ -40,6 +41,9 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr,
case TRANSFER_MODE_LINK:
prep_status_progress("Linking user relation files");
break;
+ case TRANSFER_MODE_COPY_FILE_RANGE:
+ prep_status_progress("Copying user relation files with copy_file_range");
+ break;
}
/*
@@ -61,9 +65,7 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr,
new_pgdata, old_pgdata);
for (tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++)
- parallel_transfer_all_new_dbs(old_db_arr,
- new_db_arr,
- old_pgdata,
+ parallel_transfer_all_new_dbs(old_db_arr, new_db_arr, old_pgdata,
new_pgdata,
os_info.old_tablespaces[tblnum]);
/* reap all children */
@@ -75,23 +77,22 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr,
check_ok();
}
-
/*
* transfer_all_new_dbs()
*
- * Responsible for upgrading all database. invokes routines to generate mappings and then
- * physically link the databases.
+ * Responsible for upgrading all databases. Invokes routines to generate
+ * mappings and then physically link the databases.
*/
void
transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr,
- char *old_pgdata, char *new_pgdata, char *old_tablespace)
+ char *old_pgdata, char *new_pgdata,
+ char *old_tablespace)
{
int old_dbnum,
new_dbnum;
/* Scan the old cluster databases and transfer their files */
- for (old_dbnum = new_dbnum = 0;
- old_dbnum < old_db_arr->ndbs;
+ for (old_dbnum = new_dbnum = 0; old_dbnum < old_db_arr->ndbs;
old_dbnum++, new_dbnum++)
{
DbInfo *old_db = &old_db_arr->dbs[old_dbnum],
@@ -115,8 +116,8 @@ transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr,
pg_fatal("old database \"%s\" not found in the new cluster",
old_db->db_name);
- mappings = gen_db_file_maps(old_db, new_db, &n_maps, old_pgdata,
- new_pgdata);
+ mappings =
+ gen_db_file_maps(old_db, new_db, &n_maps, old_pgdata, new_pgdata);
if (n_maps)
{
transfer_single_new_db(mappings, n_maps, old_tablespace);
@@ -132,7 +133,8 @@ transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr,
* create links for mappings stored in "maps" array.
*/
static void
-transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
+transfer_single_new_db(FileNameMap *maps, int size,
+ char *old_tablespace)
{
int mapnum;
bool vm_must_add_frozenbit = false;
@@ -161,7 +163,6 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
}
}
-
/*
* transfer_relfile()
*
@@ -170,7 +171,8 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
* mode.
*/
static void
-transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit)
+transfer_relfile(FileNameMap *map, const char *type_suffix,
+ bool vm_must_add_frozenbit)
{
char old_file[MAXPGPATH];
char new_file[MAXPGPATH];
@@ -190,20 +192,12 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro
else
snprintf(extent_suffix, sizeof(extent_suffix), ".%d", segno);
- snprintf(old_file, sizeof(old_file), "%s%s/%u/%u%s%s",
- map->old_tablespace,
- map->old_tablespace_suffix,
- map->db_oid,
- map->relfilenumber,
- type_suffix,
- extent_suffix);
- snprintf(new_file, sizeof(new_file), "%s%s/%u/%u%s%s",
- map->new_tablespace,
- map->new_tablespace_suffix,
- map->db_oid,
- map->relfilenumber,
- type_suffix,
- extent_suffix);
+ snprintf(old_file, sizeof(old_file), "%s%s/%u/%u%s%s", map->old_tablespace,
+ map->old_tablespace_suffix, map->db_oid, map->relfilenumber,
+ type_suffix, extent_suffix);
+ snprintf(new_file, sizeof(new_file), "%s%s/%u/%u%s%s", map->new_tablespace,
+ map->new_tablespace_suffix, map->db_oid, map->relfilenumber,
+ type_suffix, extent_suffix);
/* Is it an extent, fsm, or vm file? */
if (type_suffix[0] != '\0' || segno != 0)
@@ -215,7 +209,8 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro
if (errno == ENOENT)
return;
else
- pg_fatal("error while checking for file existence \"%s.%s\" (\"%s\" to \"%s\"): %s",
+ pg_fatal("error while checking for file existence \"%s.%s\" (\"%s\" "
+ "to \"%s\"): %s",
map->nspname, map->relname, old_file, new_file,
strerror(errno));
}
@@ -233,27 +228,29 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro
if (vm_must_add_frozenbit && strcmp(type_suffix, "_vm") == 0)
{
/* Need to rewrite visibility map format */
- pg_log(PG_VERBOSE, "rewriting \"%s\" to \"%s\"",
- old_file, new_file);
+ pg_log(PG_VERBOSE, "rewriting \"%s\" to \"%s\"", old_file, new_file);
rewriteVisibilityMap(old_file, new_file, map->nspname, map->relname);
}
else
switch (user_opts.transfer_mode)
{
case TRANSFER_MODE_CLONE:
- pg_log(PG_VERBOSE, "cloning \"%s\" to \"%s\"",
- old_file, new_file);
+ pg_log(PG_VERBOSE, "cloning \"%s\" to \"%s\"", old_file, new_file);
cloneFile(old_file, new_file, map->nspname, map->relname);
break;
case TRANSFER_MODE_COPY:
- pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"",
- old_file, new_file);
+ pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"", old_file, new_file);
copyFile(old_file, new_file, map->nspname, map->relname);
break;
case TRANSFER_MODE_LINK:
- pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"",
- old_file, new_file);
+ pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"", old_file, new_file);
linkFile(old_file, new_file, map->nspname, map->relname);
+ break;
+ case TRANSFER_MODE_COPY_FILE_RANGE:
+ pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\" with copy_file_range",
+ old_file, new_file);
+ copyFileByRange(old_file, new_file, map->nspname, map->relname);
+ break;
}
}
}
diff --git a/src/common/file_utils.c b/src/common/file_utils.c
index 5380299f35..25f9a48b21 100644
--- a/src/common/file_utils.c
+++ b/src/common/file_utils.c
@@ -24,10 +24,19 @@
#include <sys/stat.h>
#include <unistd.h>
+#include "common/file_perm.h"
#include "common/file_utils.h"
#ifdef FRONTEND
+#include "common/checksum_helper.h"
#include "common/logging.h"
+#ifdef HAVE_COPYFILE_H
+#include <copyfile.h>
#endif
+#ifdef __linux__
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#endif
+#endif /* FRONTEND */
#include "port/pg_iovec.h"
#ifdef FRONTEND
@@ -42,7 +51,7 @@
/*
* pg_xlog has been renamed to pg_wal in version 10.
*/
-#define MINIMUM_VERSION_FOR_PG_WAL 100000
+#define MINIMUM_VERSION_FOR_PG_WAL 100000
#ifdef PG_FLUSH_DATA_WORKS
static int pre_sync_fname(const char *fname, bool isdir);
@@ -94,8 +103,7 @@ do_syncfs(const char *path)
* serverVersion indicates the version of the server to be sync'd.
*/
void
-sync_pgdata(const char *pg_data,
- int serverVersion,
+sync_pgdata(const char *pg_data, int serverVersion,
DataDirSyncMethod sync_method)
{
bool xlog_is_symlink;
@@ -127,8 +135,7 @@ sync_pgdata(const char *pg_data,
case DATA_DIR_SYNC_METHOD_SYNCFS:
{
#ifndef HAVE_SYNCFS
- pg_log_error("this build does not support sync method \"%s\"",
- "syncfs");
+ pg_log_error("this build does not support sync method \"%s\"", "syncfs");
exit(EXIT_FAILURE);
#else
DIR *dir;
@@ -145,29 +152,27 @@ sync_pgdata(const char *pg_data,
/* Sync the top level pgdata directory. */
do_syncfs(pg_data);
- /* If any tablespaces are configured, sync each of those. */
+ /*
+ * If any tablespaces are configured, sync each of those.
+ */
dir = opendir(pg_tblspc);
if (dir == NULL)
- pg_log_error("could not open directory \"%s\": %m",
- pg_tblspc);
+ pg_log_error("could not open directory \"%s\": %m", pg_tblspc);
else
{
while (errno = 0, (de = readdir(dir)) != NULL)
{
char subpath[MAXPGPATH * 2];
- if (strcmp(de->d_name, ".") == 0 ||
- strcmp(de->d_name, "..") == 0)
+ if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
continue;
- snprintf(subpath, sizeof(subpath), "%s/%s",
- pg_tblspc, de->d_name);
+ snprintf(subpath, sizeof(subpath), "%s/%s", pg_tblspc, de->d_name);
do_syncfs(subpath);
}
if (errno)
- pg_log_error("could not read directory \"%s\": %m",
- pg_tblspc);
+ pg_log_error("could not read directory \"%s\": %m", pg_tblspc);
(void) closedir(dir);
}
@@ -176,8 +181,7 @@ sync_pgdata(const char *pg_data,
if (xlog_is_symlink)
do_syncfs(pg_wal);
#endif /* HAVE_SYNCFS */
- }
- break;
+ } break;
case DATA_DIR_SYNC_METHOD_FSYNC:
{
@@ -206,8 +210,7 @@ sync_pgdata(const char *pg_data,
if (xlog_is_symlink)
walkdir(pg_wal, fsync_fname, false);
walkdir(pg_tblspc, fsync_fname, true);
- }
- break;
+ } break;
}
}
@@ -224,8 +227,7 @@ sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method)
case DATA_DIR_SYNC_METHOD_SYNCFS:
{
#ifndef HAVE_SYNCFS
- pg_log_error("this build does not support sync method \"%s\"",
- "syncfs");
+ pg_log_error("this build does not support sync method \"%s\"", "syncfs");
exit(EXIT_FAILURE);
#else
/*
@@ -234,8 +236,7 @@ sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method)
*/
do_syncfs(dir);
#endif /* HAVE_SYNCFS */
- }
- break;
+ } break;
case DATA_DIR_SYNC_METHOD_FSYNC:
{
@@ -248,20 +249,19 @@ sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method)
#endif
walkdir(dir, fsync_fname, false);
- }
- break;
+ } break;
}
}
/*
- * walkdir: recursively walk a directory, applying the action to each
- * regular file and directory (including the named directory itself).
+ * walkdir: recursively walk a directory, applying the action to each regular
+ * file and directory (including the named directory itself).
*
- * If process_symlinks is true, the action and recursion are also applied
- * to regular files and directories that are pointed to by symlinks in the
- * given directory; otherwise symlinks are ignored. Symlinks are always
- * ignored in subdirectories, ie we intentionally don't pass down the
- * process_symlinks flag to recursive calls.
+ * If process_symlinks is true, the action and recursion are also applied to
+ * regular files and directories that are pointed to by symlinks in the given
+ * directory; otherwise symlinks are ignored. Symlinks are always ignored in
+ * subdirectories, ie we intentionally don't pass down the process_symlinks
+ * flag to recursive calls.
*
* Errors are reported but not considered fatal.
*
@@ -286,8 +286,7 @@ walkdir(const char *path,
{
char subpath[MAXPGPATH * 2];
- if (strcmp(de->d_name, ".") == 0 ||
- strcmp(de->d_name, "..") == 0)
+ if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
continue;
snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
@@ -371,8 +370,8 @@ pre_sync_fname(const char *fname, bool isdir)
* fsync_fname -- Try to fsync a file or directory
*
* Ignores errors trying to open unreadable files, or trying to fsync
- * directories on systems where that isn't allowed/required. All other errors
- * are fatal.
+ * directories on systems where that isn't allowed/required. All other
+ * errors are fatal.
*/
int
fsync_fname(const char *fname, bool isdir)
@@ -427,8 +426,8 @@ fsync_fname(const char *fname, bool isdir)
/*
* fsync_parent_path -- fsync the parent path of a file or directory
*
- * This is aimed at making file operations persistent on disk in case of
- * an OS crash or power failure.
+ * This is aimed at making file operations persistent on disk in case of an
+ * OS crash or power failure.
*/
int
fsync_parent_path(const char *fname)
@@ -453,7 +452,8 @@ fsync_parent_path(const char *fname)
}
/*
- * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
+ * durable_rename -- rename(2) wrapper, issuing fsyncs required for
+ * durability
*
* Wrapper around rename, similar to the backend version.
*/
@@ -495,8 +495,8 @@ durable_rename(const char *oldfile, const char *newfile)
/* Time to do the real deal... */
if (rename(oldfile, newfile) != 0)
{
- pg_log_error("could not rename file \"%s\" to \"%s\": %m",
- oldfile, newfile);
+ pg_log_error("could not rename file \"%s\" to \"%s\": %m", oldfile,
+ newfile);
return -1;
}
@@ -513,6 +513,186 @@ durable_rename(const char *oldfile, const char *newfile)
return 0;
}
+/* Helper function to optionally prepend error string */
+static inline char *
+opt_errinfo(const char *addon_errmsg)
+{
+	static char buf[128];	/* static: the buffer must outlive the return */
+
+	if (addon_errmsg == NULL)
+		return "";
+
+	strcpy(buf, " ");
+	return strncat(buf, addon_errmsg, sizeof(buf) - 2);
+}
+
+/*
+ * Copies a relation file from src to dest. addon_errmsg is an optional
+ * addon error message (can be NULL or include schema/relName)
+ */
+void
+pg_copyfile(const char *src, const char *dest, const char *addon_errmsg,
+ pg_checksum_context *ctx)
+{
+#ifndef WIN32
+ int src_fd;
+ int dest_fd;
+ uint8 *buffer;
+
+ /* copy in fairly large chunks for best efficiency */
+ const int buffer_size = 50 * BLCKSZ;
+
+ if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
+ pg_fatal("error while copying%s: could not open file \"%s\": %s",
+ opt_errinfo(addon_errmsg), src, strerror(errno));
+
+ if ((dest_fd = open(dest, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+ pg_file_create_mode)) < 0)
+ pg_fatal("error while copying%s: could not create file \"%s\": %s",
+ opt_errinfo(addon_errmsg), dest, strerror(errno));
+
+ buffer = pg_malloc(buffer_size);
+
+ /* perform data copying i.e read src source, write to destination */
+ while (true)
+ {
+ ssize_t nbytes = read(src_fd, buffer, buffer_size);
+
+ if (nbytes < 0)
+ pg_fatal("error while copying%s: could not read file "
+ "\"%s\": %s",
+ opt_errinfo(addon_errmsg), src, strerror(errno));
+
+ if (nbytes == 0)
+ break;
+
+ errno = 0;
+ if (write(dest_fd, buffer, nbytes) != nbytes)
+ {
+ /*
+ * if write didn't set errno, assume problem is no disk space
+ */
+ if (errno == 0)
+ errno = ENOSPC;
+ pg_fatal("error while copying%s: could not write file \"%s\": %s",
+ opt_errinfo(addon_errmsg), dest, strerror(errno));
+ }
+
+		if (ctx != NULL && pg_checksum_update(ctx, buffer, nbytes) < 0)
+			pg_fatal("could not calculate checksum of file \"%s\"", dest);
+ }
+
+ pg_free(buffer);
+ close(src_fd);
+ close(dest_fd);
+
+#else /* WIN32 */
+	if (CopyFile(src, dest, true) == 0)
+	{
+		_dosmaperr(GetLastError());
+		pg_fatal("error while copying%s (\"%s\" to \"%s\"): %s",
+				 opt_errinfo(addon_errmsg), src, dest, strerror(errno));
+	}
+#endif /* WIN32 */
+}
+
+/*
+ * pg_copyfile_offload()
+ *
+ * Clones/reflinks a relation file from src to dest using variety of methods
+ *
+ * addon_errmsg can be used to pass additional information in case of errors.
+ * flags, see PG_COPYFILE_* enum in file_utils.h
+ */
+void
+pg_copyfile_offload(const char *src, const char *dest,
+ const char *addon_errmsg, CopyFileMethod flags)
+{
+
+#ifdef WIN32
+ /* on WIN32 we ignore flags, we have no other choice */
+	if (CopyFile(src, dest, true) == 0)
+	{
+		_dosmaperr(GetLastError());
+		pg_fatal("error while copying%s (\"%s\" to \"%s\"): %s",
+				 opt_errinfo(addon_errmsg), src, dest, strerror(errno));
+	}
+#elif defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
+ /* on MacOS we ignore flags, we have no other choice */
+ if (copyfile(src, dest, NULL, COPYFILE_CLONE_FORCE) < 0)
+ pg_fatal("error while cloning%s: (\"%s\" to \"%s\"): %s",
+ opt_errinfo(addon_errmsg), src, dest, strerror(errno));
+
+#elif defined(HAVE_COPY_FILE_RANGE) || defined(FICLONE)
+ int src_fd;
+ int dest_fd;
+ ssize_t nbytes;
+
+ if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
+ pg_fatal("error while copying%s: could not open file \"%s\": %s",
+ opt_errinfo(addon_errmsg), src, strerror(errno));
+
+ if ((dest_fd = open(dest, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+ pg_file_create_mode)) < 0)
+ pg_fatal("error while copying%s: could not create file \"%s\": %s",
+ opt_errinfo(addon_errmsg), dest, strerror(errno));
+
+ if (flags & PG_COPYFILE_COPY_FILE_RANGE)
+ {
+#ifdef HAVE_COPY_FILE_RANGE
+		do
+		{
+			nbytes = copy_file_range(src_fd, NULL, dest_fd, NULL, SSIZE_MAX, 0);
+			if (nbytes < 0 && errno != EINTR)
+				pg_fatal("error while copying%s: could not copy_file_range() "
+						 "from \"%s\" to \"%s\": %s",
+						 opt_errinfo(addon_errmsg), src, dest, strerror(errno));
+		} while (nbytes != 0);
+#else
+		pg_fatal("copy file acceleration via copy_file_range() is not supported on "
+				 "this platform");
+#endif
+ }
+ else if (flags & PG_COPYFILE_IOCTL_FICLONE)
+ {
+#if defined(__linux__) && defined(FICLONE)
+ if (ioctl(dest_fd, FICLONE, src_fd) < 0)
+ {
+ int save_errno = errno;
+
+ unlink(dest);
+
+ pg_fatal("error while cloning%s: (\"%s\" to \"%s\"): %s",
+ opt_errinfo(addon_errmsg), src, dest, strerror(save_errno));
+ }
+#else
+		pg_fatal("clone file acceleration via ioctl(FICLONE) is not supported on "
+				 "this platform");
+#endif
+ }
+
+ close(src_fd);
+ close(dest_fd);
+
+#else
+	if (flags & PG_COPYFILE_FALLBACK)
+		pg_copyfile(src, dest, addon_errmsg, NULL);
+ else
+ pg_fatal("none of the copy file acceleration methods are supported on this "
+ "platform");
+#endif
+}
+
+/* FIXME: performs a full copy to probe for support rather than a cheap test */
+bool
+pg_copyfile_offload_supported(const char *src, const char *dst,
+ const char *addon_errmsg,
+ CopyFileMethod flags)
+{
+ pg_copyfile_offload(src, dst, addon_errmsg, flags);
+ return true;
+}
+
#endif /* FRONTEND */
/*
@@ -522,10 +702,8 @@ durable_rename(const char *oldfile, const char *newfile)
* it should be a level from elog.h.
*/
PGFileType
-get_dirent_type(const char *path,
- const struct dirent *de,
- bool look_through_symlinks,
- int elevel)
+get_dirent_type(const char *path, const struct dirent *de,
+ bool look_through_symlinks, int elevel)
{
PGFileType result;
@@ -553,7 +731,6 @@ get_dirent_type(const char *path,
struct stat fst;
int sret;
-
if (look_through_symlinks)
sret = stat(path, &fst);
else
@@ -563,11 +740,11 @@ get_dirent_type(const char *path,
{
result = PGFILETYPE_ERROR;
#ifdef FRONTEND
- pg_log_generic(elevel, PG_LOG_PRIMARY, "could not stat file \"%s\": %m", path);
+ pg_log_generic(elevel, PG_LOG_PRIMARY, "could not stat file \"%s\": %m",
+ path);
#else
- ereport(elevel,
- (errcode_for_file_access(),
- errmsg("could not stat file \"%s\": %m", path)));
+ ereport(elevel, (errcode_for_file_access(),
+ errmsg("could not stat file \"%s\": %m", path)));
#endif
}
else if (S_ISREG(fst.st_mode))
@@ -586,12 +763,12 @@ get_dirent_type(const char *path,
* write. The part of 'source' beginning after 'transferred' bytes is copied
* to 'destination', and its length is returned. 'source' and 'destination'
* may point to the same array, for in-place adjustment. A return value of
- * zero indicates completion (for callers without a cheaper way to know that).
+ * zero indicates completion (for callers without a cheaper way to know
+ * that).
*/
int
compute_remaining_iovec(struct iovec *destination,
- const struct iovec *source,
- int iovcnt,
+ const struct iovec *source, int iovcnt,
size_t transferred)
{
Assert(iovcnt > 0);
@@ -634,7 +811,8 @@ compute_remaining_iovec(struct iovec *destination,
* error is returned, it is unspecified how much has been written.
*/
ssize_t
-pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt,
+ off_t offset)
{
struct iovec iov_copy[PG_IOV_MAX];
ssize_t sum = 0;
@@ -680,8 +858,8 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
* Writes zeros to file worth "size" bytes at "offset" (from the start of the
* file), using vectored I/O.
*
- * Returns the total amount of data written. On failure, a negative value
- * is returned with errno set.
+ * Returns the total amount of data written. On failure, a negative value is
+ * returned with errno set.
*/
ssize_t
pg_pwrite_zeros(int fd, size_t size, off_t offset)
diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h
index 02a940e310..0747109217 100644
--- a/src/include/common/file_utils.h
+++ b/src/include/common/file_utils.h
@@ -30,31 +30,48 @@ typedef enum DataDirSyncMethod
DATA_DIR_SYNC_METHOD_SYNCFS,
} DataDirSyncMethod;
+typedef enum CopyFileMethod
+{
+	PG_COPYFILE_FALLBACK = 0x1,
+	PG_COPYFILE_IOCTL_FICLONE = 0x2,	/* Linux */
+	PG_COPYFILE_COPY_FILE_RANGE = 0x4,	/* FreeBSD & Linux >= 4.5 */
+	PG_COPYFILE_COPYFILE_CLONE_FORCE = 0x8	/* MacOS */
+} CopyFileMethod;
+#define PG_COPYFILE_ANY_WITH_FALLBACK ((1 << 4) - 1)	/* mask of all four methods above (0xF) */
+
struct iovec; /* avoid including port/pg_iovec.h here */
#ifdef FRONTEND
+#include "c.h"
+#include "common/checksum_helper.h"
extern int fsync_fname(const char *fname, bool isdir);
extern void sync_pgdata(const char *pg_data, int serverVersion,
DataDirSyncMethod sync_method);
extern void sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method);
extern int durable_rename(const char *oldfile, const char *newfile);
extern int fsync_parent_path(const char *fname);
+
+extern void pg_copyfile(const char *src, const char *dest,
+ const char *addon_errmsg, pg_checksum_context *ctx);
+
+extern void pg_copyfile_offload(const char *src, const char *dest,
+ const char *addon_errmsg, CopyFileMethod flags);
+
+extern bool pg_copyfile_offload_supported(const char *src, const char *dst,
+ const char *addon_errmsg,
+ CopyFileMethod flags);
+
#endif
-extern PGFileType get_dirent_type(const char *path,
- const struct dirent *de,
- bool look_through_symlinks,
- int elevel);
+extern PGFileType get_dirent_type(const char *path, const struct dirent *de,
+ bool look_through_symlinks, int elevel);
extern int compute_remaining_iovec(struct iovec *destination,
- const struct iovec *source,
- int iovcnt,
+ const struct iovec *source, int iovcnt,
size_t transferred);
-extern ssize_t pg_pwritev_with_retry(int fd,
- const struct iovec *iov,
- int iovcnt,
- off_t offset);
+extern ssize_t pg_pwritev_with_retry(int fd, const struct iovec *iov,
+ int iovcnt, off_t offset);
extern ssize_t pg_pwrite_zeros(int fd, size_t size, off_t offset);
--
2.30.2
v3-0003-Add-copy-file-range-to-pg_upgrade-using-pg_copyfi.patchapplication/octet-stream; name=v3-0003-Add-copy-file-range-to-pg_upgrade-using-pg_copyfi.patchDownload
From 0e1753613897e12abd9d246176dfc0c83cdcece6 Mon Sep 17 00:00:00 2001
From: Jakub Wartak <jakub.wartak@enterprisedb.com>
Date: Fri, 5 Jan 2024 08:33:45 +0100
Subject: [PATCH v3 3/4] Add --copy-file-range to pg_upgrade using
pg_copyfile_offload(). Original patch author is Thomas.
Co-authored-by: Thomas Munro <thomas.munro@gmail.com>
Discussion: https://www.postgresql.org/message-id/flat/CA%2BhUKGJvLLNQtzb%3DZWcTsYF8kv8cR_%3DH17CX-eL8qNixeC4DAw%40mail.gmail.com#ce606227e39df74c6b2abf80b8eab04a
---
src/bin/pg_upgrade/check.c | 409 ++++++++++++++++----------------
src/bin/pg_upgrade/file.c | 37 ++-
src/bin/pg_upgrade/option.c | 158 ++++++------
src/bin/pg_upgrade/pg_upgrade.h | 150 ++++++------
4 files changed, 401 insertions(+), 353 deletions(-)
diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c
index 87c06628c6..3e6922f3fb 100644
--- a/src/bin/pg_upgrade/check.c
+++ b/src/bin/pg_upgrade/check.c
@@ -38,7 +38,6 @@ static void check_new_cluster_subscription_configuration(void);
static void check_old_cluster_for_valid_slots(bool live_check);
static void check_old_cluster_subscription_state(void);
-
/*
* fix_path_separator
* For non-Windows, just return the argument.
@@ -72,19 +71,16 @@ output_check_banner(bool live_check)
{
if (user_opts.check && live_check)
{
- pg_log(PG_REPORT,
- "Performing Consistency Checks on Old Live Server\n"
+ pg_log(PG_REPORT, "Performing Consistency Checks on Old Live Server\n"
"------------------------------------------------");
}
else
{
- pg_log(PG_REPORT,
- "Performing Consistency Checks\n"
+ pg_log(PG_REPORT, "Performing Consistency Checks\n"
"-----------------------------");
}
}
-
void
check_and_dump_old_cluster(bool live_check)
{
@@ -103,7 +99,6 @@ check_and_dump_old_cluster(bool live_check)
get_loadable_libraries();
-
/*
* Check for various failure cases
*/
@@ -218,7 +213,6 @@ check_and_dump_old_cluster(bool live_check)
stop_postmaster(false);
}
-
void
check_new_cluster(void)
{
@@ -238,6 +232,9 @@ check_new_cluster(void)
case TRANSFER_MODE_LINK:
check_hard_link();
break;
+ case TRANSFER_MODE_COPY_FILE_RANGE:
+ check_copy_file_range();
+ break;
}
check_is_install_user(&new_cluster);
@@ -251,7 +248,6 @@ check_new_cluster(void)
check_new_cluster_subscription_configuration();
}
-
void
report_clusters_compatible(void)
{
@@ -265,12 +261,12 @@ report_clusters_compatible(void)
exit(0);
}
- pg_log(PG_REPORT, "\n"
+ pg_log(PG_REPORT,
+ "\n"
"If pg_upgrade fails after this point, you must re-initdb the\n"
"new cluster before continuing.");
}
-
void
issue_warnings_and_set_wal_level(void)
{
@@ -291,7 +287,6 @@ issue_warnings_and_set_wal_level(void)
stop_postmaster(false);
}
-
void
output_completion_banner(char *deletion_script_file_name)
{
@@ -308,7 +303,8 @@ output_completion_banner(char *deletion_script_file_name)
pg_log(PG_REPORT,
"Optimizer statistics are not transferred by pg_upgrade.\n"
"Once you start the new server, consider running:\n"
- " %s/vacuumdb %s--all --analyze-in-stages", new_cluster.bindir, user_specification.data);
+ " %s/vacuumdb %s--all --analyze-in-stages",
+ new_cluster.bindir, user_specification.data);
if (deletion_script_file_name)
pg_log(PG_REPORT,
@@ -316,7 +312,8 @@ output_completion_banner(char *deletion_script_file_name)
" %s",
deletion_script_file_name);
else
- pg_log(PG_REPORT,
+ pg_log(
+ PG_REPORT,
"Could not create a script to delete the old cluster's data files\n"
"because user-defined tablespaces or the new cluster's data directory\n"
"exist in the old cluster directory. The old cluster's contents must\n"
@@ -325,7 +322,6 @@ output_completion_banner(char *deletion_script_file_name)
termPQExpBuffer(&user_specification);
}
-
void
check_cluster_versions(void)
{
@@ -341,11 +337,13 @@ check_cluster_versions(void)
*/
if (GET_MAJOR_VERSION(old_cluster.major_version) < 902)
- pg_fatal("This utility can only upgrade from PostgreSQL version %s and later.",
+ pg_fatal(
+ "This utility can only upgrade from PostgreSQL version %s and later.",
"9.2");
/* Only current PG version is supported as a target */
- if (GET_MAJOR_VERSION(new_cluster.major_version) != GET_MAJOR_VERSION(PG_VERSION_NUM))
+ if (GET_MAJOR_VERSION(new_cluster.major_version) !=
+ GET_MAJOR_VERSION(PG_VERSION_NUM))
pg_fatal("This utility can only upgrade to PostgreSQL version %s.",
PG_MAJORVERSION);
@@ -355,20 +353,22 @@ check_cluster_versions(void)
* older versions.
*/
if (old_cluster.major_version > new_cluster.major_version)
- pg_fatal("This utility cannot be used to downgrade to older major PostgreSQL versions.");
+ pg_fatal("This utility cannot be used to downgrade to older major "
+ "PostgreSQL versions.");
/* Ensure binaries match the designated data directories */
if (GET_MAJOR_VERSION(old_cluster.major_version) !=
GET_MAJOR_VERSION(old_cluster.bin_version))
- pg_fatal("Old cluster data and binary directories are from different major versions.");
+ pg_fatal("Old cluster data and binary directories are from different major "
+ "versions.");
if (GET_MAJOR_VERSION(new_cluster.major_version) !=
GET_MAJOR_VERSION(new_cluster.bin_version))
- pg_fatal("New cluster data and binary directories are from different major versions.");
+ pg_fatal("New cluster data and binary directories are from different major "
+ "versions.");
check_ok();
}
-
void
check_cluster_compatibility(bool live_check)
{
@@ -382,7 +382,6 @@ check_cluster_compatibility(bool live_check)
"the old and new port numbers must be different.");
}
-
static void
check_new_cluster_is_empty(void)
{
@@ -393,15 +392,14 @@ check_new_cluster_is_empty(void)
int relnum;
RelInfoArr *rel_arr = &new_cluster.dbarr.dbs[dbnum].rel_arr;
- for (relnum = 0; relnum < rel_arr->nrels;
- relnum++)
+ for (relnum = 0; relnum < rel_arr->nrels; relnum++)
{
/* pg_largeobject and its index should be skipped */
if (strcmp(rel_arr->rels[relnum].nspname, "pg_catalog") != 0)
- pg_fatal("New cluster database \"%s\" is not empty: found relation \"%s.%s\"",
+ pg_fatal("New cluster database \"%s\" is not empty: found relation "
+ "\"%s.%s\"",
new_cluster.dbarr.dbs[dbnum].db_name,
- rel_arr->rels[relnum].nspname,
- rel_arr->rels[relnum].relname);
+ rel_arr->rels[relnum].nspname, rel_arr->rels[relnum].relname);
}
}
}
@@ -428,8 +426,7 @@ check_for_new_tablespace_dir(void)
struct stat statbuf;
snprintf(new_tablespace_dir, MAXPGPATH, "%s%s",
- os_info.old_tablespaces[tblnum],
- new_cluster.tablespace_suffix);
+ os_info.old_tablespaces[tblnum], new_cluster.tablespace_suffix);
if (stat(new_tablespace_dir, &statbuf) == 0 || errno != ENOENT)
pg_fatal("new cluster tablespace directory already exists: \"%s\"",
@@ -452,8 +449,8 @@ create_script_for_old_cluster_deletion(char **deletion_script_file_name)
char old_cluster_pgdata[MAXPGPATH],
new_cluster_pgdata[MAXPGPATH];
- *deletion_script_file_name = psprintf("%sdelete_old_cluster.%s",
- SCRIPT_PREFIX, SCRIPT_EXT);
+ *deletion_script_file_name =
+ psprintf("%sdelete_old_cluster.%s", SCRIPT_PREFIX, SCRIPT_EXT);
strlcpy(old_cluster_pgdata, old_cluster.pgdata, MAXPGPATH);
canonicalize_path(old_cluster_pgdata);
@@ -465,7 +462,9 @@ create_script_for_old_cluster_deletion(char **deletion_script_file_name)
if (path_is_prefix_of_path(old_cluster_pgdata, new_cluster_pgdata))
{
pg_log(PG_WARNING,
- "\nWARNING: new data directory should not be inside the old data directory, i.e. %s", old_cluster_pgdata);
+ "\nWARNING: new data directory should not be inside the old data "
+ "directory, i.e. %s",
+ old_cluster_pgdata);
/* Unlink file in case it is left over from a previous run. */
unlink(*deletion_script_file_name);
@@ -489,7 +488,9 @@ create_script_for_old_cluster_deletion(char **deletion_script_file_name)
{
/* reproduce warning from CREATE TABLESPACE that is in the log */
pg_log(PG_WARNING,
- "\nWARNING: user-defined tablespace locations should not be inside the data directory, i.e. %s", old_tablespace_dir);
+ "\nWARNING: user-defined tablespace locations should not be "
+ "inside the data directory, i.e. %s",
+ old_tablespace_dir);
/* Unlink file in case it is left over from a previous run. */
unlink(*deletion_script_file_name);
@@ -502,8 +503,8 @@ create_script_for_old_cluster_deletion(char **deletion_script_file_name)
prep_status("Creating script to delete old cluster");
if ((script = fopen_priv(*deletion_script_file_name, "w")) == NULL)
- pg_fatal("could not open file \"%s\": %s",
- *deletion_script_file_name, strerror(errno));
+ pg_fatal("could not open file \"%s\": %s", *deletion_script_file_name,
+ strerror(errno));
#ifndef WIN32
/* add shebang header */
@@ -560,7 +561,6 @@ create_script_for_old_cluster_deletion(char **deletion_script_file_name)
check_ok();
}
-
/*
* check_is_install_user()
*
@@ -576,8 +576,7 @@ check_is_install_user(ClusterInfo *cluster)
prep_status("Checking database user is the install user");
/* Can't use pg_authid because only superusers can view it. */
- res = executeQueryOrDie(conn,
- "SELECT rolsuper, oid "
+ res = executeQueryOrDie(conn, "SELECT rolsuper, oid "
"FROM pg_catalog.pg_roles "
"WHERE rolname = current_user "
"AND rolname !~ '^pg_'");
@@ -589,13 +588,11 @@ check_is_install_user(ClusterInfo *cluster)
*/
if (PQntuples(res) != 1 ||
atooid(PQgetvalue(res, 0, 1)) != BOOTSTRAP_SUPERUSERID)
- pg_fatal("database user \"%s\" is not the install user",
- os_info.user);
+ pg_fatal("database user \"%s\" is not the install user", os_info.user);
PQclear(res);
- res = executeQueryOrDie(conn,
- "SELECT COUNT(*) "
+ res = executeQueryOrDie(conn, "SELECT COUNT(*) "
"FROM pg_catalog.pg_roles "
"WHERE rolname !~ '^pg_'");
@@ -617,7 +614,6 @@ check_is_install_user(ClusterInfo *cluster)
check_ok();
}
-
/*
* check_proper_datallowconn
*
@@ -639,16 +635,15 @@ check_proper_datallowconn(ClusterInfo *cluster)
prep_status("Checking database connection settings");
- snprintf(output_path, sizeof(output_path), "%s/%s",
- log_opts.basedir,
+ snprintf(output_path, sizeof(output_path), "%s/%s", log_opts.basedir,
"databases_with_datallowconn_false.txt");
conn_template1 = connectToServer(cluster, "template1");
/* get database names */
- dbres = executeQueryOrDie(conn_template1,
- "SELECT datname, datallowconn "
- "FROM pg_catalog.pg_database");
+ dbres =
+ executeQueryOrDie(conn_template1, "SELECT datname, datallowconn "
+ "FROM pg_catalog.pg_database");
i_datname = PQfnumber(dbres, "datname");
i_datallowconn = PQfnumber(dbres, "datallowconn");
@@ -675,8 +670,8 @@ check_proper_datallowconn(ClusterInfo *cluster)
if (strcmp(datallowconn, "f") == 0)
{
if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL)
- pg_fatal("could not open file \"%s\": %s",
- output_path, strerror(errno));
+ pg_fatal("could not open file \"%s\": %s", output_path,
+ strerror(errno));
fprintf(script, "%s\n", datname);
}
@@ -691,19 +686,20 @@ check_proper_datallowconn(ClusterInfo *cluster)
{
fclose(script);
pg_log(PG_REPORT, "fatal");
- pg_fatal("All non-template0 databases must allow connections, i.e. their\n"
+ pg_fatal(
+ "All non-template0 databases must allow connections, i.e. their\n"
"pg_database.datallowconn must be true. Your installation contains\n"
"non-template0 databases with their pg_database.datallowconn set to\n"
"false. Consider allowing connection for all non-template0 databases\n"
"or drop the databases which do not allow connections. A list of\n"
"databases with the problem is in the file:\n"
- " %s", output_path);
+ " %s",
+ output_path);
}
else
check_ok();
}
-
/*
* check_for_prepared_transactions()
*
@@ -718,8 +714,7 @@ check_for_prepared_transactions(ClusterInfo *cluster)
prep_status("Checking for prepared transactions");
- res = executeQueryOrDie(conn,
- "SELECT * "
+ res = executeQueryOrDie(conn, "SELECT * "
"FROM pg_catalog.pg_prepared_xacts");
if (PQntuples(res) != 0)
@@ -737,7 +732,6 @@ check_for_prepared_transactions(ClusterInfo *cluster)
check_ok();
}
-
/*
* check_for_isn_and_int8_passing_mismatch()
*
@@ -762,8 +756,7 @@ check_for_isn_and_int8_passing_mismatch(ClusterInfo *cluster)
return;
}
- snprintf(output_path, sizeof(output_path), "%s/%s",
- log_opts.basedir,
+ snprintf(output_path, sizeof(output_path), "%s/%s", log_opts.basedir,
"contrib_isn_and_int8_pass_by_value.txt");
for (dbnum = 0; dbnum < cluster->dbarr.ndbs; dbnum++)
@@ -778,8 +771,7 @@ check_for_isn_and_int8_passing_mismatch(ClusterInfo *cluster)
PGconn *conn = connectToServer(cluster, active_db->db_name);
/* Find any functions coming from contrib/isn */
- res = executeQueryOrDie(conn,
- "SELECT n.nspname, p.proname "
+ res = executeQueryOrDie(conn, "SELECT n.nspname, p.proname "
"FROM pg_catalog.pg_proc p, "
" pg_catalog.pg_namespace n "
"WHERE p.pronamespace = n.oid AND "
@@ -791,15 +783,14 @@ check_for_isn_and_int8_passing_mismatch(ClusterInfo *cluster)
for (rowno = 0; rowno < ntups; rowno++)
{
if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL)
- pg_fatal("could not open file \"%s\": %s",
- output_path, strerror(errno));
+ pg_fatal("could not open file \"%s\": %s", output_path,
+ strerror(errno));
if (!db_used)
{
fprintf(script, "In database: %s\n", active_db->db_name);
db_used = true;
}
- fprintf(script, " %s.%s\n",
- PQgetvalue(res, rowno, i_nspname),
+ fprintf(script, " %s.%s\n", PQgetvalue(res, rowno, i_nspname),
PQgetvalue(res, rowno, i_proname));
}
@@ -812,13 +803,17 @@ check_for_isn_and_int8_passing_mismatch(ClusterInfo *cluster)
{
fclose(script);
pg_log(PG_REPORT, "fatal");
- pg_fatal("Your installation contains \"contrib/isn\" functions which rely on the\n"
+ pg_fatal(
+ "Your installation contains \"contrib/isn\" functions which rely on "
+ "the\n"
"bigint data type. Your old and new clusters pass bigint values\n"
"differently so this cluster cannot currently be upgraded. You can\n"
"manually dump databases in the old cluster that use \"contrib/isn\"\n"
- "facilities, drop them, perform the upgrade, and then restore them. A\n"
+ "facilities, drop them, perform the upgrade, and then restore them. "
+ "A\n"
"list of the problem functions is in the file:\n"
- " %s", output_path);
+ " %s",
+ output_path);
}
else
check_ok();
@@ -836,8 +831,7 @@ check_for_user_defined_postfix_ops(ClusterInfo *cluster)
prep_status("Checking for user-defined postfix operators");
- snprintf(output_path, sizeof(output_path), "%s/%s",
- log_opts.basedir,
+ snprintf(output_path, sizeof(output_path), "%s/%s", log_opts.basedir,
"postfix_ops.txt");
/* Find any user defined postfix operators */
@@ -861,8 +855,7 @@ check_for_user_defined_postfix_ops(ClusterInfo *cluster)
* #define is ever changed, the cutoff we want to use is the value
* used by pre-version 14 servers, not that of some future version.
*/
- res = executeQueryOrDie(conn,
- "SELECT o.oid AS oproid, "
+ res = executeQueryOrDie(conn, "SELECT o.oid AS oproid, "
" n.nspname AS oprnsp, "
" o.oprname, "
" tn.nspname AS typnsp, "
@@ -884,20 +877,18 @@ check_for_user_defined_postfix_ops(ClusterInfo *cluster)
i_typname = PQfnumber(res, "typname");
for (rowno = 0; rowno < ntups; rowno++)
{
- if (script == NULL &&
- (script = fopen_priv(output_path, "w")) == NULL)
- pg_fatal("could not open file \"%s\": %s",
- output_path, strerror(errno));
+ if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL)
+ pg_fatal("could not open file \"%s\": %s", output_path,
+ strerror(errno));
if (!db_used)
{
fprintf(script, "In database: %s\n", active_db->db_name);
db_used = true;
}
- fprintf(script, " (oid=%s) %s.%s (%s.%s, NONE)\n",
- PQgetvalue(res, rowno, i_oproid),
- PQgetvalue(res, rowno, i_oprnsp),
- PQgetvalue(res, rowno, i_oprname),
- PQgetvalue(res, rowno, i_typnsp),
+ fprintf(
+ script, " (oid=%s) %s.%s (%s.%s, NONE)\n",
+ PQgetvalue(res, rowno, i_oproid), PQgetvalue(res, rowno, i_oprnsp),
+ PQgetvalue(res, rowno, i_oprname), PQgetvalue(res, rowno, i_typnsp),
PQgetvalue(res, rowno, i_typname));
}
@@ -910,11 +901,14 @@ check_for_user_defined_postfix_ops(ClusterInfo *cluster)
{
fclose(script);
pg_log(PG_REPORT, "fatal");
- pg_fatal("Your installation contains user-defined postfix operators, which are not\n"
- "supported anymore. Consider dropping the postfix operators and replacing\n"
+ pg_fatal("Your installation contains user-defined postfix operators, which "
+ "are not\n"
+ "supported anymore. Consider dropping the postfix operators and "
+ "replacing\n"
"them with prefix operators or function calls.\n"
"A list of user-defined postfix operators is in the file:\n"
- " %s", output_path);
+ " %s",
+ output_path);
}
else
check_ok();
@@ -936,8 +930,7 @@ check_for_incompatible_polymorphics(ClusterInfo *cluster)
prep_status("Checking for incompatible polymorphic functions");
- snprintf(output_path, sizeof(output_path), "%s/%s",
- log_opts.basedir,
+ snprintf(output_path, sizeof(output_path), "%s/%s", log_opts.basedir,
"incompatible_polymorphics.txt");
/* The set of problematic functions varies a bit in different versions */
@@ -975,7 +968,8 @@ check_for_incompatible_polymorphics(ClusterInfo *cluster)
* #define is ever changed, the cutoff we want to use is the value
* used by pre-version 14 servers, not that of some future version.
*/
- res = executeQueryOrDie(conn,
+ res = executeQueryOrDie(
+ conn,
/* Aggregate transition functions */
"SELECT 'aggregate' AS objkind, p.oid::regprocedure::text AS objname "
"FROM pg_proc AS p "
@@ -1002,9 +996,7 @@ check_for_incompatible_polymorphics(ClusterInfo *cluster)
"WHERE op.oid >= 16384 "
"AND oprcode = ANY(ARRAY[%s]::regprocedure[]) "
"AND oprleft = ANY(ARRAY['anyarray', 'anyelement']::regtype[]);",
- old_polymorphics.data,
- old_polymorphics.data,
- old_polymorphics.data);
+ old_polymorphics.data, old_polymorphics.data, old_polymorphics.data);
ntups = PQntuples(res);
@@ -1013,18 +1005,16 @@ check_for_incompatible_polymorphics(ClusterInfo *cluster)
for (int rowno = 0; rowno < ntups; rowno++)
{
- if (script == NULL &&
- (script = fopen_priv(output_path, "w")) == NULL)
- pg_fatal("could not open file \"%s\": %s",
- output_path, strerror(errno));
+ if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL)
+ pg_fatal("could not open file \"%s\": %s", output_path,
+ strerror(errno));
if (!db_used)
{
fprintf(script, "In database: %s\n", active_db->db_name);
db_used = true;
}
- fprintf(script, " %s: %s\n",
- PQgetvalue(res, rowno, i_objkind),
+ fprintf(script, " %s: %s\n", PQgetvalue(res, rowno, i_objkind),
PQgetvalue(res, rowno, i_objname));
}
@@ -1036,13 +1026,18 @@ check_for_incompatible_polymorphics(ClusterInfo *cluster)
{
fclose(script);
pg_log(PG_REPORT, "fatal");
- pg_fatal("Your installation contains user-defined objects that refer to internal\n"
- "polymorphic functions with arguments of type \"anyarray\" or \"anyelement\".\n"
- "These user-defined objects must be dropped before upgrading and restored\n"
- "afterwards, changing them to refer to the new corresponding functions with\n"
+ pg_fatal("Your installation contains user-defined objects that refer to "
+ "internal\n"
+ "polymorphic functions with arguments of type \"anyarray\" or "
+ "\"anyelement\".\n"
+ "These user-defined objects must be dropped before upgrading and "
+ "restored\n"
+ "afterwards, changing them to refer to the new corresponding "
+ "functions with\n"
"arguments of type \"anycompatiblearray\" and \"anycompatible\".\n"
"A list of the problematic objects is in the file:\n"
- " %s", output_path);
+ " %s",
+ output_path);
}
else
check_ok();
@@ -1062,8 +1057,7 @@ check_for_tables_with_oids(ClusterInfo *cluster)
prep_status("Checking for tables WITH OIDS");
- snprintf(output_path, sizeof(output_path), "%s/%s",
- log_opts.basedir,
+ snprintf(output_path, sizeof(output_path), "%s/%s", log_opts.basedir,
"tables_with_oids.txt");
/* Find any tables declared WITH OIDS */
@@ -1078,8 +1072,7 @@ check_for_tables_with_oids(ClusterInfo *cluster)
DbInfo *active_db = &cluster->dbarr.dbs[dbnum];
PGconn *conn = connectToServer(cluster, active_db->db_name);
- res = executeQueryOrDie(conn,
- "SELECT n.nspname, c.relname "
+ res = executeQueryOrDie(conn, "SELECT n.nspname, c.relname "
"FROM pg_catalog.pg_class c, "
" pg_catalog.pg_namespace n "
"WHERE c.relnamespace = n.oid AND "
@@ -1092,15 +1085,14 @@ check_for_tables_with_oids(ClusterInfo *cluster)
for (rowno = 0; rowno < ntups; rowno++)
{
if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL)
- pg_fatal("could not open file \"%s\": %s",
- output_path, strerror(errno));
+ pg_fatal("could not open file \"%s\": %s", output_path,
+ strerror(errno));
if (!db_used)
{
fprintf(script, "In database: %s\n", active_db->db_name);
db_used = true;
}
- fprintf(script, " %s.%s\n",
- PQgetvalue(res, rowno, i_nspname),
+ fprintf(script, " %s.%s\n", PQgetvalue(res, rowno, i_nspname),
PQgetvalue(res, rowno, i_relname));
}
@@ -1113,17 +1105,18 @@ check_for_tables_with_oids(ClusterInfo *cluster)
{
fclose(script);
pg_log(PG_REPORT, "fatal");
- pg_fatal("Your installation contains tables declared WITH OIDS, which is not\n"
+ pg_fatal(
+ "Your installation contains tables declared WITH OIDS, which is not\n"
"supported anymore. Consider removing the oid column using\n"
" ALTER TABLE ... SET WITHOUT OIDS;\n"
"A list of tables with the problem is in the file:\n"
- " %s", output_path);
+ " %s",
+ output_path);
}
else
check_ok();
}
-
/*
* check_for_composite_data_type_usage()
* Check for system-defined composite types used in user tables.
@@ -1143,8 +1136,7 @@ check_for_composite_data_type_usage(ClusterInfo *cluster)
prep_status("Checking for system-defined composite types in user tables");
- snprintf(output_path, sizeof(output_path), "%s/%s",
- log_opts.basedir,
+ snprintf(output_path, sizeof(output_path), "%s/%s", log_opts.basedir,
"tables_using_composite.txt");
/*
@@ -1160,7 +1152,8 @@ check_for_composite_data_type_usage(ClusterInfo *cluster)
*/
firstUserOid = 16384;
- base_query = psprintf("SELECT t.oid FROM pg_catalog.pg_type t "
+ base_query = psprintf(
+ "SELECT t.oid FROM pg_catalog.pg_type t "
"LEFT JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid "
" WHERE typtype = 'c' AND (t.oid < %u OR nspname = 'information_schema')",
firstUserOid);
@@ -1172,12 +1165,14 @@ check_for_composite_data_type_usage(ClusterInfo *cluster)
if (found)
{
pg_log(PG_REPORT, "fatal");
- pg_fatal("Your installation contains system-defined composite types in user tables.\n"
+ pg_fatal("Your installation contains system-defined composite types in "
+ "user tables.\n"
"These type OIDs are not stable across PostgreSQL versions,\n"
"so this cluster cannot currently be upgraded. You can\n"
"drop the problem columns and restart the upgrade.\n"
"A list of the problem columns is in the file:\n"
- " %s", output_path);
+ " %s",
+ output_path);
}
else
check_ok();
@@ -1202,15 +1197,15 @@ check_for_reg_data_type_usage(ClusterInfo *cluster)
prep_status("Checking for reg* data types in user tables");
- snprintf(output_path, sizeof(output_path), "%s/%s",
- log_opts.basedir,
+ snprintf(output_path, sizeof(output_path), "%s/%s", log_opts.basedir,
"tables_using_reg.txt");
/*
* Note: older servers will not have all of these reg* types, so we have
* to write the query like this rather than depending on casts to regtype.
*/
- found = check_for_data_types_usage(cluster,
+ found = check_for_data_types_usage(
+ cluster,
"SELECT oid FROM pg_catalog.pg_type t "
"WHERE t.typnamespace = "
" (SELECT oid FROM pg_catalog.pg_namespace "
@@ -1233,12 +1228,15 @@ check_for_reg_data_type_usage(ClusterInfo *cluster)
if (found)
{
pg_log(PG_REPORT, "fatal");
- pg_fatal("Your installation contains one of the reg* data types in user tables.\n"
+ pg_fatal(
+ "Your installation contains one of the reg* data types in user "
+ "tables.\n"
"These data types reference system OIDs that are not preserved by\n"
"pg_upgrade, so this cluster cannot currently be upgraded. You can\n"
"drop the problem columns and restart the upgrade.\n"
"A list of the problem columns is in the file:\n"
- " %s", output_path);
+ " %s",
+ output_path);
}
else
check_ok();
@@ -1262,12 +1260,14 @@ check_for_aclitem_data_type_usage(ClusterInfo *cluster)
if (check_for_data_type_usage(cluster, "pg_catalog.aclitem", output_path))
{
pg_log(PG_REPORT, "fatal");
- pg_fatal("Your installation contains the \"aclitem\" data type in user tables.\n"
+ pg_fatal(
+ "Your installation contains the \"aclitem\" data type in user tables.\n"
"The internal format of \"aclitem\" changed in PostgreSQL version 16\n"
"so this cluster cannot currently be upgraded. You can drop the\n"
"problem columns and restart the upgrade. A list of the problem\n"
"columns is in the file:\n"
- " %s", output_path);
+ " %s",
+ output_path);
}
else
check_ok();
@@ -1280,34 +1280,34 @@ check_for_aclitem_data_type_usage(ClusterInfo *cluster)
* the exact list.
*/
static void
-check_for_removed_data_type_usage(ClusterInfo *cluster, const char *version,
+check_for_removed_data_type_usage(ClusterInfo *cluster,
+ const char *version,
const char *datatype)
{
char output_path[MAXPGPATH];
char typename[NAMEDATALEN];
- prep_status("Checking for removed \"%s\" data type in user tables",
- datatype);
+ prep_status("Checking for removed \"%s\" data type in user tables", datatype);
- snprintf(output_path, sizeof(output_path), "tables_using_%s.txt",
- datatype);
+ snprintf(output_path, sizeof(output_path), "tables_using_%s.txt", datatype);
snprintf(typename, sizeof(typename), "pg_catalog.%s", datatype);
if (check_for_data_type_usage(cluster, typename, output_path))
{
pg_log(PG_REPORT, "fatal");
- pg_fatal("Your installation contains the \"%s\" data type in user tables.\n"
+ pg_fatal(
+ "Your installation contains the \"%s\" data type in user tables.\n"
"The \"%s\" type has been removed in PostgreSQL version %s,\n"
"so this cluster cannot currently be upgraded. You can drop the\n"
"problem columns, or change them to another data type, and restart\n"
"the upgrade. A list of the problem columns is in the file:\n"
- " %s", datatype, datatype, version, output_path);
+ " %s",
+ datatype, datatype, version, output_path);
}
else
check_ok();
}
-
/*
* check_for_jsonb_9_4_usage()
*
@@ -1320,19 +1320,20 @@ check_for_jsonb_9_4_usage(ClusterInfo *cluster)
prep_status("Checking for incompatible \"jsonb\" data type");
- snprintf(output_path, sizeof(output_path), "%s/%s",
- log_opts.basedir,
+ snprintf(output_path, sizeof(output_path), "%s/%s", log_opts.basedir,
"tables_using_jsonb.txt");
if (check_for_data_type_usage(cluster, "pg_catalog.jsonb", output_path))
{
pg_log(PG_REPORT, "fatal");
- pg_fatal("Your installation contains the \"jsonb\" data type in user tables.\n"
+ pg_fatal(
+ "Your installation contains the \"jsonb\" data type in user tables.\n"
"The internal format of \"jsonb\" changed during 9.4 beta so this\n"
"cluster cannot currently be upgraded. You can\n"
"drop the problem columns and restart the upgrade.\n"
"A list of the problem columns is in the file:\n"
- " %s", output_path);
+ " %s",
+ output_path);
}
else
check_ok();
@@ -1356,12 +1357,10 @@ check_for_pg_role_prefix(ClusterInfo *cluster)
prep_status("Checking for roles starting with \"pg_\"");
- snprintf(output_path, sizeof(output_path), "%s/%s",
- log_opts.basedir,
+ snprintf(output_path, sizeof(output_path), "%s/%s", log_opts.basedir,
"pg_role_prefix.txt");
- res = executeQueryOrDie(conn,
- "SELECT oid AS roloid, rolname "
+ res = executeQueryOrDie(conn, "SELECT oid AS roloid, rolname "
"FROM pg_catalog.pg_roles "
"WHERE rolname ~ '^pg_'");
@@ -1371,10 +1370,8 @@ check_for_pg_role_prefix(ClusterInfo *cluster)
for (int rowno = 0; rowno < ntups; rowno++)
{
if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL)
- pg_fatal("could not open file \"%s\": %s",
- output_path, strerror(errno));
- fprintf(script, "%s (oid=%s)\n",
- PQgetvalue(res, rowno, i_rolname),
+ pg_fatal("could not open file \"%s\": %s", output_path, strerror(errno));
+ fprintf(script, "%s (oid=%s)\n", PQgetvalue(res, rowno, i_rolname),
PQgetvalue(res, rowno, i_roloid));
}
@@ -1390,7 +1387,8 @@ check_for_pg_role_prefix(ClusterInfo *cluster)
"\"pg_\" is a reserved prefix for system roles. The cluster\n"
"cannot be upgraded until these roles are renamed.\n"
"A list of roles starting with \"pg_\" is in the file:\n"
- " %s", output_path);
+ " %s",
+ output_path);
}
else
check_ok();
@@ -1408,8 +1406,7 @@ check_for_user_defined_encoding_conversions(ClusterInfo *cluster)
prep_status("Checking for user-defined encoding conversions");
- snprintf(output_path, sizeof(output_path), "%s/%s",
- log_opts.basedir,
+ snprintf(output_path, sizeof(output_path), "%s/%s", log_opts.basedir,
"encoding_conversions.txt");
/* Find any user defined encoding conversions */
@@ -1431,29 +1428,27 @@ check_for_user_defined_encoding_conversions(ClusterInfo *cluster)
* #define is ever changed, the cutoff we want to use is the value
* used by pre-version 14 servers, not that of some future version.
*/
- res = executeQueryOrDie(conn,
- "SELECT c.oid as conoid, c.conname, n.nspname "
- "FROM pg_catalog.pg_conversion c, "
- " pg_catalog.pg_namespace n "
- "WHERE c.connamespace = n.oid AND "
- " c.oid >= 16384");
+ res =
+ executeQueryOrDie(conn, "SELECT c.oid as conoid, c.conname, n.nspname "
+ "FROM pg_catalog.pg_conversion c, "
+ " pg_catalog.pg_namespace n "
+ "WHERE c.connamespace = n.oid AND "
+ " c.oid >= 16384");
ntups = PQntuples(res);
i_conoid = PQfnumber(res, "conoid");
i_conname = PQfnumber(res, "conname");
i_nspname = PQfnumber(res, "nspname");
for (rowno = 0; rowno < ntups; rowno++)
{
- if (script == NULL &&
- (script = fopen_priv(output_path, "w")) == NULL)
- pg_fatal("could not open file \"%s\": %s",
- output_path, strerror(errno));
+ if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL)
+ pg_fatal("could not open file \"%s\": %s", output_path,
+ strerror(errno));
if (!db_used)
{
fprintf(script, "In database: %s\n", active_db->db_name);
db_used = true;
}
- fprintf(script, " (oid=%s) %s.%s\n",
- PQgetvalue(res, rowno, i_conoid),
+ fprintf(script, " (oid=%s) %s.%s\n", PQgetvalue(res, rowno, i_conoid),
PQgetvalue(res, rowno, i_nspname),
PQgetvalue(res, rowno, i_conname));
}
@@ -1467,12 +1462,14 @@ check_for_user_defined_encoding_conversions(ClusterInfo *cluster)
{
fclose(script);
pg_log(PG_REPORT, "fatal");
- pg_fatal("Your installation contains user-defined encoding conversions.\n"
+ pg_fatal(
+ "Your installation contains user-defined encoding conversions.\n"
"The conversion function parameters changed in PostgreSQL version 14\n"
"so this cluster cannot currently be upgraded. You can remove the\n"
"encoding conversions in the old cluster and restart the upgrade.\n"
"A list of user-defined encoding conversions is in the file:\n"
- " %s", output_path);
+ " %s",
+ output_path);
}
else
check_ok();
@@ -1524,7 +1521,8 @@ check_new_cluster_logical_replication_slots(void)
PQclear(res);
- res = executeQueryOrDie(conn, "SELECT setting FROM pg_settings "
+ res = executeQueryOrDie(
+ conn, "SELECT setting FROM pg_settings "
"WHERE name IN ('wal_level', 'max_replication_slots') "
"ORDER BY name DESC;");
@@ -1534,13 +1532,13 @@ check_new_cluster_logical_replication_slots(void)
wal_level = PQgetvalue(res, 0, 0);
if (strcmp(wal_level, "logical") != 0)
- pg_fatal("wal_level must be \"logical\", but is set to \"%s\"",
- wal_level);
+ pg_fatal("wal_level must be \"logical\", but is set to \"%s\"", wal_level);
max_replication_slots = atoi(PQgetvalue(res, 1, 0));
if (nslots_on_old > max_replication_slots)
- pg_fatal("max_replication_slots (%d) must be greater than or equal to the number of "
+ pg_fatal("max_replication_slots (%d) must be greater than or equal to the "
+ "number of "
"logical replication slots (%d) on the old cluster",
max_replication_slots, nslots_on_old);
@@ -1587,7 +1585,8 @@ check_new_cluster_subscription_configuration(void)
max_replication_slots = atoi(PQgetvalue(res, 0, 0));
if (nsubs_on_old > max_replication_slots)
- pg_fatal("max_replication_slots (%d) must be greater than or equal to the number of "
+ pg_fatal("max_replication_slots (%d) must be greater than or equal to the "
+ "number of "
"subscriptions (%d) on the old cluster",
max_replication_slots, nsubs_on_old);
@@ -1611,8 +1610,7 @@ check_old_cluster_for_valid_slots(bool live_check)
prep_status("Checking for valid logical replication slots");
- snprintf(output_path, sizeof(output_path), "%s/%s",
- log_opts.basedir,
+ snprintf(output_path, sizeof(output_path), "%s/%s", log_opts.basedir,
"invalid_logical_slots.txt");
for (int dbnum = 0; dbnum < old_cluster.dbarr.ndbs; dbnum++)
@@ -1626,13 +1624,11 @@ check_old_cluster_for_valid_slots(bool live_check)
/* Is the slot usable? */
if (slot->invalid)
{
- if (script == NULL &&
- (script = fopen_priv(output_path, "w")) == NULL)
- pg_fatal("could not open file \"%s\": %s",
- output_path, strerror(errno));
+ if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL)
+ pg_fatal("could not open file \"%s\": %s", output_path,
+ strerror(errno));
- fprintf(script, "The slot \"%s\" is invalid\n",
- slot->slotname);
+ fprintf(script, "The slot \"%s\" is invalid\n", slot->slotname);
continue;
}
@@ -1646,13 +1642,11 @@ check_old_cluster_for_valid_slots(bool live_check)
*/
if (!live_check && !slot->caught_up)
{
- if (script == NULL &&
- (script = fopen_priv(output_path, "w")) == NULL)
- pg_fatal("could not open file \"%s\": %s",
- output_path, strerror(errno));
+ if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL)
+ pg_fatal("could not open file \"%s\": %s", output_path,
+ strerror(errno));
- fprintf(script,
- "The slot \"%s\" has not consumed the WAL yet\n",
+ fprintf(script, "The slot \"%s\" has not consumed the WAL yet\n",
slot->slotname);
}
}
@@ -1663,11 +1657,14 @@ check_old_cluster_for_valid_slots(bool live_check)
fclose(script);
pg_log(PG_REPORT, "fatal");
- pg_fatal("Your installation contains logical replication slots that can't be upgraded.\n"
- "You can remove invalid slots and/or consume the pending WAL for other slots,\n"
+ pg_fatal("Your installation contains logical replication slots that can't "
+ "be upgraded.\n"
+ "You can remove invalid slots and/or consume the pending WAL for "
+ "other slots,\n"
"and then restart the upgrade.\n"
"A list of the problematic slots is in the file:\n"
- " %s", output_path);
+ " %s",
+ output_path);
}
check_ok();
@@ -1689,8 +1686,7 @@ check_old_cluster_subscription_state(void)
prep_status("Checking for subscription state");
- snprintf(output_path, sizeof(output_path), "%s/%s",
- log_opts.basedir,
+ snprintf(output_path, sizeof(output_path), "%s/%s", log_opts.basedir,
"subs_invalid.txt");
for (int dbnum = 0; dbnum < old_cluster.dbarr.ndbs; dbnum++)
{
@@ -1705,8 +1701,8 @@ check_old_cluster_subscription_state(void)
* Check that all the subscriptions have their respective
* replication origin.
*/
- res = executeQueryOrDie(conn,
- "SELECT d.datname, s.subname "
+ res = executeQueryOrDie(
+ conn, "SELECT d.datname, s.subname "
"FROM pg_catalog.pg_subscription s "
"LEFT OUTER JOIN pg_catalog.pg_replication_origin o "
" ON o.roname = 'pg_' || s.oid "
@@ -1718,11 +1714,12 @@ check_old_cluster_subscription_state(void)
for (int i = 0; i < ntup; i++)
{
if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL)
- pg_fatal("could not open file \"%s\": %s",
- output_path, strerror(errno));
- fprintf(script, "The replication origin is missing for database:\"%s\" subscription:\"%s\"\n",
- PQgetvalue(res, i, 0),
- PQgetvalue(res, i, 1));
+ pg_fatal("could not open file \"%s\": %s", output_path,
+ strerror(errno));
+ fprintf(script,
+ "The replication origin is missing for database:\"%s\" "
+ "subscription:\"%s\"\n",
+ PQgetvalue(res, i, 0), PQgetvalue(res, i, 1));
}
PQclear(res);
}
@@ -1755,8 +1752,8 @@ check_old_cluster_subscription_state(void)
* SUBREL_STATE_UNKNOWN: These states are not stored in the catalog,
* so we need not allow these states.
*/
- res = executeQueryOrDie(conn,
- "SELECT r.srsubstate, s.subname, n.nspname, c.relname "
+ res = executeQueryOrDie(
+ conn, "SELECT r.srsubstate, s.subname, n.nspname, c.relname "
"FROM pg_catalog.pg_subscription_rel r "
"LEFT JOIN pg_catalog.pg_subscription s"
" ON r.srsubid = s.oid "
@@ -1771,15 +1768,14 @@ check_old_cluster_subscription_state(void)
for (int i = 0; i < ntup; i++)
{
if (script == NULL && (script = fopen_priv(output_path, "w")) == NULL)
- pg_fatal("could not open file \"%s\": %s",
- output_path, strerror(errno));
-
- fprintf(script, "The table sync state \"%s\" is not allowed for database:\"%s\" subscription:\"%s\" schema:\"%s\" relation:\"%s\"\n",
- PQgetvalue(res, i, 0),
- active_db->db_name,
- PQgetvalue(res, i, 1),
- PQgetvalue(res, i, 2),
- PQgetvalue(res, i, 3));
+ pg_fatal("could not open file \"%s\": %s", output_path,
+ strerror(errno));
+
+ fprintf(script,
+ "The table sync state \"%s\" is not allowed for database:\"%s\" "
+ "subscription:\"%s\" schema:\"%s\" relation:\"%s\"\n",
+ PQgetvalue(res, i, 0), active_db->db_name, PQgetvalue(res, i, 1),
+ PQgetvalue(res, i, 2), PQgetvalue(res, i, 3));
}
PQclear(res);
@@ -1790,10 +1786,13 @@ check_old_cluster_subscription_state(void)
{
fclose(script);
pg_log(PG_REPORT, "fatal");
- pg_fatal("Your installation contains subscriptions without origin or having relations not in i (initialize) or r (ready) state.\n"
- "You can allow the initial sync to finish for all relations and then restart the upgrade.\n"
+ pg_fatal("Your installation contains subscriptions without origin or "
+ "having relations not in i (initialize) or r (ready) state.\n"
+ "You can allow the initial sync to finish for all relations and "
+ "then restart the upgrade.\n"
"A list of the problematic subscriptions is in the file:\n"
- " %s", output_path);
+ " %s",
+ output_path);
}
else
check_ok();
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index f91cc548ce..7272c7fdb7 100644
--- a/src/bin/pg_upgrade/file.c
+++ b/src/bin/pg_upgrade/file.c
@@ -37,6 +37,23 @@ cloneFile(const char *src, const char *dst, const char *schemaName,
pg_copyfile_offload(src, dst, action, PG_COPYFILE_IOCTL_FICLONE);
}
+/*
+ * copyFileByRange()
+ *
+ * Copies a relation file from src to dst.
+ * schemaName/relName are relation's SQL name (used for error messages only).
+ */
+void
+copyFileByRange(const char *src, const char *dst, const char *schemaName,
+ const char *relName)
+{
+ char action[1024];
+
+	snprintf(action, sizeof(action), "relation \"%s.%s\"", schemaName,
+			 relName);
+ pg_copyfile_offload(src, dst, action, PG_COPYFILE_COPY_FILE_RANGE);
+}
+
/*
* copyFile()
*
@@ -264,6 +281,23 @@ check_file_clone(void)
PG_COPYFILE_IOCTL_FICLONE);
}
+void
+check_copy_file_range(void)
+{
+ char existing_file[MAXPGPATH];
+ char new_link_file[MAXPGPATH];
+
+ snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION",
+ old_cluster.pgdata);
+ snprintf(new_link_file, sizeof(new_link_file),
+ "%s/PG_VERSION.copy_file_range_test", new_cluster.pgdata);
+ unlink(new_link_file); /* might fail */
+
+ /* will throw error in case it is not supported */
+ pg_copyfile_offload_supported(existing_file, new_link_file, NULL,
+ PG_COPYFILE_COPY_FILE_RANGE);
+}
+
void
check_hard_link(void)
{
@@ -278,7 +312,8 @@ check_hard_link(void)
if (link(existing_file, new_link_file) < 0)
pg_fatal(
- "could not create hard link between old and new data directories: %s\n"
+ "could not create hard link between old and new data directories: "
+ "%s\n"
"In link mode the old and new data directories must be on the same "
"file system.",
strerror(errno));
diff --git a/src/bin/pg_upgrade/option.c b/src/bin/pg_upgrade/option.c
index b9d900d0db..9e992c7bbf 100644
--- a/src/bin/pg_upgrade/option.c
+++ b/src/bin/pg_upgrade/option.c
@@ -20,16 +20,13 @@
#include "utils/pidfile.h"
static void usage(void);
-static void check_required_directory(char **dirpath,
- const char *envVarName, bool useCwd,
- const char *cmdLineOption, const char *description,
- bool missingOk);
+static void check_required_directory(char **dirpath, const char *envVarName,
+ bool useCwd, const char *cmdLineOption,
+ const char *description, bool missingOk);
#define FIX_DEFAULT_READ_ONLY "-c default_transaction_read_only=false"
-
UserOpts user_opts;
-
/*
* parseCommandLine()
*
@@ -59,9 +56,9 @@ parseCommandLine(int argc, char *argv[])
{"clone", no_argument, NULL, 1},
{"copy", no_argument, NULL, 2},
{"sync-method", required_argument, NULL, 3},
+ {"copy-file-range", no_argument, NULL, 3},
- {NULL, 0, NULL, 0}
- };
+ {NULL, 0, NULL, 0}};
int option; /* Command line option */
int optindex = 0; /* used by getopt_long */
int os_user_effective_id;
@@ -73,8 +70,10 @@ parseCommandLine(int argc, char *argv[])
os_info.progname = get_progname(argv[0]);
/* Process libpq env. variables; load values here for usage() output */
- old_cluster.port = getenv("PGPORTOLD") ? atoi(getenv("PGPORTOLD")) : DEF_PGUPORT;
- new_cluster.port = getenv("PGPORTNEW") ? atoi(getenv("PGPORTNEW")) : DEF_PGUPORT;
+ old_cluster.port =
+ getenv("PGPORTOLD") ? atoi(getenv("PGPORTOLD")) : DEF_PGUPORT;
+ new_cluster.port =
+ getenv("PGPORTNEW") ? atoi(getenv("PGPORTNEW")) : DEF_PGUPORT;
os_user_effective_id = get_user_info(&os_info.user);
/* we override just the database user name; we got the OS id above */
@@ -197,12 +196,13 @@ parseCommandLine(int argc, char *argv[])
case 1:
user_opts.transfer_mode = TRANSFER_MODE_CLONE;
break;
-
case 2:
user_opts.transfer_mode = TRANSFER_MODE_COPY;
break;
-
case 3:
+ user_opts.transfer_mode = TRANSFER_MODE_COPY_FILE_RANGE;
+ break;
+ case 4:
if (!parse_sync_method(optarg, &unused))
exit(1);
user_opts.sync_method = pg_strdup(optarg);
@@ -229,8 +229,8 @@ parseCommandLine(int argc, char *argv[])
/* Turn off read-only mode; add prefix to PGOPTIONS? */
if (getenv("PGOPTIONS"))
{
- char *pgoptions = psprintf("%s %s", FIX_DEFAULT_READ_ONLY,
- getenv("PGOPTIONS"));
+ char *pgoptions =
+ psprintf("%s %s", FIX_DEFAULT_READ_ONLY, getenv("PGOPTIONS"));
setenv("PGOPTIONS", pgoptions, 1);
pfree(pgoptions);
@@ -239,16 +239,16 @@ parseCommandLine(int argc, char *argv[])
setenv("PGOPTIONS", FIX_DEFAULT_READ_ONLY, 1);
/* Get values from env if not already set */
- check_required_directory(&old_cluster.bindir, "PGBINOLD", false,
- "-b", _("old cluster binaries reside"), false);
- check_required_directory(&new_cluster.bindir, "PGBINNEW", false,
- "-B", _("new cluster binaries reside"), true);
- check_required_directory(&old_cluster.pgdata, "PGDATAOLD", false,
- "-d", _("old cluster data resides"), false);
- check_required_directory(&new_cluster.pgdata, "PGDATANEW", false,
- "-D", _("new cluster data resides"), false);
- check_required_directory(&user_opts.socketdir, "PGSOCKETDIR", true,
- "-s", _("sockets will be created"), false);
+ check_required_directory(&old_cluster.bindir, "PGBINOLD", false, "-b",
+ _("old cluster binaries reside"), false);
+ check_required_directory(&new_cluster.bindir, "PGBINNEW", false, "-B",
+ _("new cluster binaries reside"), true);
+ check_required_directory(&old_cluster.pgdata, "PGDATAOLD", false, "-d",
+ _("old cluster data resides"), false);
+ check_required_directory(&new_cluster.pgdata, "PGDATANEW", false, "-D",
+ _("new cluster data resides"), false);
+ check_required_directory(&user_opts.socketdir, "PGSOCKETDIR", true, "-s",
+ _("sockets will be created"), false);
#ifdef WIN32
@@ -268,47 +268,73 @@ parseCommandLine(int argc, char *argv[])
pg_fatal("could not determine current directory");
canonicalize_path(cwd);
if (path_is_prefix_of_path(new_cluster_pgdata, cwd))
- pg_fatal("cannot run pg_upgrade from inside the new cluster data directory on Windows");
+ pg_fatal("cannot run pg_upgrade from inside the new cluster data "
+ "directory on Windows");
}
#endif
}
-
static void
usage(void)
{
- printf(_("pg_upgrade upgrades a PostgreSQL cluster to a different major version.\n\n"));
+ printf(_("pg_upgrade upgrades a PostgreSQL cluster to a different major "
+ "version.\n\n"));
printf(_("Usage:\n"));
printf(_(" pg_upgrade [OPTION]...\n\n"));
printf(_("Options:\n"));
- printf(_(" -b, --old-bindir=BINDIR old cluster executable directory\n"));
- printf(_(" -B, --new-bindir=BINDIR new cluster executable directory (default\n"
+ printf(
+ _(" -b, --old-bindir=BINDIR old cluster executable directory\n"));
+ printf(_(" -B, --new-bindir=BINDIR new cluster executable directory "
+ "(default\n"
" same directory as pg_upgrade)\n"));
- printf(_(" -c, --check check clusters only, don't change any data\n"));
+ printf(_(" -c, --check check clusters only, don't change "
+ "any data\n"));
printf(_(" -d, --old-datadir=DATADIR old cluster data directory\n"));
printf(_(" -D, --new-datadir=DATADIR new cluster data directory\n"));
- printf(_(" -j, --jobs=NUM number of simultaneous processes or threads to use\n"));
- printf(_(" -k, --link link instead of copying files to new cluster\n"));
- printf(_(" -N, --no-sync do not wait for changes to be written safely to disk\n"));
- printf(_(" -o, --old-options=OPTIONS old cluster options to pass to the server\n"));
- printf(_(" -O, --new-options=OPTIONS new cluster options to pass to the server\n"));
- printf(_(" -p, --old-port=PORT old cluster port number (default %d)\n"), old_cluster.port);
- printf(_(" -P, --new-port=PORT new cluster port number (default %d)\n"), new_cluster.port);
- printf(_(" -r, --retain retain SQL and log files after success\n"));
- printf(_(" -s, --socketdir=DIR socket directory to use (default current dir.)\n"));
- printf(_(" -U, --username=NAME cluster superuser (default \"%s\")\n"), os_info.user);
- printf(_(" -v, --verbose enable verbose internal logging\n"));
- printf(_(" -V, --version display version information, then exit\n"));
- printf(_(" --clone clone instead of copying files to new cluster\n"));
- printf(_(" --copy copy files to new cluster (default)\n"));
- printf(_(" --sync-method=METHOD set method for syncing files to disk\n"));
+ printf(_(" -j, --jobs=NUM number of simultaneous processes "
+ "or threads to use\n"));
+ printf(_(" -k, --link link instead of copying files to "
+ "new cluster\n"));
+ printf(_(" -N, --no-sync do not wait for changes to be "
+ "written safely to disk\n"));
+ printf(_(" -o, --old-options=OPTIONS old cluster options to pass to the "
+ "server\n"));
+ printf(_(" -O, --new-options=OPTIONS new cluster options to pass to the "
+ "server\n"));
+ printf(_(" -p, --old-port=PORT old cluster port number (default "
+ "%d)\n"),
+ old_cluster.port);
+ printf(_(" -P, --new-port=PORT new cluster port number (default "
+ "%d)\n"),
+ new_cluster.port);
+ printf(_(" -r, --retain retain SQL and log files after "
+ "success\n"));
+ printf(_(" -s, --socketdir=DIR socket directory to use (default "
+ "current dir.)\n"));
+ printf(
+ _(" -U, --username=NAME cluster superuser (default \"%s\")\n"),
+ os_info.user);
+ printf(
+ _(" -v, --verbose enable verbose internal logging\n"));
+ printf(_(" -V, --version display version information, then "
+ "exit\n"));
+ printf(_(" --clone clone instead of copying files to "
+ "new cluster\n"));
+ printf(_(
+ " --copy copy files to new cluster (default)\n"));
+ printf(_(" --copy-file-range copy files to new cluster with "
+ "copy_file_range()\n"));
+
+ printf(_(" --sync-method=METHOD set method for syncing files to "
+ "disk\n"));
printf(_(" -?, --help show this help, then exit\n"));
printf(_("\n"
"Before running pg_upgrade you must:\n"
" create a new database cluster (using the new version of initdb)\n"
" shutdown the postmaster servicing the old cluster\n"
" shutdown the postmaster servicing the new cluster\n"));
- printf(_("\n"
+ printf(
+ _("\n"
"When you run pg_upgrade, you must provide the following information:\n"
" the data directory for the old cluster (-d DATADIR)\n"
" the data directory for the new cluster (-D DATADIR)\n"
@@ -316,7 +342,8 @@ usage(void)
" the \"bin\" directory for the new version (-B BINDIR)\n"));
printf(_("\n"
"For example:\n"
- " pg_upgrade -d oldCluster/data -D newCluster/data -b oldCluster/bin -B newCluster/bin\n"
+ " pg_upgrade -d oldCluster/data -D newCluster/data -b "
+ "oldCluster/bin -B newCluster/bin\n"
"or\n"));
#ifndef WIN32
printf(_(" $ export PGDATAOLD=oldCluster/data\n"
@@ -335,25 +362,25 @@ usage(void)
printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL);
}
-
/*
* check_required_directory()
*
* Checks a directory option.
- * dirpath - the directory name supplied on the command line, or NULL
- * envVarName - the name of an environment variable to get if dirpath is NULL
- * useCwd - true if OK to default to CWD
+ * dirpath       - the directory name supplied on the command line, or NULL
+ * envVarName    - the name of an environment variable to get if dirpath
+ *                 is NULL
+ * useCwd        - true if OK to default to CWD
* cmdLineOption - the command line option for this directory
* description - a description of this directory option
- * missingOk - true if OK that both dirpath and envVarName are not existing
+ * missingOk     - true if OK that both dirpath and envVarName are not
+ *                 existing
*
* We use the last two arguments to construct a meaningful error message if the
* user hasn't provided the required directory name.
*/
static void
-check_required_directory(char **dirpath, const char *envVarName, bool useCwd,
- const char *cmdLineOption, const char *description,
- bool missingOk)
+check_required_directory(char **dirpath, const char *envVarName,
+ bool useCwd, const char *cmdLineOption,
+ const char *description, bool missingOk)
{
if (*dirpath == NULL || strlen(*dirpath) == 0)
{
@@ -373,7 +400,8 @@ check_required_directory(char **dirpath, const char *envVarName, bool useCwd,
return;
else
pg_fatal("You must identify the directory where the %s.\n"
- "Please use the %s command-line option or the %s environment variable.",
+ "Please use the %s command-line option or the %s environment "
+ "variable.",
description, cmdLineOption, envVarName);
}
@@ -440,13 +468,12 @@ adjust_data_dir(ClusterInfo *cluster)
if ((output = popen(cmd, "r")) == NULL ||
fgets(cmd_output, sizeof(cmd_output), output) == NULL)
- pg_fatal("could not get data directory using %s: %s",
- cmd, strerror(errno));
+ pg_fatal("could not get data directory using %s: %s", cmd, strerror(errno));
rc = pclose(output);
if (rc != 0)
- pg_fatal("could not get data directory using %s: %s",
- cmd, wait_result_to_str(rc));
+ pg_fatal("could not get data directory using %s: %s", cmd,
+ wait_result_to_str(rc));
/* strip trailing newline and carriage return */
(void) pg_strip_crlf(cmd_output);
@@ -456,7 +483,6 @@ adjust_data_dir(ClusterInfo *cluster)
check_ok();
}
-
/*
* get_sock_dir
*
@@ -483,19 +509,17 @@ get_sock_dir(ClusterInfo *cluster, bool live_check)
FILE *fp;
int lineno;
- snprintf(filename, sizeof(filename), "%s/postmaster.pid",
- cluster->pgdata);
+ snprintf(filename, sizeof(filename), "%s/postmaster.pid", cluster->pgdata);
if ((fp = fopen(filename, "r")) == NULL)
- pg_fatal("could not open file \"%s\": %s",
- filename, strerror(errno));
+ pg_fatal("could not open file \"%s\": %s", filename, strerror(errno));
for (lineno = 1;
lineno <= Max(LOCK_FILE_LINE_PORT, LOCK_FILE_LINE_SOCKET_DIR);
lineno++)
{
if (fgets(line, sizeof(line), fp) == NULL)
- pg_fatal("could not read line %d from file \"%s\": %s",
- lineno, filename, strerror(errno));
+ pg_fatal("could not read line %d from file \"%s\": %s", lineno,
+ filename, strerror(errno));
/* potentially overwrite user-supplied value */
if (lineno == LOCK_FILE_LINE_PORT)
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index d63f13fffc..f993d7255e 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -5,10 +5,10 @@
* src/bin/pg_upgrade/pg_upgrade.h
*/
-#include <unistd.h>
#include <assert.h>
#include <sys/stat.h>
#include <sys/time.h>
+#include <unistd.h>
#include "common/relpath.h"
#include "libpq-fe.h"
@@ -17,18 +17,18 @@
#undef pg_fatal
/* Use port in the private/dynamic port number range */
-#define DEF_PGUPORT 50432
+#define DEF_PGUPORT 50432
-#define MAX_STRING 1024
-#define QUERY_ALLOC 8192
+#define MAX_STRING 1024
+#define QUERY_ALLOC 8192
-#define MESSAGE_WIDTH 62
+#define MESSAGE_WIDTH 62
-#define GET_MAJOR_VERSION(v) ((v) / 100)
+#define GET_MAJOR_VERSION(v) ((v) / 100)
/* contains both global db information and CREATE DATABASE commands */
-#define GLOBALS_DUMP_FILE "pg_upgrade_dump_globals.sql"
-#define DB_DUMP_FILE_MASK "pg_upgrade_dump_%u.custom"
+#define GLOBALS_DUMP_FILE "pg_upgrade_dump_globals.sql"
+#define DB_DUMP_FILE_MASK "pg_upgrade_dump_%u.custom"
/*
* Base directories that include all the files generated internally, from the
@@ -36,14 +36,14 @@
* BASE_OUTPUTDIR/$timestamp/{LOG_OUTPUTDIR,DUMP_OUTPUTDIR} to ensure their
* uniqueness in each run.
*/
-#define BASE_OUTPUTDIR "pg_upgrade_output.d"
-#define LOG_OUTPUTDIR "log"
-#define DUMP_OUTPUTDIR "dump"
+#define BASE_OUTPUTDIR "pg_upgrade_output.d"
+#define LOG_OUTPUTDIR "log"
+#define DUMP_OUTPUTDIR "dump"
-#define DB_DUMP_LOG_FILE_MASK "pg_upgrade_dump_%u.log"
-#define SERVER_LOG_FILE "pg_upgrade_server.log"
-#define UTILITY_LOG_FILE "pg_upgrade_utility.log"
-#define INTERNAL_LOG_FILE "pg_upgrade_internal.log"
+#define DB_DUMP_LOG_FILE_MASK "pg_upgrade_dump_%u.log"
+#define SERVER_LOG_FILE "pg_upgrade_server.log"
+#define UTILITY_LOG_FILE "pg_upgrade_utility.log"
+#define INTERNAL_LOG_FILE "pg_upgrade_internal.log"
extern char *output_files[];
@@ -64,44 +64,42 @@ extern char *output_files[];
* the error message appropriately.
*/
#ifndef WIN32
-#define SERVER_START_LOG_FILE SERVER_LOG_FILE
-#define SERVER_STOP_LOG_FILE SERVER_LOG_FILE
+#define SERVER_START_LOG_FILE SERVER_LOG_FILE
+#define SERVER_STOP_LOG_FILE SERVER_LOG_FILE
#else
-#define SERVER_START_LOG_FILE "pg_upgrade_server_start.log"
+#define SERVER_START_LOG_FILE "pg_upgrade_server_start.log"
/*
* "pg_ctl start" keeps SERVER_START_LOG_FILE and SERVER_LOG_FILE open
* while the server is running, so we use UTILITY_LOG_FILE for "pg_ctl
* stop".
*/
-#define SERVER_STOP_LOG_FILE UTILITY_LOG_FILE
+#define SERVER_STOP_LOG_FILE UTILITY_LOG_FILE
#endif
-
#ifndef WIN32
-#define pg_mv_file rename
-#define PATH_SEPARATOR '/'
-#define PATH_QUOTE '\''
-#define RM_CMD "rm -f"
-#define RMDIR_CMD "rm -rf"
-#define SCRIPT_PREFIX "./"
-#define SCRIPT_EXT "sh"
-#define ECHO_QUOTE "'"
-#define ECHO_BLANK ""
+#define pg_mv_file rename
+#define PATH_SEPARATOR '/'
+#define PATH_QUOTE '\''
+#define RM_CMD "rm -f"
+#define RMDIR_CMD "rm -rf"
+#define SCRIPT_PREFIX "./"
+#define SCRIPT_EXT "sh"
+#define ECHO_QUOTE "'"
+#define ECHO_BLANK ""
#else
-#define pg_mv_file pgrename
-#define PATH_SEPARATOR '\\'
-#define PATH_QUOTE '"'
+#define pg_mv_file pgrename
+#define PATH_SEPARATOR '\\'
+#define PATH_QUOTE '"'
/* @ prefix disables command echo in .bat files */
-#define RM_CMD "@DEL /q"
-#define RMDIR_CMD "@RMDIR /s/q"
-#define SCRIPT_PREFIX ""
-#define SCRIPT_EXT "bat"
-#define EXE_EXT ".exe"
-#define ECHO_QUOTE ""
-#define ECHO_BLANK "."
+#define RM_CMD "@DEL /q"
+#define RMDIR_CMD "@RMDIR /s/q"
+#define SCRIPT_PREFIX ""
+#define SCRIPT_EXT "bat"
+#define EXE_EXT ".exe"
+#define ECHO_QUOTE ""
+#define ECHO_BLANK "."
#endif
-
/*
* The format of visibility map was changed with this 9.6 commit.
*/
@@ -126,7 +124,6 @@ extern char *output_files[];
*/
#define JSONB_FORMAT_CHANGE_CAT_VER 201409291
-
/*
* Each relation is represented by a relinfo structure.
*/
@@ -255,6 +252,7 @@ typedef enum
TRANSFER_MODE_CLONE,
TRANSFER_MODE_COPY,
TRANSFER_MODE_LINK,
+ TRANSFER_MODE_COPY_FILE_RANGE,
} transferMode;
/*
@@ -270,7 +268,6 @@ typedef enum
PG_FATAL,
} eLogType;
-
/*
* cluster
*
@@ -295,10 +292,9 @@ typedef struct
const char *tablespace_suffix; /* directory specification */
} ClusterInfo;
-
/*
* LogOpts
-*/
+ */
typedef struct
{
FILE *internal; /* internal log FILE */
@@ -312,10 +308,9 @@ typedef struct
bool isatty; /* is stdout a tty */
} LogOpts;
-
/*
* UserOpts
-*/
+ */
typedef struct
{
bool check; /* true -> ask user for permission to make
@@ -348,7 +343,6 @@ typedef struct
ClusterInfo *running_cluster;
} OSInfo;
-
/*
* Global variables
*/
@@ -358,7 +352,6 @@ extern ClusterInfo old_cluster,
new_cluster;
extern OSInfo os_info;
-
/* check.c */
void output_check_banner(bool live_check);
@@ -371,44 +364,45 @@ void check_cluster_versions(void);
void check_cluster_compatibility(bool live_check);
void create_script_for_old_cluster_deletion(char **deletion_script_file_name);
-
/* controldata.c */
void get_control_data(ClusterInfo *cluster, bool live_check);
void check_control_data(ControlData *oldctrl, ControlData *newctrl);
void disable_old_cluster(void);
-
/* dump.c */
void generate_old_dump(void);
-
/* exec.c */
-#define EXEC_PSQL_ARGS "--echo-queries --set ON_ERROR_STOP=on --no-psqlrc --dbname=template1"
+#define EXEC_PSQL_ARGS \
+ "--echo-queries --set ON_ERROR_STOP=on --no-psqlrc --dbname=template1"
bool exec_prog(const char *log_filename, const char *opt_log_file,
- bool report_error, bool exit_on_error, const char *fmt,...) pg_attribute_printf(5, 6);
+ bool report_error, bool exit_on_error, const char *fmt,...)
+ pg_attribute_printf(5, 6);
void verify_directories(void);
bool pid_lock_file_exists(const char *datadir);
-
/* file.c */
-void cloneFile(const char *src, const char *dst,
- const char *schemaName, const char *relName);
-void copyFile(const char *src, const char *dst,
- const char *schemaName, const char *relName);
-void linkFile(const char *src, const char *dst,
- const char *schemaName, const char *relName);
+void cloneFile(const char *src, const char *dst, const char *schemaName,
+ const char *relName);
+void copyFile(const char *src, const char *dst, const char *schemaName,
+ const char *relName);
+void copyFileByRange(const char *src, const char *dst, const char *schemaName,
+ const char *relName);
+void linkFile(const char *src, const char *dst, const char *schemaName,
+ const char *relName);
void rewriteVisibilityMap(const char *fromfile, const char *tofile,
const char *schemaName, const char *relName);
void check_file_clone(void);
+void check_copy_file_range(void);
void check_hard_link(void);
/* fopen_priv() is no longer different from fopen() */
-#define fopen_priv(path, mode) fopen(path, mode)
+#define fopen_priv(path, mode) fopen(path, mode)
/* function.c */
@@ -417,9 +411,8 @@ void check_loadable_libraries(void);
/* info.c */
-FileNameMap *gen_db_file_maps(DbInfo *old_db,
- DbInfo *new_db, int *nmaps, const char *old_pgdata,
- const char *new_pgdata);
+FileNameMap *gen_db_file_maps(DbInfo *old_db, DbInfo *new_db, int *nmaps,
+ const char *old_pgdata, const char *new_pgdata);
void get_db_rel_and_slot_infos(ClusterInfo *cluster, bool live_check);
int count_old_cluster_logical_slots(void);
int count_old_cluster_subscriptions(void);
@@ -432,21 +425,21 @@ void get_sock_dir(ClusterInfo *cluster, bool live_check);
/* relfilenumber.c */
-void transfer_all_new_tablespaces(DbInfoArr *old_db_arr,
- DbInfoArr *new_db_arr, char *old_pgdata, char *new_pgdata);
-void transfer_all_new_dbs(DbInfoArr *old_db_arr,
- DbInfoArr *new_db_arr, char *old_pgdata, char *new_pgdata,
+void transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr,
+ char *old_pgdata, char *new_pgdata);
+void transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr,
+ char *old_pgdata, char *new_pgdata,
char *old_tablespace);
/* tablespace.c */
void init_tablespaces(void);
-
/* server.c */
PGconn *connectToServer(ClusterInfo *cluster, const char *db_name);
-PGresult *executeQueryOrDie(PGconn *conn, const char *fmt,...) pg_attribute_printf(2, 3);
+PGresult *executeQueryOrDie(PGconn *conn, const char *fmt,...)
+ pg_attribute_printf(2, 3);
char *cluster_conn_opts(ClusterInfo *cluster);
@@ -455,34 +448,31 @@ void stop_postmaster(bool in_atexit);
uint32 get_major_server_version(ClusterInfo *cluster);
void check_pghost_envvar(void);
-
/* util.c */
char *quote_identifier(const char *s);
int get_user_info(char **user_name_p);
void check_ok(void);
-void report_status(eLogType type, const char *fmt,...) pg_attribute_printf(2, 3);
+void report_status(eLogType type, const char *fmt,...)
+ pg_attribute_printf(2, 3);
void pg_log(eLogType type, const char *fmt,...) pg_attribute_printf(2, 3);
-void pg_fatal(const char *fmt,...) pg_attribute_printf(1, 2) pg_attribute_noreturn();
+void pg_fatal(const char *fmt,...) pg_attribute_printf(1, 2)
+ pg_attribute_noreturn();
void end_progress_output(void);
void cleanup_output_dirs(void);
void prep_status(const char *fmt,...) pg_attribute_printf(1, 2);
void prep_status_progress(const char *fmt,...) pg_attribute_printf(1, 2);
unsigned int str2uint(const char *str);
-
/* version.c */
-bool check_for_data_types_usage(ClusterInfo *cluster,
- const char *base_query,
+bool check_for_data_types_usage(ClusterInfo *cluster, const char *base_query,
const char *output_path);
-bool check_for_data_type_usage(ClusterInfo *cluster,
- const char *type_name,
+bool check_for_data_type_usage(ClusterInfo *cluster, const char *type_name,
const char *output_path);
void old_9_3_check_for_line_data_type_usage(ClusterInfo *cluster);
void old_9_6_check_for_unknown_data_type_usage(ClusterInfo *cluster);
-void old_9_6_invalidate_hash_indexes(ClusterInfo *cluster,
- bool check_mode);
+void old_9_6_invalidate_hash_indexes(ClusterInfo *cluster, bool check_mode);
void old_11_check_for_sql_identifier_data_type_usage(ClusterInfo *cluster);
void report_extension_updates(ClusterInfo *cluster);
--
2.30.2
v3-0004-Add-clone-and-copy-file-range-copy-strategies-to-.patchapplication/octet-stream; name=v3-0004-Add-clone-and-copy-file-range-copy-strategies-to-.patchDownload
From 7925d17874d2f173899caff7894d348560674069 Mon Sep 17 00:00:00 2001
From: Jakub Wartak <jakub.wartak@enterprisedb.com>
Date: Fri, 5 Jan 2024 11:15:24 +0100
Subject: [PATCH v3 4/4] Add --clone and --copy-file-range copy strategies to
pg_combinebackup using pg_copyfile_offload().
Discussion: https://www.postgresql.org/message-id/flat/CA%2BhUKGJvLLNQtzb%3DZWcTsYF8kv8cR_%3DH17CX-eL8qNixeC4DAw%40mail.gmail.com#ce606227e39df74c6b2abf80b8eab04a
---
src/bin/pg_combinebackup/copy_file.c | 64 +-----
src/bin/pg_combinebackup/copy_file.h | 5 +-
src/bin/pg_combinebackup/pg_combinebackup.c | 230 ++++++++++----------
src/bin/pg_combinebackup/reconstruct.c | 111 ++++------
src/bin/pg_combinebackup/reconstruct.h | 22 +-
5 files changed, 184 insertions(+), 248 deletions(-)
diff --git a/src/bin/pg_combinebackup/copy_file.c b/src/bin/pg_combinebackup/copy_file.c
index 40a55e3087..448bfcd642 100644
--- a/src/bin/pg_combinebackup/copy_file.c
+++ b/src/bin/pg_combinebackup/copy_file.c
@@ -10,14 +10,12 @@
*/
#include "postgres_fe.h"
-#ifdef HAVE_COPYFILE_H
-#include <copyfile.h>
-#endif
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
#include "common/file_perm.h"
+#include "common/file_utils.h"
#include "common/logging.h"
#include "copy_file.h"
@@ -35,7 +33,8 @@ static void copy_file_copyfile(const char *src, const char *dst);
*/
void
copy_file(const char *src, const char *dst,
- pg_checksum_context *checksum_ctx, bool dry_run)
+ pg_checksum_context *checksum_ctx, bool dry_run,
+ CopyFileMethod copy_strategy)
{
/*
* In dry-run mode, we don't actually copy anything, nor do we read any
@@ -49,6 +48,8 @@ copy_file(const char *src, const char *dst,
pg_fatal("could not open \"%s\": %m", src);
if (close(fd) < 0)
pg_fatal("could not close \"%s\": %m", src);
+
+ return;
}
/*
@@ -56,56 +57,13 @@ copy_file(const char *src, const char *dst,
* operating system primitives that we know about to copy the file; this
* may be quicker than a naive block copy.
*/
- if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
- {
- char *strategy_name = NULL;
- void (*strategy_implementation) (const char *, const char *) = NULL;
-
-#ifdef WIN32
- strategy_name = "CopyFile";
- strategy_implementation = copy_file_copyfile;
-#endif
-
- if (strategy_name != NULL)
- {
- if (dry_run)
- pg_log_debug("would copy \"%s\" to \"%s\" using strategy %s",
- src, dst, strategy_name);
- else
- {
- pg_log_debug("copying \"%s\" to \"%s\" using strategy %s",
- src, dst, strategy_name);
- (*strategy_implementation) (src, dst);
- }
- return;
- }
- }
-
- /*
- * Fall back to the simple approach of reading and writing all the blocks,
- * feeding them into the checksum context as we go.
- */
- if (dry_run)
- {
- if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
- pg_log_debug("would copy \"%s\" to \"%s\"",
- src, dst);
- else
- pg_log_debug("would copy \"%s\" to \"%s\" and checksum with %s",
- src, dst, pg_checksum_type_name(checksum_ctx->type));
- }
+ if (checksum_ctx->type == CHECKSUM_TYPE_NONE && copy_strategy != 0)
+ pg_copyfile_offload(src, dst, NULL, copy_strategy);
else
- {
- if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
- pg_log_debug("copying \"%s\" to \"%s\"",
- src, dst);
- else
- pg_log_debug("copying \"%s\" to \"%s\" and checksumming with %s",
- src, dst, pg_checksum_type_name(checksum_ctx->type));
- copy_file_blocks(src, dst, checksum_ctx);
- }
+ pg_copyfile(src, dst, NULL, checksum_ctx);
}
+#if 0
/*
* Copy a file block by block, and optionally compute a checksum as we go.
*/
@@ -138,7 +96,8 @@ copy_file_blocks(const char *src, const char *dst,
if (wb < 0)
pg_fatal("could not write file \"%s\": %m", dst);
else
- pg_fatal("could not write file \"%s\": wrote only %d of %d bytes at offset %u",
+ pg_fatal("could not write file \"%s\": wrote only %d of %d bytes at "
+ "offset %u",
dst, (int) wb, (int) rb, offset);
}
@@ -167,3 +126,4 @@ copy_file_copyfile(const char *src, const char *dst)
}
}
#endif /* WIN32 */
+#endif
diff --git a/src/bin/pg_combinebackup/copy_file.h b/src/bin/pg_combinebackup/copy_file.h
index 031030bacb..cafaa0bc9c 100644
--- a/src/bin/pg_combinebackup/copy_file.h
+++ b/src/bin/pg_combinebackup/copy_file.h
@@ -11,9 +11,12 @@
#ifndef COPY_FILE_H
#define COPY_FILE_H
+#include "c.h"
#include "common/checksum_helper.h"
+#include "common/file_utils.h"
extern void copy_file(const char *src, const char *dst,
- pg_checksum_context *checksum_ctx, bool dry_run);
+ pg_checksum_context *checksum_ctx, bool dry_run,
+ CopyFileMethod copy_strategy);
#endif /* COPY_FILE_H */
diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c
index cef4941d84..64dbc6c5fe 100644
--- a/src/bin/pg_combinebackup/pg_combinebackup.c
+++ b/src/bin/pg_combinebackup/pg_combinebackup.c
@@ -25,15 +25,15 @@
#include "common/logging.h"
#include "copy_file.h"
#include "fe_utils/option_utils.h"
+#include "getopt_long.h"
#include "lib/stringinfo.h"
#include "load_manifest.h"
-#include "getopt_long.h"
#include "reconstruct.h"
#include "write_manifest.h"
/* Incremental file naming convention. */
-#define INCREMENTAL_PREFIX "INCREMENTAL."
-#define INCREMENTAL_PREFIX_LENGTH (sizeof(INCREMENTAL_PREFIX) - 1)
+#define INCREMENTAL_PREFIX "INCREMENTAL."
+#define INCREMENTAL_PREFIX_LENGTH (sizeof(INCREMENTAL_PREFIX) - 1)
/*
* Tracking for directories that need to be removed, or have their contents
@@ -69,6 +69,7 @@ typedef struct cb_options
pg_checksum_type manifest_checksums;
bool no_manifest;
DataDirSyncMethod sync_method;
+ CopyFileMethod copy_method;
} cb_options;
/*
@@ -98,15 +99,10 @@ static void cleanup_directories_atexit(void);
static void create_output_directory(char *dirname, cb_options *opt);
static void help(const char *progname);
static bool parse_oid(char *s, Oid *result);
-static void process_directory_recursively(Oid tsoid,
- char *input_directory,
- char *output_directory,
- char *relative_path,
- int n_prior_backups,
- char **prior_backup_dirs,
- manifest_data **manifests,
- manifest_writer *mwriter,
- cb_options *opt);
+static void process_directory_recursively(
+ Oid tsoid, char *input_directory, char *output_directory,
+ char *relative_path, int n_prior_backups, char **prior_backup_dirs,
+ manifest_data **manifests, manifest_writer *mwriter, cb_options *opt);
static int read_pg_version_file(char *directory);
static void remember_to_cleanup_directory(char *target_path, bool rmtopdir);
static void reset_directory_cleanup_list(void);
@@ -129,8 +125,9 @@ main(int argc, char *argv[])
{"manifest-checksums", required_argument, NULL, 1},
{"no-manifest", no_argument, NULL, 2},
{"sync-method", required_argument, NULL, 3},
- {NULL, 0, NULL, 0}
- };
+ {"clone", no_argument, NULL, 4},
+ {"copy-file-range", no_argument, NULL, 5},
+ {NULL, 0, NULL, 0}};
const char *progname;
char *last_input_dir;
@@ -154,10 +151,11 @@ main(int argc, char *argv[])
memset(&opt, 0, sizeof(opt));
opt.manifest_checksums = CHECKSUM_TYPE_CRC32C;
opt.sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
+ opt.copy_method = 0;
/* process command-line options */
- while ((c = getopt_long(argc, argv, "dnNPo:T:",
- long_options, &optindex)) != -1)
+ while ((c = getopt_long(argc, argv, "dnNPo:T:", long_options, &optindex)) !=
+ -1)
{
switch (c)
{
@@ -178,10 +176,8 @@ main(int argc, char *argv[])
add_tablespace_mapping(&opt, optarg);
break;
case 1:
- if (!pg_checksum_parse_type(optarg,
- &opt.manifest_checksums))
- pg_fatal("unrecognized checksum algorithm: \"%s\"",
- optarg);
+ if (!pg_checksum_parse_type(optarg, &opt.manifest_checksums))
+ pg_fatal("unrecognized checksum algorithm: \"%s\"", optarg);
break;
case 2:
opt.no_manifest = true;
@@ -190,6 +186,12 @@ main(int argc, char *argv[])
if (!parse_sync_method(optarg, &opt.sync_method))
exit(1);
break;
+ case 4:
+ opt.copy_method = PG_COPYFILE_IOCTL_FICLONE;
+ break;
+ case 5:
+ opt.copy_method = PG_COPYFILE_COPY_FILE_RANGE;
+ break;
default:
/* getopt_long already emitted a complaint */
pg_log_error_hint("Try \"%s --help\" for more information.", progname);
@@ -211,6 +213,14 @@ main(int argc, char *argv[])
if (opt.no_manifest)
opt.manifest_checksums = CHECKSUM_TYPE_NONE;
+ /*
+ * We cannot use file copy/clone offload when checksums need to be
+ * calculated, since the offloaded copy bypasses our read path.
+ */
+ if (opt.copy_method != 0 && opt.manifest_checksums != CHECKSUM_TYPE_NONE)
+ pg_fatal("unable to use accelerated copy when manifest checksums "
+ "are to be calculated. Use --no-manifest");
+
/* Read the server version from the final backup. */
version = read_pg_version_file(argv[argc - 1]);
@@ -263,7 +273,8 @@ main(int argc, char *argv[])
* won't have the WAL ranges for the resulting manifest.
*/
if (manifests[n_prior_backups] == NULL)
- pg_fatal("can't generate a manifest because no manifest is available for the final input backup");
+ pg_fatal("can't generate a manifest because no manifest is available for "
+ "the final input backup");
}
else
mwriter = NULL;
@@ -275,15 +286,15 @@ main(int argc, char *argv[])
{
pg_log_debug("generating \"%s/backup_label\"", opt.output);
last_backup_label->cursor = 0;
- write_backup_label(opt.output, last_backup_label,
- opt.manifest_checksums, mwriter);
+ write_backup_label(opt.output, last_backup_label, opt.manifest_checksums,
+ mwriter);
}
/* Process everything that's not part of a user-defined tablespace. */
pg_log_debug("processing backup directory \"%s\"", last_input_dir);
- process_directory_recursively(InvalidOid, last_input_dir, opt.output,
- NULL, n_prior_backups, prior_backup_dirs,
- manifests, mwriter, &opt);
+ process_directory_recursively(InvalidOid, last_input_dir, opt.output, NULL,
+ n_prior_backups, prior_backup_dirs, manifests,
+ mwriter, &opt);
/* Process user-defined tablespaces. */
for (ts = tablespaces; ts != NULL; ts = ts->next)
@@ -299,16 +310,15 @@ main(int argc, char *argv[])
{
char linkpath[MAXPGPATH];
- snprintf(linkpath, MAXPGPATH, "%s/pg_tblspc/%u", opt.output,
- ts->oid);
+ snprintf(linkpath, MAXPGPATH, "%s/pg_tblspc/%u", opt.output, ts->oid);
if (opt.dry_run)
pg_log_debug("would create symbolic link from \"%s\" to \"%s\"",
linkpath, ts->new_dir);
else
{
- pg_log_debug("creating symbolic link from \"%s\" to \"%s\"",
- linkpath, ts->new_dir);
+ pg_log_debug("creating symbolic link from \"%s\" to \"%s\"", linkpath,
+ ts->new_dir);
if (symlink(ts->new_dir, linkpath) != 0)
pg_fatal("could not create symbolic link from \"%s\" to \"%s\": %m",
linkpath, ts->new_dir);
@@ -322,21 +332,19 @@ main(int argc, char *argv[])
{
pg_log_debug("creating directory \"%s\"", ts->new_dir);
if (pg_mkdir_p(ts->new_dir, pg_dir_create_mode) == -1)
- pg_fatal("could not create directory \"%s\": %m",
- ts->new_dir);
+ pg_fatal("could not create directory \"%s\": %m", ts->new_dir);
}
}
/* OK, now handle the directory contents. */
- process_directory_recursively(ts->oid, ts->old_dir, ts->new_dir,
- NULL, n_prior_backups, prior_backup_dirs,
- manifests, mwriter, &opt);
+ process_directory_recursively(ts->oid, ts->old_dir, ts->new_dir, NULL,
+ n_prior_backups, prior_backup_dirs, manifests,
+ mwriter, &opt);
}
/* Finalize the backup_manifest, if we're generating one. */
if (mwriter != NULL)
- finalize_manifest(mwriter,
- manifests[n_prior_backups]->first_wal_range);
+ finalize_manifest(mwriter, manifests[n_prior_backups]->first_wal_range);
/* fsync that output directory unless we've been told not to do so */
if (!opt.no_sync)
@@ -392,7 +400,9 @@ add_tablespace_mapping(cb_options *opt, char *arg)
*dst_ptr++ = *arg_ptr;
}
if (!tsmap->old_dir[0] || !tsmap->new_dir[0])
- pg_fatal("invalid tablespace mapping format \"%s\", must be \"OLDDIR=NEWDIR\"", arg);
+ pg_fatal(
+ "invalid tablespace mapping format \"%s\", must be \"OLDDIR=NEWDIR\"",
+ arg);
/*
* All tablespaces are created with absolute directories, so specifying a
@@ -464,8 +474,8 @@ check_backup_label_files(int n_backups, char **backup_dirs)
pg_fatal("could not close \"%s\": %m", pathbuf);
/* Parse the file contents. */
- parse_backup_label(pathbuf, buf, &start_tli, &start_lsn,
- &previous_tli, &previous_lsn);
+ parse_backup_label(pathbuf, buf, &start_tli, &start_lsn, &previous_tli,
+ &previous_lsn);
/*
* Sanity checks.
@@ -476,18 +486,19 @@ check_backup_label_files(int n_backups, char **backup_dirs)
* we don't have that information.
*/
if (i > 0 && previous_tli == 0)
- pg_fatal("backup at \"%s\" is a full backup, but only the first backup should be a full backup",
+ pg_fatal("backup at \"%s\" is a full backup, but only the first backup "
+ "should be a full backup",
backup_dirs[i]);
if (i == 0 && previous_tli != 0)
- pg_fatal("backup at \"%s\" is an incremental backup, but the first backup should be a full backup",
+ pg_fatal("backup at \"%s\" is an incremental backup, but the first "
+ "backup should be a full backup",
backup_dirs[i]);
if (i < n_backups - 1 && start_tli != check_tli)
pg_fatal("backup at \"%s\" starts on timeline %u, but expected %u",
backup_dirs[i], start_tli, check_tli);
if (i < n_backups - 1 && start_lsn != check_lsn)
pg_fatal("backup at \"%s\" starts at LSN %X/%X, but expected %X/%X",
- backup_dirs[i],
- LSN_FORMAT_ARGS(start_lsn),
+ backup_dirs[i], LSN_FORMAT_ARGS(start_lsn),
LSN_FORMAT_ARGS(check_lsn));
check_tli = previous_tli;
check_lsn = previous_lsn;
@@ -542,8 +553,7 @@ check_control_files(int n_backups, char **backup_dirs)
/* Can't interpret control file if not current version. */
if (control_file->pg_control_version != PG_CONTROL_VERSION)
- pg_fatal("%s: unexpected control file version",
- controlpath);
+ pg_fatal("%s: unexpected control file version", controlpath);
/* System identifiers should all match. */
if (i == n_backups - 1)
@@ -667,14 +677,23 @@ help(const char *progname)
printf(_("\nOptions:\n"));
printf(_(" -d, --debug generate lots of debugging output\n"));
printf(_(" -n, --dry-run don't actually do anything\n"));
- printf(_(" -N, --no-sync do not wait for changes to be written safely to disk\n"));
+ printf(_(" -N, --no-sync do not wait for changes to be written "
+ "safely to disk\n"));
printf(_(" -o, --output output directory\n"));
- printf(_(" -T, --tablespace-mapping=OLDDIR=NEWDIR\n"
+ printf(_(
+ " -T, --tablespace-mapping=OLDDIR=NEWDIR\n"
" relocate tablespace in OLDDIR to NEWDIR\n"));
- printf(_(" --manifest-checksums=SHA{224,256,384,512}|CRC32C|NONE\n"
+ printf(
+ _(" --manifest-checksums=SHA{224,256,384,512}|CRC32C|NONE\n"
" use algorithm for manifest checksums\n"));
- printf(_(" --no-manifest suppress generation of backup manifest\n"));
- printf(_(" --sync-method=METHOD set method for syncing files to disk\n"));
+ printf(_(
+ " --no-manifest suppress generation of backup manifest\n"));
+ printf(
+ _(" --sync-method=METHOD set method for syncing files to disk\n"));
+ printf(_(" --clone clone (reflink) instead of copying "
+ "files\n"));
+ printf(
+ _(" --copy-file-range copy using copy_file_range() syscall\n"));
printf(_(" -?, --help show this help, then exit\n"));
printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
@@ -721,15 +740,10 @@ parse_oid(char *s, Oid *result)
* the locations of those previous backups.
*/
static void
-process_directory_recursively(Oid tsoid,
- char *input_directory,
- char *output_directory,
- char *relative_path,
- int n_prior_backups,
- char **prior_backup_dirs,
- manifest_data **manifests,
- manifest_writer *mwriter,
- cb_options *opt)
+process_directory_recursively(
+ Oid tsoid, char *input_directory, char *output_directory,
+ char *relative_path, int n_prior_backups, char **prior_backup_dirs,
+ manifest_data **manifests, manifest_writer *mwriter, cb_options *opt)
{
char ifulldir[MAXPGPATH];
char ofulldir[MAXPGPATH];
@@ -782,13 +796,11 @@ process_directory_recursively(Oid tsoid,
}
else
{
- snprintf(ifulldir, MAXPGPATH, "%s/%s", input_directory,
- relative_path);
- snprintf(ofulldir, MAXPGPATH, "%s/%s", output_directory,
- relative_path);
+ snprintf(ifulldir, MAXPGPATH, "%s/%s", input_directory, relative_path);
+ snprintf(ofulldir, MAXPGPATH, "%s/%s", output_directory, relative_path);
if (OidIsValid(tsoid))
- snprintf(manifest_prefix, MAXPGPATH, "pg_tblspc/%u/%s/",
- tsoid, relative_path);
+ snprintf(manifest_prefix, MAXPGPATH, "pg_tblspc/%u/%s/", tsoid,
+ relative_path);
else
snprintf(manifest_prefix, MAXPGPATH, "%s/", relative_path);
}
@@ -824,8 +836,7 @@ process_directory_recursively(Oid tsoid,
pg_checksum_context checksum_ctx;
/* Ignore "." and ".." entries. */
- if (strcmp(de->d_name, ".") == 0 ||
- strcmp(de->d_name, "..") == 0)
+ if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
continue;
/* Construct input path. */
@@ -861,11 +872,9 @@ process_directory_recursively(Oid tsoid,
de->d_name);
/* And recurse. */
- process_directory_recursively(tsoid,
- input_directory, output_directory,
- new_relative_path,
- n_prior_backups, prior_backup_dirs,
- manifests, mwriter, opt);
+ process_directory_recursively(tsoid, input_directory, output_directory,
+ new_relative_path, n_prior_backups,
+ prior_backup_dirs, manifests, mwriter, opt);
continue;
}
@@ -883,46 +892,37 @@ process_directory_recursively(Oid tsoid,
* Skip the backup_label and backup_manifest files; they require
* special handling and are handled elsewhere.
*/
- if (relative_path == NULL &&
- (strcmp(de->d_name, "backup_label") == 0 ||
- strcmp(de->d_name, "backup_manifest") == 0))
+ if (relative_path == NULL && (strcmp(de->d_name, "backup_label") == 0 ||
+ strcmp(de->d_name, "backup_manifest") == 0))
continue;
/*
* If it's an incremental file, hand it off to the reconstruction
* code, which will figure out what to do.
*/
- if (strncmp(de->d_name, INCREMENTAL_PREFIX,
- INCREMENTAL_PREFIX_LENGTH) == 0)
+ if (strncmp(de->d_name, INCREMENTAL_PREFIX, INCREMENTAL_PREFIX_LENGTH) ==
+ 0)
{
/* Output path should not include "INCREMENTAL." prefix. */
snprintf(ofullpath, MAXPGPATH, "%s/%s", ofulldir,
de->d_name + INCREMENTAL_PREFIX_LENGTH);
-
/* Manifest path likewise omits incremental prefix. */
snprintf(manifest_path, MAXPGPATH, "%s%s", manifest_prefix,
de->d_name + INCREMENTAL_PREFIX_LENGTH);
/* Reconstruction logic will do the rest. */
- reconstruct_from_incremental_file(ifullpath, ofullpath,
- relative_path,
- de->d_name + INCREMENTAL_PREFIX_LENGTH,
- n_prior_backups,
- prior_backup_dirs,
- manifests,
- manifest_path,
- checksum_type,
- &checksum_length,
- &checksum_payload,
- opt->debug,
- opt->dry_run);
+ reconstruct_from_incremental_file(
+ ifullpath, ofullpath, relative_path,
+ de->d_name + INCREMENTAL_PREFIX_LENGTH, n_prior_backups,
+ prior_backup_dirs, manifests, manifest_path, checksum_type,
+ &checksum_length, &checksum_payload, opt->debug, opt->dry_run,
+ opt->copy_method);
}
else
{
/* Construct the path that the backup_manifest will use. */
- snprintf(manifest_path, MAXPGPATH, "%s%s", manifest_prefix,
- de->d_name);
+ snprintf(manifest_path, MAXPGPATH, "%s%s", manifest_prefix, de->d_name);
/*
* It's not an incremental file, so we need to copy the entire
@@ -932,13 +932,11 @@ process_directory_recursively(Oid tsoid,
* backup_manifest for the final input directory, we can save some
* work by reusing that checksum instead of computing a new one.
*/
- if (checksum_type != CHECKSUM_TYPE_NONE &&
- latest_manifest != NULL)
+ if (checksum_type != CHECKSUM_TYPE_NONE && latest_manifest != NULL)
{
manifest_file *mfile;
- mfile = manifest_files_lookup(latest_manifest->files,
- manifest_path);
+ mfile = manifest_files_lookup(latest_manifest->files, manifest_path);
if (mfile == NULL)
{
char *bmpath;
@@ -947,10 +945,9 @@ process_directory_recursively(Oid tsoid,
* The directory is out of sync with the backup_manifest,
* so emit a warning.
*/
- bmpath = psprintf("%s/%s", input_directory,
- "backup_manifest");
- pg_log_warning("\"%s\" contains no entry for \"%s\"",
- bmpath, manifest_path);
+ bmpath = psprintf("%s/%s", input_directory, "backup_manifest");
+ pg_log_warning("\"%s\" contains no entry for \"%s\"", bmpath,
+ manifest_path);
pfree(bmpath);
}
else if (mfile->checksum_type == checksum_type)
@@ -972,7 +969,8 @@ process_directory_recursively(Oid tsoid,
/* Actually copy the file. */
snprintf(ofullpath, MAXPGPATH, "%s/%s", ofulldir, de->d_name);
- copy_file(ifullpath, ofullpath, &checksum_ctx, opt->dry_run);
+ copy_file(ifullpath, ofullpath, &checksum_ctx, opt->dry_run,
+ opt->copy_method);
/*
* If copy_file() performed a checksum calculation for us, then
@@ -982,8 +980,7 @@ process_directory_recursively(Oid tsoid,
if (checksum_ctx.type != CHECKSUM_TYPE_NONE && !opt->dry_run)
{
checksum_payload = pg_malloc(PG_CHECKSUM_MAX_LENGTH);
- checksum_length = pg_checksum_final(&checksum_ctx,
- checksum_payload);
+ checksum_length = pg_checksum_final(&checksum_ctx, checksum_payload);
}
}
@@ -1009,10 +1006,8 @@ process_directory_recursively(Oid tsoid,
pg_fatal("could not stat file \"%s\": %m", ofullpath);
/* OK, now do the work. */
- add_file_to_manifest(mwriter, manifest_path,
- sb.st_size, sb.st_mtime,
- checksum_type, checksum_length,
- checksum_payload);
+ add_file_to_manifest(mwriter, manifest_path, sb.st_size, sb.st_mtime,
+ checksum_type, checksum_length, checksum_payload);
}
/* Avoid leaking memory. */
@@ -1120,7 +1115,8 @@ reset_directory_cleanup_list(void)
* final backup in the backup chain.
*/
static cb_tablespace *
-scan_for_existing_tablespaces(char *pathname, cb_options *opt)
+scan_for_existing_tablespaces(char *pathname,
+ cb_options *opt)
{
char pg_tblspc[MAXPGPATH];
DIR *dir;
@@ -1153,7 +1149,8 @@ scan_for_existing_tablespaces(char *pathname, cb_options *opt)
/* Ignore any file name that doesn't look like a proper OID. */
if (!parse_oid(de->d_name, &oid))
{
- pg_log_debug("skipping \"%s\" because the filename is not a legal tablespace OID",
+ pg_log_debug(
+ "skipping \"%s\" because the filename is not a legal tablespace OID",
tblspcdir);
continue;
}
@@ -1164,7 +1161,8 @@ scan_for_existing_tablespaces(char *pathname, cb_options *opt)
exit(1);
if (type != PGFILETYPE_LNK && type != PGFILETYPE_DIR)
{
- pg_log_debug("skipping \"%s\" because it is neither a symbolic link nor a directory",
+ pg_log_debug("skipping \"%s\" because it is neither a symbolic link nor "
+ "a directory",
tblspcdir);
continue;
}
@@ -1184,8 +1182,7 @@ scan_for_existing_tablespaces(char *pathname, cb_options *opt)
/* Read the link target. */
link_length = readlink(tblspcdir, link_target, sizeof(link_target));
if (link_length < 0)
- pg_fatal("could not read symbolic link \"%s\": %m",
- tblspcdir);
+ pg_fatal("could not read symbolic link \"%s\": %m", tblspcdir);
if (link_length >= sizeof(link_target))
pg_fatal("symbolic link \"%s\" is too long", tblspcdir);
link_target[link_length] = '\0';
@@ -1212,8 +1209,7 @@ scan_for_existing_tablespaces(char *pathname, cb_options *opt)
/* Every non-in-place tablespace must be mapped. */
if (tsmap == NULL)
- pg_fatal("tablespace at \"%s\" has no tablespace mapping",
- link_target);
+ pg_fatal("tablespace at \"%s\" has no tablespace mapping", link_target);
}
else
{
@@ -1274,8 +1270,8 @@ slurp_file(int fd, char *filename, StringInfo buf, int maxlen)
if (rb < 0)
pg_fatal("could not read file \"%s\": %m", filename);
else
- pg_fatal("could not read file \"%s\": read only %d of %d bytes",
- filename, (int) rb, (int) st.st_size);
+ pg_fatal("could not read file \"%s\": read only %d of %d bytes", filename,
+ (int) rb, (int) st.st_size);
}
/* Adjust buffer length for new data and restore trailing-\0 invariant */
diff --git a/src/bin/pg_combinebackup/reconstruct.c b/src/bin/pg_combinebackup/reconstruct.c
index 874e6cd150..3f66eb9ad6 100644
--- a/src/bin/pg_combinebackup/reconstruct.c
+++ b/src/bin/pg_combinebackup/reconstruct.c
@@ -15,8 +15,8 @@
#include <unistd.h>
#include "backup/basebackup_incremental.h"
-#include "common/logging.h"
#include "common/file_perm.h"
+#include "common/logging.h"
#include "copy_file.h"
#include "lib/stringinfo.h"
#include "reconstruct.h"
@@ -46,20 +46,16 @@ typedef struct rfile
off_t highest_offset_read;
} rfile;
-static void debug_reconstruction(int n_source,
- rfile **sources,
- bool dry_run);
+static void debug_reconstruction(int n_source, rfile **sources, bool dry_run);
static unsigned find_reconstructed_block_length(rfile *s);
static rfile *make_incremental_rfile(char *filename);
static rfile *make_rfile(char *filename, bool missing_ok);
static void write_reconstructed_file(char *input_filename,
char *output_filename,
- unsigned block_length,
- rfile **sourcemap,
+ unsigned block_length, rfile **sourcemap,
off_t *offsetmap,
pg_checksum_context *checksum_ctx,
- bool debug,
- bool dry_run);
+ bool debug, bool dry_run);
static void read_bytes(rfile *rf, void *buffer, unsigned length);
/*
@@ -78,19 +74,13 @@ static void read_bytes(rfile *rf, void *buffer, unsigned length);
* an array of pathnames where those backups can be found.
*/
void
-reconstruct_from_incremental_file(char *input_filename,
- char *output_filename,
- char *relative_path,
- char *bare_file_name,
- int n_prior_backups,
- char **prior_backup_dirs,
- manifest_data **manifests,
- char *manifest_path,
- pg_checksum_type checksum_type,
- int *checksum_length,
- uint8 **checksum_payload,
- bool debug,
- bool dry_run)
+reconstruct_from_incremental_file(
+ char *input_filename, char *output_filename, char *relative_path,
+ char *bare_file_name, int n_prior_backups, char **prior_backup_dirs,
+ manifest_data **manifests, char *manifest_path,
+ pg_checksum_type checksum_type, int *checksum_length,
+ uint8 **checksum_payload, bool debug, bool dry_run,
+ CopyFileMethod copy_method)
{
rfile **source;
rfile *latest_source = NULL;
@@ -167,8 +157,8 @@ reconstruct_from_incremental_file(char *input_filename,
* Look for the full file in the previous backup. If not found, then
* look for an incremental file instead.
*/
- snprintf(source_filename, MAXPGPATH, "%s/%s/%s",
- prior_backup_dirs[sidx], relative_path, bare_file_name);
+ snprintf(source_filename, MAXPGPATH, "%s/%s/%s", prior_backup_dirs[sidx],
+ relative_path, bare_file_name);
if ((s = make_rfile(source_filename, true)) == NULL)
{
snprintf(source_filename, MAXPGPATH, "%s/%s/INCREMENTAL.%s",
@@ -231,8 +221,7 @@ reconstruct_from_incremental_file(char *input_filename,
{
uint64 expected_length;
- expected_length =
- (uint64) latest_source->truncation_block_length;
+ expected_length = (uint64) latest_source->truncation_block_length;
expected_length *= BLCKSZ;
if (expected_length == sb.st_size)
{
@@ -253,8 +242,7 @@ reconstruct_from_incremental_file(char *input_filename,
{
BlockNumber b = s->relative_block_numbers[i];
- if (b < latest_source->truncation_block_length &&
- sourcemap[b] == NULL)
+ if (b < latest_source->truncation_block_length && sourcemap[b] == NULL)
{
sourcemap[b] = s;
offsetmap[b] = s->header_length + (i * BLCKSZ);
@@ -283,16 +271,16 @@ reconstruct_from_incremental_file(char *input_filename,
manifest_path);
if (mfile == NULL)
{
- char *path = psprintf("%s/backup_manifest",
- prior_backup_dirs[copy_source_index]);
+ char *path =
+ psprintf("%s/backup_manifest", prior_backup_dirs[copy_source_index]);
/*
* The directory is out of sync with the backup_manifest, so emit
* a warning.
*/
- /*- translator: the first %s is a backup manifest file, the second is a file absent therein */
- pg_log_warning("\"%s\" contains no entry for \"%s\"",
- path,
+ /*- translator: the first %s is a backup manifest file, the second is a
+ * file absent therein */
+ pg_log_warning("\"%s\" contains no entry for \"%s\"", path,
manifest_path);
pfree(path);
}
@@ -300,8 +288,7 @@ reconstruct_from_incremental_file(char *input_filename,
{
*checksum_length = mfile->checksum_length;
*checksum_payload = pg_malloc(*checksum_length);
- memcpy(*checksum_payload, mfile->checksum_payload,
- *checksum_length);
+ memcpy(*checksum_payload, mfile->checksum_payload, *checksum_length);
checksum_type = CHECKSUM_TYPE_NONE;
}
}
@@ -318,13 +305,13 @@ reconstruct_from_incremental_file(char *input_filename,
* Otherwise, reconstruct.
*/
if (copy_source != NULL)
- copy_file(copy_source->filename, output_filename,
- &checksum_ctx, dry_run);
+ copy_file(copy_source->filename, output_filename, &checksum_ctx, dry_run,
+ copy_method);
else
{
- write_reconstructed_file(input_filename, output_filename,
- block_length, sourcemap, offsetmap,
- &checksum_ctx, debug, dry_run);
+ write_reconstructed_file(input_filename, output_filename, block_length,
+ sourcemap, offsetmap, &checksum_ctx, debug,
+ dry_run);
debug_reconstruction(n_prior_backups + 1, source, dry_run);
}
@@ -332,8 +319,7 @@ reconstruct_from_incremental_file(char *input_filename,
if (checksum_type != CHECKSUM_TYPE_NONE)
{
*checksum_payload = pg_malloc(PG_CHECKSUM_MAX_LENGTH);
- *checksum_length = pg_checksum_final(&checksum_ctx,
- *checksum_payload);
+ *checksum_length = pg_checksum_final(&checksum_ctx, *checksum_payload);
}
/*
@@ -378,11 +364,11 @@ debug_reconstruction(int n_source, rfile **sources, bool dry_run)
/* Debug logging. */
if (dry_run)
- pg_log_debug("would have read %u blocks from \"%s\"",
- s->num_blocks_read, s->filename);
+ pg_log_debug("would have read %u blocks from \"%s\"", s->num_blocks_read,
+ s->filename);
else
- pg_log_debug("read %u blocks from \"%s\"",
- s->num_blocks_read, s->filename);
+ pg_log_debug("read %u blocks from \"%s\"", s->num_blocks_read,
+ s->filename);
/*
* In dry-run mode, we don't actually try to read data from the file,
@@ -401,8 +387,7 @@ debug_reconstruction(int n_source, rfile **sources, bool dry_run)
pg_fatal("could not stat \"%s\": %m", s->filename);
if (sb.st_size < s->highest_offset_read)
pg_fatal("file \"%s\" is too short: expected %llu, found %llu",
- s->filename,
- (unsigned long long) s->highest_offset_read,
+ s->filename, (unsigned long long) s->highest_offset_read,
(unsigned long long) sb.st_size);
}
}
@@ -455,7 +440,8 @@ make_incremental_rfile(char *filename)
read_bytes(rf, &rf->truncation_block_length,
sizeof(rf->truncation_block_length));
if (rf->truncation_block_length > RELSEG_SIZE)
- pg_fatal("file \"%s\" has truncation block length %u in excess of segment size %u",
+ pg_fatal("file \"%s\" has truncation block length %u in excess of segment "
+ "size %u",
filename, rf->truncation_block_length, RELSEG_SIZE);
/* Read block numbers if there are any. */
@@ -522,12 +508,10 @@ read_bytes(rfile *rf, void *buffer, unsigned length)
static void
write_reconstructed_file(char *input_filename,
char *output_filename,
- unsigned block_length,
- rfile **sourcemap,
+ unsigned block_length, rfile **sourcemap,
off_t *offsetmap,
pg_checksum_context *checksum_ctx,
- bool debug,
- bool dry_run)
+ bool debug, bool dry_run)
{
int wfd = -1;
unsigned i;
@@ -570,14 +554,13 @@ write_reconstructed_file(char *input_filename,
if (current_block == start_of_range)
appendStringInfo(&debug_buf, " %u:zero", current_block);
else
- appendStringInfo(&debug_buf, " %u-%u:zero",
- start_of_range, current_block);
+ appendStringInfo(&debug_buf, " %u-%u:zero", start_of_range,
+ current_block);
}
else
{
if (current_block == start_of_range)
- appendStringInfo(&debug_buf, " %u:%s@" UINT64_FORMAT,
- current_block,
+ appendStringInfo(&debug_buf, " %u:%s@" UINT64_FORMAT, current_block,
s == NULL ? "ZERO" : s->filename,
(uint64) offsetmap[current_block]);
else
@@ -604,8 +587,7 @@ write_reconstructed_file(char *input_filename,
/* Open the output file, except in dry_run mode. */
if (!dry_run &&
- (wfd = open(output_filename,
- O_RDWR | PG_BINARY | O_CREAT | O_EXCL,
+ (wfd = open(output_filename, O_RDWR | PG_BINARY | O_CREAT | O_EXCL,
pg_file_create_mode)) < 0)
pg_fatal("could not open file \"%s\": %m", output_filename);
@@ -622,8 +604,8 @@ write_reconstructed_file(char *input_filename,
else
{
s->num_blocks_read++;
- s->highest_offset_read = Max(s->highest_offset_read,
- offsetmap[i] + BLCKSZ);
+ s->highest_offset_read =
+ Max(s->highest_offset_read, offsetmap[i] + BLCKSZ);
}
/* Skip the rest of this in dry-run mode. */
@@ -650,9 +632,9 @@ write_reconstructed_file(char *input_filename,
if (rb < 0)
pg_fatal("could not read file \"%s\": %m", s->filename);
else
- pg_fatal("could not read file \"%s\": read only %d of %d bytes at offset %llu",
- s->filename, rb, BLCKSZ,
- (unsigned long long) offsetmap[i]);
+ pg_fatal("could not read file \"%s\": read only %d of %d bytes at "
+ "offset %llu",
+ s->filename, rb, BLCKSZ, (unsigned long long) offsetmap[i]);
}
}
@@ -668,8 +650,7 @@ write_reconstructed_file(char *input_filename,
/* Update the checksum computation. */
if (pg_checksum_update(checksum_ctx, buffer, BLCKSZ) < 0)
- pg_fatal("could not update checksum of file \"%s\"",
- output_filename);
+ pg_fatal("could not update checksum of file \"%s\"", output_filename);
}
/* Debugging output. */
diff --git a/src/bin/pg_combinebackup/reconstruct.h b/src/bin/pg_combinebackup/reconstruct.h
index d689aeb5c2..6623ca932e 100644
--- a/src/bin/pg_combinebackup/reconstruct.h
+++ b/src/bin/pg_combinebackup/reconstruct.h
@@ -13,21 +13,17 @@
#ifndef RECONSTRUCT_H
#define RECONSTRUCT_H
+#include "c.h"
#include "common/checksum_helper.h"
+#include "common/file_utils.h"
#include "load_manifest.h"
-extern void reconstruct_from_incremental_file(char *input_filename,
- char *output_filename,
- char *relative_path,
- char *bare_file_name,
- int n_prior_backups,
- char **prior_backup_dirs,
- manifest_data **manifests,
- char *manifest_path,
- pg_checksum_type checksum_type,
- int *checksum_length,
- uint8 **checksum_payload,
- bool debug,
- bool dry_run);
+extern void reconstruct_from_incremental_file(
+ char *input_filename, char *output_filename, char *relative_path,
+ char *bare_file_name, int n_prior_backups, char **prior_backup_dirs,
+ manifest_data **manifests, char *manifest_path,
+ pg_checksum_type checksum_type, int *checksum_length,
+ uint8 **checksum_payload, bool debug, bool dry_run,
+ CopyFileMethod copy_method);
#endif
--
2.30.2
On 05.01.24 13:40, Jakub Wartak wrote:
Random patch notes:
- the main meat is in v3-0002*; I hope I did not screw up something seriously
- in the worst case: it is opt-in through a switch, so the user can always
stick to the classic copy
- no docs so far
- pg_copyfile_offload_supported() should actually be fixed if it is a
good path forward
- pgindent actually indents larger areas of code than I would like;
any ideas, or is it OK?
- not tested on Win32/MacOS/FreeBSD
- I've tested pg_upgrade manually and it seems to work and issue the
correct syscalls; however, some tests are failing(?). I haven't
investigated why yet due to lack of time.
Something is wrong with the pgindent in your patch set. Maybe you used
a wrong version. You should try to fix that, because it is hard to
process your patch with that amount of unrelated reformatting.
As far as I can tell, the original pg_upgrade patch has been ready to
commit since October. Unless Thomas has any qualms that have not been
made explicit in this thread, I suggest we move ahead with that.
And then Jakub could rebase his patch set on top of that. It looks like
if the formatting issues are fixed, the remaining pg_combinebackup
support isn't that big.
On Wed, Mar 6, 2024 at 2:43 AM Peter Eisentraut <peter@eisentraut.org> wrote:
As far as I can tell, the original pg_upgrade patch has been ready to
commit since October. Unless Thomas has any qualms that have not been
made explicit in this thread, I suggest we move ahead with that.
pg_upgrade --copy-file-range pushed. The only change I made was to
remove the EINTR retry condition which was subtly wrong and actually
not needed here AFAICS. (Erm, maybe I did have an unexpressed qualm
about some bug reports unfolding around that time about corruption
linked to copy_file_range that might have spooked me but those seem to
have been addressed.)
And then Jakub could rebase his patch set on top of that. It looks like
if the formatting issues are fixed, the remaining pg_combinebackup
support isn't that big.
+1
I'll also go and rebase CREATE DATABASE ... STRATEGY=file_clone [1].
[1]: /messages/by-id/CA+hUKGLM+t+SwBU-cHeMUXJCOgBxSHLGZutV5zCwY4qrCcE02w@mail.gmail.com
Hi,
I took a quick look at the remaining part adding copy_file_range to
pg_combinebackup. The patch no longer applies, so I had to rebase it.
Most of the issues were trivial, but I had to fix a couple missing
prototypes - I added them to copy_file.h/c, mostly.
0001 is the minimal rebase + those fixes
0002 has a couple review comments in copy_file, and it also undoes a lot
of unnecessary formatting changes (already pointed out by Peter a couple
days ago).
A couple review comments:
1) AFAIK opt_errinfo() returns a pointer to the local "buf" variable.
2) I wonder if we even need opt_errinfo(). I'm not sure it actually
makes anything simpler.
3) I think it'd be nice to make CopyFileMethod more consistent with
transferMode in pg_upgrade.h (I mean, it seems wise to make the naming
more consistent, it's probably not worth unifying this somehow).
4) I wonder how we came up with copying the files by 50 blocks, but I
now realize it's been like this since before this patch. I only noticed
because the patch adds a comment before the buffer_size calculation.
5) I dislike the renaming of copy_file_blocks to pg_copyfile. The new
name is way more generic / less descriptive - it's clear it copies the
file block by block (well, in chunks). pg_copyfile is pretty vague.
6) This leaves behind copy_file_copyfile, which is now unused.
7) The patch reworks how combinebackup deals with alternative copy
implementations - instead of setting strategy_implementation and calling
that, the decisions now happen in pg_copyfile_offload with a lot of
conditions / ifdef / defined ... I find it pretty hard to understand and
reason about. I liked the strategy_implementation approach, as it forces
us to keep each method in a separate function.
Perhaps there's a reason why that doesn't work for copy_file_range? But
in that case this needs much clearer comments.
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
Attachments:
v20240319-0002-review-and-cleanup.patchtext/x-patch; charset=UTF-8; name=v20240319-0002-review-and-cleanup.patchDownload
From 39f42eee4c6f50d106672afe108294ee59082500 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Tue, 19 Mar 2024 15:34:18 +0100
Subject: [PATCH v20240319 2/2] review and cleanup
---
src/bin/pg_combinebackup/copy_file.c | 3 +
src/bin/pg_combinebackup/copy_file.h | 1 +
src/bin/pg_combinebackup/pg_combinebackup.c | 197 +++++++++++---------
src/bin/pg_combinebackup/reconstruct.c | 105 ++++++-----
src/bin/pg_combinebackup/reconstruct.h | 19 +-
5 files changed, 190 insertions(+), 135 deletions(-)
diff --git a/src/bin/pg_combinebackup/copy_file.c b/src/bin/pg_combinebackup/copy_file.c
index 16e26b4f573..f45670dd47c 100644
--- a/src/bin/pg_combinebackup/copy_file.c
+++ b/src/bin/pg_combinebackup/copy_file.c
@@ -77,6 +77,8 @@ opt_errinfo(const char *addon_errmsg)
return "";
strcpy(buf, " ");
+
+ /* XXX isn't this broken? this returns pointer to local variable */
return strncat(buf, addon_errmsg, sizeof(buf) - 2);
}
@@ -93,6 +95,7 @@ pg_copyfile(const char *src, const char *dest, const char *addon_errmsg,
int dest_fd;
uint8 *buffer;
+ /* XXX where does the 50 blocks come from? larger/smaller? */
/* copy in fairly large chunks for best efficiency */
const int buffer_size = 50 * BLCKSZ;
diff --git a/src/bin/pg_combinebackup/copy_file.h b/src/bin/pg_combinebackup/copy_file.h
index 2797a340055..f4d0ac47d0e 100644
--- a/src/bin/pg_combinebackup/copy_file.h
+++ b/src/bin/pg_combinebackup/copy_file.h
@@ -15,6 +15,7 @@
#include "common/checksum_helper.h"
#include "common/file_utils.h"
+/* XXX do we even want this? how does pg_upgrade to this? */
typedef enum CopyFileMethod
{
PG_COPYFILE_FALLBACK = 0x1,
diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c
index 1455360d81c..8fa7827c563 100644
--- a/src/bin/pg_combinebackup/pg_combinebackup.c
+++ b/src/bin/pg_combinebackup/pg_combinebackup.c
@@ -99,10 +99,15 @@ static void cleanup_directories_atexit(void);
static void create_output_directory(char *dirname, cb_options *opt);
static void help(const char *progname);
static bool parse_oid(char *s, Oid *result);
-static void process_directory_recursively(
- Oid tsoid, char *input_directory, char *output_directory,
- char *relative_path, int n_prior_backups, char **prior_backup_dirs,
- manifest_data **manifests, manifest_writer *mwriter, cb_options *opt);
+static void process_directory_recursively(Oid tsoid,
+ char *input_directory,
+ char *output_directory,
+ char *relative_path,
+ int n_prior_backups,
+ char **prior_backup_dirs,
+ manifest_data **manifests,
+ manifest_writer *mwriter,
+ cb_options *opt);
static int read_pg_version_file(char *directory);
static void remember_to_cleanup_directory(char *target_path, bool rmtopdir);
static void reset_directory_cleanup_list(void);
@@ -156,8 +161,8 @@ main(int argc, char *argv[])
opt.copy_method = 0;
/* process command-line options */
- while ((c = getopt_long(argc, argv, "dnNPo:T:", long_options, &optindex)) !=
- -1)
+ while ((c = getopt_long(argc, argv, "dnNPo:T:",
+ long_options, &optindex)) != -1)
{
switch (c)
{
@@ -178,8 +183,10 @@ main(int argc, char *argv[])
add_tablespace_mapping(&opt, optarg);
break;
case 1:
- if (!pg_checksum_parse_type(optarg, &opt.manifest_checksums))
- pg_fatal("unrecognized checksum algorithm: \"%s\"", optarg);
+ if (!pg_checksum_parse_type(optarg,
+ &opt.manifest_checksums))
+ pg_fatal("unrecognized checksum algorithm: \"%s\"",
+ optarg);
break;
case 2:
opt.no_manifest = true;
@@ -295,8 +302,7 @@ main(int argc, char *argv[])
* won't have the WAL ranges for the resulting manifest.
*/
if (manifests[n_prior_backups] == NULL)
- pg_fatal("can't generate a manifest because no manifest is available for "
- "the final input backup");
+ pg_fatal("can't generate a manifest because no manifest is available for the final input backup");
}
else
mwriter = NULL;
@@ -308,15 +314,15 @@ main(int argc, char *argv[])
{
pg_log_debug("generating \"%s/backup_label\"", opt.output);
last_backup_label->cursor = 0;
- write_backup_label(opt.output, last_backup_label, opt.manifest_checksums,
- mwriter);
+ write_backup_label(opt.output, last_backup_label,
+ opt.manifest_checksums, mwriter);
}
/* Process everything that's not part of a user-defined tablespace. */
pg_log_debug("processing backup directory \"%s\"", last_input_dir);
- process_directory_recursively(InvalidOid, last_input_dir, opt.output, NULL,
- n_prior_backups, prior_backup_dirs, manifests,
- mwriter, &opt);
+ process_directory_recursively(InvalidOid, last_input_dir, opt.output,
+ NULL, n_prior_backups, prior_backup_dirs,
+ manifests, mwriter, &opt);
/* Process user-defined tablespaces. */
for (ts = tablespaces; ts != NULL; ts = ts->next)
@@ -332,15 +338,16 @@ main(int argc, char *argv[])
{
char linkpath[MAXPGPATH];
- snprintf(linkpath, MAXPGPATH, "%s/pg_tblspc/%u", opt.output, ts->oid);
+ snprintf(linkpath, MAXPGPATH, "%s/pg_tblspc/%u", opt.output,
+ ts->oid);
if (opt.dry_run)
pg_log_debug("would create symbolic link from \"%s\" to \"%s\"",
linkpath, ts->new_dir);
else
{
- pg_log_debug("creating symbolic link from \"%s\" to \"%s\"", linkpath,
- ts->new_dir);
+ pg_log_debug("creating symbolic link from \"%s\" to \"%s\"",
+ linkpath, ts->new_dir);
if (symlink(ts->new_dir, linkpath) != 0)
pg_fatal("could not create symbolic link from \"%s\" to \"%s\": %m",
linkpath, ts->new_dir);
@@ -354,19 +361,21 @@ main(int argc, char *argv[])
{
pg_log_debug("creating directory \"%s\"", ts->new_dir);
if (pg_mkdir_p(ts->new_dir, pg_dir_create_mode) == -1)
- pg_fatal("could not create directory \"%s\": %m", ts->new_dir);
+ pg_fatal("could not create directory \"%s\": %m",
+ ts->new_dir);
}
}
/* OK, now handle the directory contents. */
- process_directory_recursively(ts->oid, ts->old_dir, ts->new_dir, NULL,
- n_prior_backups, prior_backup_dirs, manifests,
- mwriter, &opt);
+ process_directory_recursively(ts->oid, ts->old_dir, ts->new_dir,
+ NULL, n_prior_backups, prior_backup_dirs,
+ manifests, mwriter, &opt);
}
/* Finalize the backup_manifest, if we're generating one. */
if (mwriter != NULL)
- finalize_manifest(mwriter, manifests[n_prior_backups]->first_wal_range);
+ finalize_manifest(mwriter,
+ manifests[n_prior_backups]->first_wal_range);
/* fsync that output directory unless we've been told not to do so */
if (!opt.no_sync)
@@ -422,9 +431,7 @@ add_tablespace_mapping(cb_options *opt, char *arg)
*dst_ptr++ = *arg_ptr;
}
if (!tsmap->old_dir[0] || !tsmap->new_dir[0])
- pg_fatal(
- "invalid tablespace mapping format \"%s\", must be \"OLDDIR=NEWDIR\"",
- arg);
+ pg_fatal("invalid tablespace mapping format \"%s\", must be \"OLDDIR=NEWDIR\"", arg);
/*
* All tablespaces are created with absolute directories, so specifying a
@@ -496,8 +503,8 @@ check_backup_label_files(int n_backups, char **backup_dirs)
pg_fatal("could not close \"%s\": %m", pathbuf);
/* Parse the file contents. */
- parse_backup_label(pathbuf, buf, &start_tli, &start_lsn, &previous_tli,
- &previous_lsn);
+ parse_backup_label(pathbuf, buf, &start_tli, &start_lsn,
+ &previous_tli, &previous_lsn);
/*
* Sanity checks.
@@ -508,19 +515,18 @@ check_backup_label_files(int n_backups, char **backup_dirs)
* we don't have that information.
*/
if (i > 0 && previous_tli == 0)
- pg_fatal("backup at \"%s\" is a full backup, but only the first backup "
- "should be a full backup",
+ pg_fatal("backup at \"%s\" is a full backup, but only the first backup should be a full backup",
backup_dirs[i]);
if (i == 0 && previous_tli != 0)
- pg_fatal("backup at \"%s\" is an incremental backup, but the first "
- "backup should be a full backup",
+ pg_fatal("backup at \"%s\" is an incremental backup, but the first backup should be a full backup",
backup_dirs[i]);
if (i < n_backups - 1 && start_tli != check_tli)
pg_fatal("backup at \"%s\" starts on timeline %u, but expected %u",
backup_dirs[i], start_tli, check_tli);
if (i < n_backups - 1 && start_lsn != check_lsn)
pg_fatal("backup at \"%s\" starts at LSN %X/%X, but expected %X/%X",
- backup_dirs[i], LSN_FORMAT_ARGS(start_lsn),
+ backup_dirs[i],
+ LSN_FORMAT_ARGS(start_lsn),
LSN_FORMAT_ARGS(check_lsn));
check_tli = previous_tli;
check_lsn = previous_lsn;
@@ -572,7 +578,8 @@ check_control_files(int n_backups, char **backup_dirs)
/* Can't interpret control file if not current version. */
if (control_file->pg_control_version != PG_CONTROL_VERSION)
- pg_fatal("%s: unexpected control file version", controlpath);
+ pg_fatal("%s: unexpected control file version",
+ controlpath);
/* System identifiers should all match. */
if (i == n_backups - 1)
@@ -698,23 +705,16 @@ help(const char *progname)
printf(_("\nOptions:\n"));
printf(_(" -d, --debug generate lots of debugging output\n"));
printf(_(" -n, --dry-run don't actually do anything\n"));
- printf(_(" -N, --no-sync do not wait for changes to be written "
- "safely to disk\n"));
+ printf(_(" -N, --no-sync do not wait for changes to be written safely to disk\n"));
printf(_(" -o, --output output directory\n"));
- printf(_(
- " -T, --tablespace-mapping=OLDDIR=NEWDIR\n"
+ printf(_(" -T, --tablespace-mapping=OLDDIR=NEWDIR\n"
" relocate tablespace in OLDDIR to NEWDIR\n"));
- printf(
- _(" --manifest-checksums=SHA{224,256,384,512}|CRC32C|NONE\n"
+ printf(_(" --manifest-checksums=SHA{224,256,384,512}|CRC32C|NONE\n"
" use algorithm for manifest checksums\n"));
- printf(_(
- " --no-manifest suppress generation of backup manifest\n"));
- printf(
- _(" --sync-method=METHOD set method for syncing files to disk\n"));
- printf(_(" --clone clone (reflink) instead of copying "
- "files\n"));
- printf(
- _(" --copy-file-range copy using copy_file_range() syscall\n"));
+ printf(_(" --no-manifest suppress generation of backup manifest\n"));
+ printf(_(" --sync-method=METHOD set method for syncing files to disk\n"));
+ printf(_(" --clone clone (reflink) instead of copying files\n"));
+ printf(_(" --copy-file-range copy using copy_file_range() syscall\n"));
printf(_(" -?, --help show this help, then exit\n"));
printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
@@ -761,10 +761,15 @@ parse_oid(char *s, Oid *result)
* the locations of those previous backups.
*/
static void
-process_directory_recursively(
- Oid tsoid, char *input_directory, char *output_directory,
- char *relative_path, int n_prior_backups, char **prior_backup_dirs,
- manifest_data **manifests, manifest_writer *mwriter, cb_options *opt)
+process_directory_recursively(Oid tsoid,
+ char *input_directory,
+ char *output_directory,
+ char *relative_path,
+ int n_prior_backups,
+ char **prior_backup_dirs,
+ manifest_data **manifests,
+ manifest_writer *mwriter,
+ cb_options *opt)
{
char ifulldir[MAXPGPATH];
char ofulldir[MAXPGPATH];
@@ -817,11 +822,13 @@ process_directory_recursively(
}
else
{
- snprintf(ifulldir, MAXPGPATH, "%s/%s", input_directory, relative_path);
- snprintf(ofulldir, MAXPGPATH, "%s/%s", output_directory, relative_path);
+ snprintf(ifulldir, MAXPGPATH, "%s/%s", input_directory,
+ relative_path);
+ snprintf(ofulldir, MAXPGPATH, "%s/%s", output_directory,
+ relative_path);
if (OidIsValid(tsoid))
- snprintf(manifest_prefix, MAXPGPATH, "pg_tblspc/%u/%s/", tsoid,
- relative_path);
+ snprintf(manifest_prefix, MAXPGPATH, "pg_tblspc/%u/%s/",
+ tsoid, relative_path);
else
snprintf(manifest_prefix, MAXPGPATH, "%s/", relative_path);
}
@@ -857,7 +864,8 @@ process_directory_recursively(
pg_checksum_context checksum_ctx;
/* Ignore "." and ".." entries. */
- if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+ if (strcmp(de->d_name, ".") == 0 ||
+ strcmp(de->d_name, "..") == 0)
continue;
/* Construct input path. */
@@ -893,9 +901,11 @@ process_directory_recursively(
de->d_name);
/* And recurse. */
- process_directory_recursively(tsoid, input_directory, output_directory,
- new_relative_path, n_prior_backups,
- prior_backup_dirs, manifests, mwriter, opt);
+ process_directory_recursively(tsoid,
+ input_directory, output_directory,
+ new_relative_path,
+ n_prior_backups, prior_backup_dirs,
+ manifests, mwriter, opt);
continue;
}
@@ -913,37 +923,47 @@ process_directory_recursively(
* Skip the backup_label and backup_manifest files; they require
* special handling and are handled elsewhere.
*/
- if (relative_path == NULL && (strcmp(de->d_name, "backup_label") == 0 ||
- strcmp(de->d_name, "backup_manifest") == 0))
+ if (relative_path == NULL &&
+ (strcmp(de->d_name, "backup_label") == 0 ||
+ strcmp(de->d_name, "backup_manifest") == 0))
continue;
/*
* If it's an incremental file, hand it off to the reconstruction
* code, which will figure out what to do.
*/
- if (strncmp(de->d_name, INCREMENTAL_PREFIX, INCREMENTAL_PREFIX_LENGTH) ==
- 0)
+ if (strncmp(de->d_name, INCREMENTAL_PREFIX,
+ INCREMENTAL_PREFIX_LENGTH) == 0)
{
/* Output path should not include "INCREMENTAL." prefix. */
snprintf(ofullpath, MAXPGPATH, "%s/%s", ofulldir,
de->d_name + INCREMENTAL_PREFIX_LENGTH);
+
/* Manifest path likewise omits incremental prefix. */
snprintf(manifest_path, MAXPGPATH, "%s%s", manifest_prefix,
de->d_name + INCREMENTAL_PREFIX_LENGTH);
/* Reconstruction logic will do the rest. */
- reconstruct_from_incremental_file(
- ifullpath, ofullpath, relative_path,
- de->d_name + INCREMENTAL_PREFIX_LENGTH, n_prior_backups,
- prior_backup_dirs, manifests, manifest_path, checksum_type,
- &checksum_length, &checksum_payload, opt->debug, opt->dry_run,
+ reconstruct_from_incremental_file(ifullpath, ofullpath,
+ relative_path,
+ de->d_name + INCREMENTAL_PREFIX_LENGTH,
+ n_prior_backups,
+ prior_backup_dirs,
+ manifests,
+ manifest_path,
+ checksum_type,
+ &checksum_length,
+ &checksum_payload,
+ opt->debug,
+ opt->dry_run,
opt->copy_method);
}
else
{
/* Construct the path that the backup_manifest will use. */
- snprintf(manifest_path, MAXPGPATH, "%s%s", manifest_prefix, de->d_name);
+ snprintf(manifest_path, MAXPGPATH, "%s%s", manifest_prefix,
+ de->d_name);
/*
* It's not an incremental file, so we need to copy the entire
@@ -953,11 +973,13 @@ process_directory_recursively(
* backup_manifest for the final input directory, we can save some
* work by reusing that checksum instead of computing a new one.
*/
- if (checksum_type != CHECKSUM_TYPE_NONE && latest_manifest != NULL)
+ if (checksum_type != CHECKSUM_TYPE_NONE &&
+ latest_manifest != NULL)
{
manifest_file *mfile;
- mfile = manifest_files_lookup(latest_manifest->files, manifest_path);
+ mfile = manifest_files_lookup(latest_manifest->files,
+ manifest_path);
if (mfile == NULL)
{
char *bmpath;
@@ -966,9 +988,10 @@ process_directory_recursively(
* The directory is out of sync with the backup_manifest,
* so emit a warning.
*/
- bmpath = psprintf("%s/%s", input_directory, "backup_manifest");
- pg_log_warning("\"%s\" contains no entry for \"%s\"", bmpath,
- manifest_path);
+ bmpath = psprintf("%s/%s", input_directory,
+ "backup_manifest");
+ pg_log_warning("\"%s\" contains no entry for \"%s\"",
+ bmpath, manifest_path);
pfree(bmpath);
}
else if (mfile->checksum_type == checksum_type)
@@ -1001,7 +1024,8 @@ process_directory_recursively(
if (checksum_ctx.type != CHECKSUM_TYPE_NONE && !opt->dry_run)
{
checksum_payload = pg_malloc(PG_CHECKSUM_MAX_LENGTH);
- checksum_length = pg_checksum_final(&checksum_ctx, checksum_payload);
+ checksum_length = pg_checksum_final(&checksum_ctx,
+ checksum_payload);
}
}
@@ -1027,8 +1051,10 @@ process_directory_recursively(
pg_fatal("could not stat file \"%s\": %m", ofullpath);
/* OK, now do the work. */
- add_file_to_manifest(mwriter, manifest_path, sb.st_size, sb.st_mtime,
- checksum_type, checksum_length, checksum_payload);
+ add_file_to_manifest(mwriter, manifest_path,
+ sb.st_size, sb.st_mtime,
+ checksum_type, checksum_length,
+ checksum_payload);
}
/* Avoid leaking memory. */
@@ -1136,8 +1162,7 @@ reset_directory_cleanup_list(void)
* final backup in the backup chain.
*/
static cb_tablespace *
-scan_for_existing_tablespaces(char *pathname,
- cb_options *opt)
+scan_for_existing_tablespaces(char *pathname, cb_options *opt)
{
char pg_tblspc[MAXPGPATH];
DIR *dir;
@@ -1170,8 +1195,7 @@ scan_for_existing_tablespaces(char *pathname,
/* Ignore any file name that doesn't look like a proper OID. */
if (!parse_oid(de->d_name, &oid))
{
- pg_log_debug(
- "skipping \"%s\" because the filename is not a legal tablespace OID",
+ pg_log_debug("skipping \"%s\" because the filename is not a legal tablespace OID",
tblspcdir);
continue;
}
@@ -1182,8 +1206,7 @@ scan_for_existing_tablespaces(char *pathname,
exit(1);
if (type != PGFILETYPE_LNK && type != PGFILETYPE_DIR)
{
- pg_log_debug("skipping \"%s\" because it is neither a symbolic link nor "
- "a directory",
+ pg_log_debug("skipping \"%s\" because it is neither a symbolic link nor a directory",
tblspcdir);
continue;
}
@@ -1203,7 +1226,8 @@ scan_for_existing_tablespaces(char *pathname,
/* Read the link target. */
link_length = readlink(tblspcdir, link_target, sizeof(link_target));
if (link_length < 0)
- pg_fatal("could not read symbolic link \"%s\": %m", tblspcdir);
+ pg_fatal("could not read symbolic link \"%s\": %m",
+ tblspcdir);
if (link_length >= sizeof(link_target))
pg_fatal("symbolic link \"%s\" is too long", tblspcdir);
link_target[link_length] = '\0';
@@ -1230,7 +1254,8 @@ scan_for_existing_tablespaces(char *pathname,
/* Every non-in-place tablespace must be mapped. */
if (tsmap == NULL)
- pg_fatal("tablespace at \"%s\" has no tablespace mapping", link_target);
+ pg_fatal("tablespace at \"%s\" has no tablespace mapping",
+ link_target);
}
else
{
diff --git a/src/bin/pg_combinebackup/reconstruct.c b/src/bin/pg_combinebackup/reconstruct.c
index 4daff9c77be..c37cceba030 100644
--- a/src/bin/pg_combinebackup/reconstruct.c
+++ b/src/bin/pg_combinebackup/reconstruct.c
@@ -46,16 +46,20 @@ typedef struct rfile
off_t highest_offset_read;
} rfile;
-static void debug_reconstruction(int n_source, rfile **sources, bool dry_run);
+static void debug_reconstruction(int n_source,
+ rfile **sources,
+ bool dry_run);
static unsigned find_reconstructed_block_length(rfile *s);
static rfile *make_incremental_rfile(char *filename);
static rfile *make_rfile(char *filename, bool missing_ok);
static void write_reconstructed_file(char *input_filename,
char *output_filename,
- unsigned block_length, rfile **sourcemap,
+ unsigned block_length,
+ rfile **sourcemap,
off_t *offsetmap,
pg_checksum_context *checksum_ctx,
- bool debug, bool dry_run);
+ bool debug,
+ bool dry_run);
static void read_bytes(rfile *rf, void *buffer, unsigned length);
/*
@@ -74,12 +78,19 @@ static void read_bytes(rfile *rf, void *buffer, unsigned length);
* an array of pathnames where those backups can be found.
*/
void
-reconstruct_from_incremental_file(
- char *input_filename, char *output_filename, char *relative_path,
- char *bare_file_name, int n_prior_backups, char **prior_backup_dirs,
- manifest_data **manifests, char *manifest_path,
- pg_checksum_type checksum_type, int *checksum_length,
- uint8 **checksum_payload, bool debug, bool dry_run,
+reconstruct_from_incremental_file(char *input_filename,
+ char *output_filename,
+ char *relative_path,
+ char *bare_file_name,
+ int n_prior_backups,
+ char **prior_backup_dirs,
+ manifest_data **manifests,
+ char *manifest_path,
+ pg_checksum_type checksum_type,
+ int *checksum_length,
+ uint8 **checksum_payload,
+ bool debug,
+ bool dry_run,
CopyFileMethod copy_method)
{
rfile **source;
@@ -157,8 +168,8 @@ reconstruct_from_incremental_file(
* Look for the full file in the previous backup. If not found, then
* look for an incremental file instead.
*/
- snprintf(source_filename, MAXPGPATH, "%s/%s/%s", prior_backup_dirs[sidx],
- relative_path, bare_file_name);
+ snprintf(source_filename, MAXPGPATH, "%s/%s/%s",
+ prior_backup_dirs[sidx], relative_path, bare_file_name);
if ((s = make_rfile(source_filename, true)) == NULL)
{
snprintf(source_filename, MAXPGPATH, "%s/%s/INCREMENTAL.%s",
@@ -221,7 +232,8 @@ reconstruct_from_incremental_file(
{
uint64 expected_length;
- expected_length = (uint64) latest_source->truncation_block_length;
+ expected_length =
+ (uint64) latest_source->truncation_block_length;
expected_length *= BLCKSZ;
if (expected_length == sb.st_size)
{
@@ -242,7 +254,8 @@ reconstruct_from_incremental_file(
{
BlockNumber b = s->relative_block_numbers[i];
- if (b < latest_source->truncation_block_length && sourcemap[b] == NULL)
+ if (b < latest_source->truncation_block_length &&
+ sourcemap[b] == NULL)
{
sourcemap[b] = s;
offsetmap[b] = s->header_length + (i * BLCKSZ);
@@ -271,16 +284,16 @@ reconstruct_from_incremental_file(
manifest_path);
if (mfile == NULL)
{
- char *path =
- psprintf("%s/backup_manifest", prior_backup_dirs[copy_source_index]);
+ char *path = psprintf("%s/backup_manifest",
+ prior_backup_dirs[copy_source_index]);
/*
* The directory is out of sync with the backup_manifest, so emit
* a warning.
*/
- /*- translator: the first %s is a backup manifest file, the second is a
- * file absent therein */
- pg_log_warning("\"%s\" contains no entry for \"%s\"", path,
+ /*- translator: the first %s is a backup manifest file, the second is a file absent therein */
+ pg_log_warning("\"%s\" contains no entry for \"%s\"",
+ path,
manifest_path);
pfree(path);
}
@@ -288,7 +301,8 @@ reconstruct_from_incremental_file(
{
*checksum_length = mfile->checksum_length;
*checksum_payload = pg_malloc(*checksum_length);
- memcpy(*checksum_payload, mfile->checksum_payload, *checksum_length);
+ memcpy(*checksum_payload, mfile->checksum_payload,
+ *checksum_length);
checksum_type = CHECKSUM_TYPE_NONE;
}
}
@@ -305,13 +319,13 @@ reconstruct_from_incremental_file(
* Otherwise, reconstruct.
*/
if (copy_source != NULL)
- copy_file(copy_source->filename, output_filename, &checksum_ctx, dry_run,
- copy_method);
+ copy_file(copy_source->filename, output_filename,
+ &checksum_ctx, dry_run, copy_method);
else
{
- write_reconstructed_file(input_filename, output_filename, block_length,
- sourcemap, offsetmap, &checksum_ctx, debug,
- dry_run);
+ write_reconstructed_file(input_filename, output_filename,
+ block_length, sourcemap, offsetmap,
+ &checksum_ctx, debug, dry_run);
debug_reconstruction(n_prior_backups + 1, source, dry_run);
}
@@ -319,7 +333,8 @@ reconstruct_from_incremental_file(
if (checksum_type != CHECKSUM_TYPE_NONE)
{
*checksum_payload = pg_malloc(PG_CHECKSUM_MAX_LENGTH);
- *checksum_length = pg_checksum_final(&checksum_ctx, *checksum_payload);
+ *checksum_length = pg_checksum_final(&checksum_ctx,
+ *checksum_payload);
}
/*
@@ -364,11 +379,11 @@ debug_reconstruction(int n_source, rfile **sources, bool dry_run)
/* Debug logging. */
if (dry_run)
- pg_log_debug("would have read %u blocks from \"%s\"", s->num_blocks_read,
- s->filename);
+ pg_log_debug("would have read %u blocks from \"%s\"",
+ s->num_blocks_read, s->filename);
else
- pg_log_debug("read %u blocks from \"%s\"", s->num_blocks_read,
- s->filename);
+ pg_log_debug("read %u blocks from \"%s\"",
+ s->num_blocks_read, s->filename);
/*
* In dry-run mode, we don't actually try to read data from the file,
@@ -387,7 +402,8 @@ debug_reconstruction(int n_source, rfile **sources, bool dry_run)
pg_fatal("could not stat \"%s\": %m", s->filename);
if (sb.st_size < s->highest_offset_read)
pg_fatal("file \"%s\" is too short: expected %llu, found %llu",
- s->filename, (unsigned long long) s->highest_offset_read,
+ s->filename,
+ (unsigned long long) s->highest_offset_read,
(unsigned long long) sb.st_size);
}
}
@@ -440,8 +456,7 @@ make_incremental_rfile(char *filename)
read_bytes(rf, &rf->truncation_block_length,
sizeof(rf->truncation_block_length));
if (rf->truncation_block_length > RELSEG_SIZE)
- pg_fatal("file \"%s\" has truncation block length %u in excess of segment "
- "size %u",
+ pg_fatal("file \"%s\" has truncation block length %u in excess of segment size %u",
filename, rf->truncation_block_length, RELSEG_SIZE);
/* Read block numbers if there are any. */
@@ -508,10 +523,12 @@ read_bytes(rfile *rf, void *buffer, unsigned length)
static void
write_reconstructed_file(char *input_filename,
char *output_filename,
- unsigned block_length, rfile **sourcemap,
+ unsigned block_length,
+ rfile **sourcemap,
off_t *offsetmap,
pg_checksum_context *checksum_ctx,
- bool debug, bool dry_run)
+ bool debug,
+ bool dry_run)
{
int wfd = -1;
unsigned i;
@@ -554,8 +571,8 @@ write_reconstructed_file(char *input_filename,
if (current_block == start_of_range)
appendStringInfo(&debug_buf, " %u:zero", current_block);
else
- appendStringInfo(&debug_buf, " %u-%u:zero", start_of_range,
- current_block);
+ appendStringInfo(&debug_buf, " %u-%u:zero",
+ start_of_range, current_block);
}
else
{
@@ -587,7 +604,8 @@ write_reconstructed_file(char *input_filename,
/* Open the output file, except in dry_run mode. */
if (!dry_run &&
- (wfd = open(output_filename, O_RDWR | PG_BINARY | O_CREAT | O_EXCL,
+ (wfd = open(output_filename,
+ O_RDWR | PG_BINARY | O_CREAT | O_EXCL,
pg_file_create_mode)) < 0)
pg_fatal("could not open file \"%s\": %m", output_filename);
@@ -604,8 +622,8 @@ write_reconstructed_file(char *input_filename,
else
{
s->num_blocks_read++;
- s->highest_offset_read =
- Max(s->highest_offset_read, offsetmap[i] + BLCKSZ);
+ s->highest_offset_read = Max(s->highest_offset_read,
+ offsetmap[i] + BLCKSZ);
}
/* Skip the rest of this in dry-run mode. */
@@ -632,9 +650,9 @@ write_reconstructed_file(char *input_filename,
if (rb < 0)
pg_fatal("could not read file \"%s\": %m", s->filename);
else
- pg_fatal("could not read file \"%s\": read only %d of %d bytes at "
- "offset %llu",
- s->filename, rb, BLCKSZ, (unsigned long long) offsetmap[i]);
+ pg_fatal("could not read file \"%s\": read only %d of %d bytes at offset %llu",
+ s->filename, rb, BLCKSZ,
+ (unsigned long long) offsetmap[i]);
}
}
@@ -650,7 +668,8 @@ write_reconstructed_file(char *input_filename,
/* Update the checksum computation. */
if (pg_checksum_update(checksum_ctx, buffer, BLCKSZ) < 0)
- pg_fatal("could not update checksum of file \"%s\"", output_filename);
+ pg_fatal("could not update checksum of file \"%s\"",
+ output_filename);
}
/* Debugging output. */
diff --git a/src/bin/pg_combinebackup/reconstruct.h b/src/bin/pg_combinebackup/reconstruct.h
index 1fa734011bd..8d19dbf7e50 100644
--- a/src/bin/pg_combinebackup/reconstruct.h
+++ b/src/bin/pg_combinebackup/reconstruct.h
@@ -18,12 +18,19 @@
#include "common/file_utils.h"
#include "load_manifest.h"
-extern void reconstruct_from_incremental_file(
- char *input_filename, char *output_filename, char *relative_path,
- char *bare_file_name, int n_prior_backups, char **prior_backup_dirs,
- manifest_data **manifests, char *manifest_path,
- pg_checksum_type checksum_type, int *checksum_length,
- uint8 **checksum_payload, bool debug, bool dry_run,
+extern void reconstruct_from_incremental_file(char *input_filename,
+ char *output_filename,
+ char *relative_path,
+ char *bare_file_name,
+ int n_prior_backups,
+ char **prior_backup_dirs,
+ manifest_data **manifests,
+ char *manifest_path,
+ pg_checksum_type checksum_type,
+ int *checksum_length,
+ uint8 **checksum_payload,
+ bool debug,
+ bool dry_run,
CopyFileMethod copy_method);
#endif
--
2.44.0
v20240319-0001-rebased-patch.patchtext/x-patch; charset=UTF-8; name=v20240319-0001-rebased-patch.patchDownload
From b1183fbae8ed0123d7385a8501f1a843f0d9aa85 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Tue, 19 Mar 2024 15:16:29 +0100
Subject: [PATCH v20240319 1/2] rebased patch
---
src/bin/pg_combinebackup/copy_file.c | 227 +++++++++++++-------
src/bin/pg_combinebackup/copy_file.h | 14 +-
src/bin/pg_combinebackup/pg_combinebackup.c | 220 ++++++++++---------
src/bin/pg_combinebackup/reconstruct.c | 106 ++++-----
src/bin/pg_combinebackup/reconstruct.h | 22 +-
5 files changed, 328 insertions(+), 261 deletions(-)
diff --git a/src/bin/pg_combinebackup/copy_file.c b/src/bin/pg_combinebackup/copy_file.c
index e6d2423278a..16e26b4f573 100644
--- a/src/bin/pg_combinebackup/copy_file.c
+++ b/src/bin/pg_combinebackup/copy_file.c
@@ -10,19 +10,21 @@
*/
#include "postgres_fe.h"
-#ifdef HAVE_COPYFILE_H
-#include <copyfile.h>
-#endif
#include <fcntl.h>
+#include <limits.h>
#include <sys/stat.h>
#include <unistd.h>
#include "common/file_perm.h"
+#include "common/file_utils.h"
#include "common/logging.h"
#include "copy_file.h"
-static void copy_file_blocks(const char *src, const char *dst,
- pg_checksum_context *checksum_ctx);
+static void pg_copyfile(const char *src, const char *dest, const char *addon_errmsg,
+ pg_checksum_context *ctx);
+
+static void pg_copyfile_offload(const char *src, const char *dest,
+ const char *addon_errmsg, CopyFileMethod flags);
#ifdef WIN32
static void copy_file_copyfile(const char *src, const char *dst);
@@ -35,7 +37,8 @@ static void copy_file_copyfile(const char *src, const char *dst);
*/
void
copy_file(const char *src, const char *dst,
- pg_checksum_context *checksum_ctx, bool dry_run)
+ pg_checksum_context *checksum_ctx, bool dry_run,
+ CopyFileMethod copy_strategy)
{
/*
* In dry-run mode, we don't actually copy anything, nor do we read any
@@ -49,6 +52,8 @@ copy_file(const char *src, const char *dst,
pg_fatal("could not open \"%s\": %m", src);
if (close(fd) < 0)
pg_fatal("could not close \"%s\": %m", src);
+
+ return;
}
/*
@@ -56,104 +61,180 @@ copy_file(const char *src, const char *dst,
* operating system primitives that we know about to copy the file; this
* may be quicker than a naive block copy.
*/
- if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
- {
- char *strategy_name = NULL;
- void (*strategy_implementation) (const char *, const char *) = NULL;
+ if (checksum_ctx->type == CHECKSUM_TYPE_NONE && copy_strategy != 0)
+ pg_copyfile_offload(src, dst, NULL, copy_strategy);
+ else
+ pg_copyfile(src, dst, NULL, checksum_ctx);
+}
-#ifdef WIN32
- strategy_name = "CopyFile";
- strategy_implementation = copy_file_copyfile;
-#endif
+/* Helper function to optionally prepend error string */
+static inline char *
+opt_errinfo(const char *addon_errmsg)
+{
+ char buf[128];
- if (strategy_name != NULL)
- {
- if (dry_run)
- pg_log_debug("would copy \"%s\" to \"%s\" using strategy %s",
- src, dst, strategy_name);
- else
- {
- pg_log_debug("copying \"%s\" to \"%s\" using strategy %s",
- src, dst, strategy_name);
- (*strategy_implementation) (src, dst);
- }
- return;
- }
- }
+ if (addon_errmsg == NULL)
+ return "";
- /*
- * Fall back to the simple approach of reading and writing all the blocks,
- * feeding them into the checksum context as we go.
- */
- if (dry_run)
- {
- if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
- pg_log_debug("would copy \"%s\" to \"%s\"",
- src, dst);
- else
- pg_log_debug("would copy \"%s\" to \"%s\" and checksum with %s",
- src, dst, pg_checksum_type_name(checksum_ctx->type));
- }
- else
- {
- if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
- pg_log_debug("copying \"%s\" to \"%s\"",
- src, dst);
- else
- pg_log_debug("copying \"%s\" to \"%s\" and checksumming with %s",
- src, dst, pg_checksum_type_name(checksum_ctx->type));
- copy_file_blocks(src, dst, checksum_ctx);
- }
+ strcpy(buf, " ");
+ return strncat(buf, addon_errmsg, sizeof(buf) - 2);
}
/*
- * Copy a file block by block, and optionally compute a checksum as we go.
+ * Copies a relation file from src to dest. addon_errmsg is an optional
+ * addon error message (can be NULL or include schema/relName)
*/
static void
-copy_file_blocks(const char *src, const char *dst,
- pg_checksum_context *checksum_ctx)
+pg_copyfile(const char *src, const char *dest, const char *addon_errmsg,
+ pg_checksum_context *ctx)
{
+#ifndef WIN32
int src_fd;
int dest_fd;
uint8 *buffer;
+
+ /* copy in fairly large chunks for best efficiency */
const int buffer_size = 50 * BLCKSZ;
- ssize_t rb;
- unsigned offset = 0;
if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
- pg_fatal("could not open file \"%s\": %m", src);
+ pg_fatal("error while copying%s: could not open file \"%s\": %s",
+ opt_errinfo(addon_errmsg), src, strerror(errno));
- if ((dest_fd = open(dst, O_WRONLY | O_CREAT | O_EXCL | PG_BINARY,
+ if ((dest_fd = open(dest, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
pg_file_create_mode)) < 0)
- pg_fatal("could not open file \"%s\": %m", dst);
+ pg_fatal("error while copying%s: could not create file \"%s\": %s",
+ opt_errinfo(addon_errmsg), dest, strerror(errno));
buffer = pg_malloc(buffer_size);
- while ((rb = read(src_fd, buffer, buffer_size)) > 0)
+ /* perform data copying i.e read src source, write to destination */
+ while (true)
{
- ssize_t wb;
+ ssize_t nbytes = read(src_fd, buffer, buffer_size);
- if ((wb = write(dest_fd, buffer, rb)) != rb)
+ if (nbytes < 0)
+ pg_fatal("error while copying%s: could not read file "
+ "\"%s\": %s",
+ opt_errinfo(addon_errmsg), src, strerror(errno));
+
+ if (nbytes == 0)
+ break;
+
+ errno = 0;
+ if (write(dest_fd, buffer, nbytes) != nbytes)
{
- if (wb < 0)
- pg_fatal("could not write file \"%s\": %m", dst);
- else
- pg_fatal("could not write file \"%s\": wrote only %d of %d bytes at offset %u",
- dst, (int) wb, (int) rb, offset);
+ /*
+ * if write didn't set errno, assume problem is no disk space
+ */
+ if (errno == 0)
+ errno = ENOSPC;
+ pg_fatal("error while copying%s: could not write file \"%s\": %s",
+ opt_errinfo(addon_errmsg), dest, strerror(errno));
}
- if (pg_checksum_update(checksum_ctx, buffer, rb) < 0)
- pg_fatal("could not update checksum of file \"%s\"", dst);
+ if (pg_checksum_update(ctx, buffer, nbytes) < 0)
+ pg_fatal("could not calculate checksum of file \"%s\"", dest);
+ }
+
+ pg_free(buffer);
+ close(src_fd);
+ close(dest_fd);
+
+#else /* WIN32 */
+ if (CopyFile(src, dest, true) == 0)
+ {
+ _dosmaperr(GetLastError());
+ pg_fatal("error while copying%s (\"%s\" to \"%s\"): %s", addon_errmsg,
+ opt_errinfo(addon_errmsg), src, dest, strerror(errno));
+ }
+#endif /* WIN32 */
+}
+
+/*
+ * pg_copyfile_offload()
+ *
+ * Clones/reflinks a relation file from src to dest using variety of methods
+ *
+ * addon_errmsg can be used to pass additional information in case of errors.
+ * flags, see PG_COPYFILE_* enum in file_utils.h
+ */
+static void
+pg_copyfile_offload(const char *src, const char *dest,
+ const char *addon_errmsg, CopyFileMethod flags)
+{
- offset += rb;
+#ifdef WIN32
+ /* on WIN32 we ignore flags, we have no other choice */
+ if (CopyFile(src, dest, true) == 0)
+ {
+ _dosmaperr(GetLastError());
+ pg_fatal("error while copying%s (\"%s\" to \"%s\"): %s", addon_errmsg,
+ opt_errinfo(addon_errmsg), src, dest, strerror(errno));
}
+#elif defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
+ /* on MacOS we ignore flags, we have no other choice */
+ if (copyfile(src, dest, NULL, COPYFILE_CLONE_FORCE) < 0)
+ pg_fatal("error while cloning%s: (\"%s\" to \"%s\"): %s",
+ opt_errinfo(addon_errmsg), src, dest, strerror(errno));
+
+#elif defined(HAVE_COPY_FILE_RANGE) || defined(FICLONE)
+ int src_fd;
+ int dest_fd;
+ ssize_t nbytes;
- if (rb < 0)
- pg_fatal("could not read file \"%s\": %m", dst);
+ if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
+ pg_fatal("error while copying%s: could not open file \"%s\": %s",
+ opt_errinfo(addon_errmsg), src, strerror(errno));
+
+ if ((dest_fd = open(dest, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+ pg_file_create_mode)) < 0)
+ pg_fatal("error while copying%s: could not create file \"%s\": %s",
+ opt_errinfo(addon_errmsg), dest, strerror(errno));
+
+ if (flags & PG_COPYFILE_COPY_FILE_RANGE)
+ {
+#ifdef HAVE_COPY_FILE_RANGE
+ do
+ {
+ nbytes = copy_file_range(src_fd, NULL, dest_fd, NULL, SSIZE_MAX, 0);
+ if (nbytes < 0 && errno != EINTR)
+ pg_fatal("error while copying%s: could not copy_file_range() "
+ "from \"%s\" to \"%s\": %s",
+ opt_errinfo(addon_errmsg), src, dest, strerror(errno));
+ } while (nbytes > 0);
+#else
+ pg_fatal("copy file acceleration via copy_file_range() is not supported on "
+ "this platform");
+#endif
+ }
+ else if (flags & PG_COPYFILE_IOCTL_FICLONE)
+ {
+#if defined(__linux__) && defined(FICLONE)
+ if (ioctl(dest_fd, FICLONE, src_fd) < 0)
+ {
+ int save_errno = errno;
+
+ unlink(dest);
+
+ pg_fatal("error while cloning%s: (\"%s\" to \"%s\"): %s",
+ opt_errinfo(addon_errmsg), src, dest, strerror(save_errno));
+ }
+#else
+ pg_fatal("clone file acceleration via ioctl(FICLONE) is not supported on "
+ "this platform");
+#endif
+ }
- pg_free(buffer);
close(src_fd);
close(dest_fd);
+
+#else
+ if (flags & PG_COPYFILE_FALLBACK)
+ pg_copyfile(src, dest, addon_errmsg);
+ else
+ pg_fatal("none of the copy file acceleration methods are supported on this "
+ "platform");
+#endif
}
#ifdef WIN32
diff --git a/src/bin/pg_combinebackup/copy_file.h b/src/bin/pg_combinebackup/copy_file.h
index 0f6bc09403f..2797a340055 100644
--- a/src/bin/pg_combinebackup/copy_file.h
+++ b/src/bin/pg_combinebackup/copy_file.h
@@ -11,9 +11,21 @@
#ifndef COPY_FILE_H
#define COPY_FILE_H
+#include "c.h"
#include "common/checksum_helper.h"
+#include "common/file_utils.h"
+
+typedef enum CopyFileMethod
+{
+ PG_COPYFILE_FALLBACK = 0x1,
+ PG_COPYFILE_IOCTL_FICLONE = 0x2, /* Linux */
+ PG_COPYFILE_COPY_FILE_RANGE = 0x4, /* FreeBSD & Linux >= 4.5 */
+ PG_COPYFILE_COPYFILE_CLONE_FORCE = 0x8 /* MacOS */
+} CopyFileMethod;
+#define PG_COPYFILE_ANY_WITH_FALLBACK (2 << 4) - 1
extern void copy_file(const char *src, const char *dst,
- pg_checksum_context *checksum_ctx, bool dry_run);
+ pg_checksum_context *checksum_ctx, bool dry_run,
+ CopyFileMethod copy_strategy);
#endif /* COPY_FILE_H */
diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c
index 74f8be9eeac..1455360d81c 100644
--- a/src/bin/pg_combinebackup/pg_combinebackup.c
+++ b/src/bin/pg_combinebackup/pg_combinebackup.c
@@ -69,6 +69,7 @@ typedef struct cb_options
pg_checksum_type manifest_checksums;
bool no_manifest;
DataDirSyncMethod sync_method;
+ CopyFileMethod copy_method;
} cb_options;
/*
@@ -98,15 +99,10 @@ static void cleanup_directories_atexit(void);
static void create_output_directory(char *dirname, cb_options *opt);
static void help(const char *progname);
static bool parse_oid(char *s, Oid *result);
-static void process_directory_recursively(Oid tsoid,
- char *input_directory,
- char *output_directory,
- char *relative_path,
- int n_prior_backups,
- char **prior_backup_dirs,
- manifest_data **manifests,
- manifest_writer *mwriter,
- cb_options *opt);
+static void process_directory_recursively(
+ Oid tsoid, char *input_directory, char *output_directory,
+ char *relative_path, int n_prior_backups, char **prior_backup_dirs,
+ manifest_data **manifests, manifest_writer *mwriter, cb_options *opt);
static int read_pg_version_file(char *directory);
static void remember_to_cleanup_directory(char *target_path, bool rmtopdir);
static void reset_directory_cleanup_list(void);
@@ -129,8 +125,9 @@ main(int argc, char *argv[])
{"manifest-checksums", required_argument, NULL, 1},
{"no-manifest", no_argument, NULL, 2},
{"sync-method", required_argument, NULL, 3},
- {NULL, 0, NULL, 0}
- };
+ {"clone", no_argument, NULL, 4},
+ {"copy-file-range", no_argument, NULL, 5},
+ {NULL, 0, NULL, 0}};
const char *progname;
char *last_input_dir;
@@ -156,10 +153,11 @@ main(int argc, char *argv[])
memset(&opt, 0, sizeof(opt));
opt.manifest_checksums = CHECKSUM_TYPE_CRC32C;
opt.sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
+ opt.copy_method = 0;
/* process command-line options */
- while ((c = getopt_long(argc, argv, "dnNPo:T:",
- long_options, &optindex)) != -1)
+ while ((c = getopt_long(argc, argv, "dnNPo:T:", long_options, &optindex)) !=
+ -1)
{
switch (c)
{
@@ -180,10 +178,8 @@ main(int argc, char *argv[])
add_tablespace_mapping(&opt, optarg);
break;
case 1:
- if (!pg_checksum_parse_type(optarg,
- &opt.manifest_checksums))
- pg_fatal("unrecognized checksum algorithm: \"%s\"",
- optarg);
+ if (!pg_checksum_parse_type(optarg, &opt.manifest_checksums))
+ pg_fatal("unrecognized checksum algorithm: \"%s\"", optarg);
break;
case 2:
opt.no_manifest = true;
@@ -192,6 +188,12 @@ main(int argc, char *argv[])
if (!parse_sync_method(optarg, &opt.sync_method))
exit(1);
break;
+ case 4:
+ opt.copy_method = PG_COPYFILE_IOCTL_FICLONE;
+ break;
+ case 5:
+ opt.copy_method = PG_COPYFILE_COPY_FILE_RANGE;
+ break;
default:
/* getopt_long already emitted a complaint */
pg_log_error_hint("Try \"%s --help\" for more information.", progname);
@@ -213,6 +215,14 @@ main(int argc, char *argv[])
if (opt.no_manifest)
opt.manifest_checksums = CHECKSUM_TYPE_NONE;
+ /*
+ * We cannot provide file copy/clone offload in case when we need to
+ * calculate checksums
+ */
+ if (opt.copy_method != 0 && opt.manifest_checksums != CHECKSUM_TYPE_NONE)
+ pg_fatal("unable to use accelerated copy when manifest checksums "
+ "are to be calculated. Use --no-manifest");
+
/* Read the server version from the final backup. */
version = read_pg_version_file(argv[argc - 1]);
@@ -285,7 +295,8 @@ main(int argc, char *argv[])
* won't have the WAL ranges for the resulting manifest.
*/
if (manifests[n_prior_backups] == NULL)
- pg_fatal("can't generate a manifest because no manifest is available for the final input backup");
+ pg_fatal("can't generate a manifest because no manifest is available for "
+ "the final input backup");
}
else
mwriter = NULL;
@@ -297,15 +308,15 @@ main(int argc, char *argv[])
{
pg_log_debug("generating \"%s/backup_label\"", opt.output);
last_backup_label->cursor = 0;
- write_backup_label(opt.output, last_backup_label,
- opt.manifest_checksums, mwriter);
+ write_backup_label(opt.output, last_backup_label, opt.manifest_checksums,
+ mwriter);
}
/* Process everything that's not part of a user-defined tablespace. */
pg_log_debug("processing backup directory \"%s\"", last_input_dir);
- process_directory_recursively(InvalidOid, last_input_dir, opt.output,
- NULL, n_prior_backups, prior_backup_dirs,
- manifests, mwriter, &opt);
+ process_directory_recursively(InvalidOid, last_input_dir, opt.output, NULL,
+ n_prior_backups, prior_backup_dirs, manifests,
+ mwriter, &opt);
/* Process user-defined tablespaces. */
for (ts = tablespaces; ts != NULL; ts = ts->next)
@@ -321,16 +332,15 @@ main(int argc, char *argv[])
{
char linkpath[MAXPGPATH];
- snprintf(linkpath, MAXPGPATH, "%s/pg_tblspc/%u", opt.output,
- ts->oid);
+ snprintf(linkpath, MAXPGPATH, "%s/pg_tblspc/%u", opt.output, ts->oid);
if (opt.dry_run)
pg_log_debug("would create symbolic link from \"%s\" to \"%s\"",
linkpath, ts->new_dir);
else
{
- pg_log_debug("creating symbolic link from \"%s\" to \"%s\"",
- linkpath, ts->new_dir);
+ pg_log_debug("creating symbolic link from \"%s\" to \"%s\"", linkpath,
+ ts->new_dir);
if (symlink(ts->new_dir, linkpath) != 0)
pg_fatal("could not create symbolic link from \"%s\" to \"%s\": %m",
linkpath, ts->new_dir);
@@ -344,21 +354,19 @@ main(int argc, char *argv[])
{
pg_log_debug("creating directory \"%s\"", ts->new_dir);
if (pg_mkdir_p(ts->new_dir, pg_dir_create_mode) == -1)
- pg_fatal("could not create directory \"%s\": %m",
- ts->new_dir);
+ pg_fatal("could not create directory \"%s\": %m", ts->new_dir);
}
}
/* OK, now handle the directory contents. */
- process_directory_recursively(ts->oid, ts->old_dir, ts->new_dir,
- NULL, n_prior_backups, prior_backup_dirs,
- manifests, mwriter, &opt);
+ process_directory_recursively(ts->oid, ts->old_dir, ts->new_dir, NULL,
+ n_prior_backups, prior_backup_dirs, manifests,
+ mwriter, &opt);
}
/* Finalize the backup_manifest, if we're generating one. */
if (mwriter != NULL)
- finalize_manifest(mwriter,
- manifests[n_prior_backups]->first_wal_range);
+ finalize_manifest(mwriter, manifests[n_prior_backups]->first_wal_range);
/* fsync that output directory unless we've been told not to do so */
if (!opt.no_sync)
@@ -414,7 +422,9 @@ add_tablespace_mapping(cb_options *opt, char *arg)
*dst_ptr++ = *arg_ptr;
}
if (!tsmap->old_dir[0] || !tsmap->new_dir[0])
- pg_fatal("invalid tablespace mapping format \"%s\", must be \"OLDDIR=NEWDIR\"", arg);
+ pg_fatal(
+ "invalid tablespace mapping format \"%s\", must be \"OLDDIR=NEWDIR\"",
+ arg);
/*
* All tablespaces are created with absolute directories, so specifying a
@@ -486,8 +496,8 @@ check_backup_label_files(int n_backups, char **backup_dirs)
pg_fatal("could not close \"%s\": %m", pathbuf);
/* Parse the file contents. */
- parse_backup_label(pathbuf, buf, &start_tli, &start_lsn,
- &previous_tli, &previous_lsn);
+ parse_backup_label(pathbuf, buf, &start_tli, &start_lsn, &previous_tli,
+ &previous_lsn);
/*
* Sanity checks.
@@ -498,18 +508,19 @@ check_backup_label_files(int n_backups, char **backup_dirs)
* we don't have that information.
*/
if (i > 0 && previous_tli == 0)
- pg_fatal("backup at \"%s\" is a full backup, but only the first backup should be a full backup",
+ pg_fatal("backup at \"%s\" is a full backup, but only the first backup "
+ "should be a full backup",
backup_dirs[i]);
if (i == 0 && previous_tli != 0)
- pg_fatal("backup at \"%s\" is an incremental backup, but the first backup should be a full backup",
+ pg_fatal("backup at \"%s\" is an incremental backup, but the first "
+ "backup should be a full backup",
backup_dirs[i]);
if (i < n_backups - 1 && start_tli != check_tli)
pg_fatal("backup at \"%s\" starts on timeline %u, but expected %u",
backup_dirs[i], start_tli, check_tli);
if (i < n_backups - 1 && start_lsn != check_lsn)
pg_fatal("backup at \"%s\" starts at LSN %X/%X, but expected %X/%X",
- backup_dirs[i],
- LSN_FORMAT_ARGS(start_lsn),
+ backup_dirs[i], LSN_FORMAT_ARGS(start_lsn),
LSN_FORMAT_ARGS(check_lsn));
check_tli = previous_tli;
check_lsn = previous_lsn;
@@ -561,8 +572,7 @@ check_control_files(int n_backups, char **backup_dirs)
/* Can't interpret control file if not current version. */
if (control_file->pg_control_version != PG_CONTROL_VERSION)
- pg_fatal("%s: unexpected control file version",
- controlpath);
+ pg_fatal("%s: unexpected control file version", controlpath);
/* System identifiers should all match. */
if (i == n_backups - 1)
@@ -688,14 +698,23 @@ help(const char *progname)
printf(_("\nOptions:\n"));
printf(_(" -d, --debug generate lots of debugging output\n"));
printf(_(" -n, --dry-run don't actually do anything\n"));
- printf(_(" -N, --no-sync do not wait for changes to be written safely to disk\n"));
+ printf(_(" -N, --no-sync do not wait for changes to be written "
+ "safely to disk\n"));
printf(_(" -o, --output output directory\n"));
- printf(_(" -T, --tablespace-mapping=OLDDIR=NEWDIR\n"
+ printf(_(
+ " -T, --tablespace-mapping=OLDDIR=NEWDIR\n"
" relocate tablespace in OLDDIR to NEWDIR\n"));
- printf(_(" --manifest-checksums=SHA{224,256,384,512}|CRC32C|NONE\n"
+ printf(
+ _(" --manifest-checksums=SHA{224,256,384,512}|CRC32C|NONE\n"
" use algorithm for manifest checksums\n"));
- printf(_(" --no-manifest suppress generation of backup manifest\n"));
- printf(_(" --sync-method=METHOD set method for syncing files to disk\n"));
+ printf(_(
+ " --no-manifest suppress generation of backup manifest\n"));
+ printf(
+ _(" --sync-method=METHOD set method for syncing files to disk\n"));
+ printf(_(" --clone clone (reflink) instead of copying "
+ "files\n"));
+ printf(
+ _(" --copy-file-range copy using copy_file_range() syscall\n"));
printf(_(" -?, --help show this help, then exit\n"));
printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
@@ -742,15 +761,10 @@ parse_oid(char *s, Oid *result)
* the locations of those previous backups.
*/
static void
-process_directory_recursively(Oid tsoid,
- char *input_directory,
- char *output_directory,
- char *relative_path,
- int n_prior_backups,
- char **prior_backup_dirs,
- manifest_data **manifests,
- manifest_writer *mwriter,
- cb_options *opt)
+process_directory_recursively(
+ Oid tsoid, char *input_directory, char *output_directory,
+ char *relative_path, int n_prior_backups, char **prior_backup_dirs,
+ manifest_data **manifests, manifest_writer *mwriter, cb_options *opt)
{
char ifulldir[MAXPGPATH];
char ofulldir[MAXPGPATH];
@@ -803,13 +817,11 @@ process_directory_recursively(Oid tsoid,
}
else
{
- snprintf(ifulldir, MAXPGPATH, "%s/%s", input_directory,
- relative_path);
- snprintf(ofulldir, MAXPGPATH, "%s/%s", output_directory,
- relative_path);
+ snprintf(ifulldir, MAXPGPATH, "%s/%s", input_directory, relative_path);
+ snprintf(ofulldir, MAXPGPATH, "%s/%s", output_directory, relative_path);
if (OidIsValid(tsoid))
- snprintf(manifest_prefix, MAXPGPATH, "pg_tblspc/%u/%s/",
- tsoid, relative_path);
+ snprintf(manifest_prefix, MAXPGPATH, "pg_tblspc/%u/%s/", tsoid,
+ relative_path);
else
snprintf(manifest_prefix, MAXPGPATH, "%s/", relative_path);
}
@@ -845,8 +857,7 @@ process_directory_recursively(Oid tsoid,
pg_checksum_context checksum_ctx;
/* Ignore "." and ".." entries. */
- if (strcmp(de->d_name, ".") == 0 ||
- strcmp(de->d_name, "..") == 0)
+ if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
continue;
/* Construct input path. */
@@ -882,11 +893,9 @@ process_directory_recursively(Oid tsoid,
de->d_name);
/* And recurse. */
- process_directory_recursively(tsoid,
- input_directory, output_directory,
- new_relative_path,
- n_prior_backups, prior_backup_dirs,
- manifests, mwriter, opt);
+ process_directory_recursively(tsoid, input_directory, output_directory,
+ new_relative_path, n_prior_backups,
+ prior_backup_dirs, manifests, mwriter, opt);
continue;
}
@@ -904,46 +913,37 @@ process_directory_recursively(Oid tsoid,
* Skip the backup_label and backup_manifest files; they require
* special handling and are handled elsewhere.
*/
- if (relative_path == NULL &&
- (strcmp(de->d_name, "backup_label") == 0 ||
- strcmp(de->d_name, "backup_manifest") == 0))
+ if (relative_path == NULL && (strcmp(de->d_name, "backup_label") == 0 ||
+ strcmp(de->d_name, "backup_manifest") == 0))
continue;
/*
* If it's an incremental file, hand it off to the reconstruction
* code, which will figure out what to do.
*/
- if (strncmp(de->d_name, INCREMENTAL_PREFIX,
- INCREMENTAL_PREFIX_LENGTH) == 0)
+ if (strncmp(de->d_name, INCREMENTAL_PREFIX, INCREMENTAL_PREFIX_LENGTH) ==
+ 0)
{
/* Output path should not include "INCREMENTAL." prefix. */
snprintf(ofullpath, MAXPGPATH, "%s/%s", ofulldir,
de->d_name + INCREMENTAL_PREFIX_LENGTH);
-
/* Manifest path likewise omits incremental prefix. */
snprintf(manifest_path, MAXPGPATH, "%s%s", manifest_prefix,
de->d_name + INCREMENTAL_PREFIX_LENGTH);
/* Reconstruction logic will do the rest. */
- reconstruct_from_incremental_file(ifullpath, ofullpath,
- relative_path,
- de->d_name + INCREMENTAL_PREFIX_LENGTH,
- n_prior_backups,
- prior_backup_dirs,
- manifests,
- manifest_path,
- checksum_type,
- &checksum_length,
- &checksum_payload,
- opt->debug,
- opt->dry_run);
+ reconstruct_from_incremental_file(
+ ifullpath, ofullpath, relative_path,
+ de->d_name + INCREMENTAL_PREFIX_LENGTH, n_prior_backups,
+ prior_backup_dirs, manifests, manifest_path, checksum_type,
+ &checksum_length, &checksum_payload, opt->debug, opt->dry_run,
+ opt->copy_method);
}
else
{
/* Construct the path that the backup_manifest will use. */
- snprintf(manifest_path, MAXPGPATH, "%s%s", manifest_prefix,
- de->d_name);
+ snprintf(manifest_path, MAXPGPATH, "%s%s", manifest_prefix, de->d_name);
/*
* It's not an incremental file, so we need to copy the entire
@@ -953,13 +953,11 @@ process_directory_recursively(Oid tsoid,
* backup_manifest for the final input directory, we can save some
* work by reusing that checksum instead of computing a new one.
*/
- if (checksum_type != CHECKSUM_TYPE_NONE &&
- latest_manifest != NULL)
+ if (checksum_type != CHECKSUM_TYPE_NONE && latest_manifest != NULL)
{
manifest_file *mfile;
- mfile = manifest_files_lookup(latest_manifest->files,
- manifest_path);
+ mfile = manifest_files_lookup(latest_manifest->files, manifest_path);
if (mfile == NULL)
{
char *bmpath;
@@ -968,10 +966,9 @@ process_directory_recursively(Oid tsoid,
* The directory is out of sync with the backup_manifest,
* so emit a warning.
*/
- bmpath = psprintf("%s/%s", input_directory,
- "backup_manifest");
- pg_log_warning("\"%s\" contains no entry for \"%s\"",
- bmpath, manifest_path);
+ bmpath = psprintf("%s/%s", input_directory, "backup_manifest");
+ pg_log_warning("\"%s\" contains no entry for \"%s\"", bmpath,
+ manifest_path);
pfree(bmpath);
}
else if (mfile->checksum_type == checksum_type)
@@ -993,7 +990,8 @@ process_directory_recursively(Oid tsoid,
/* Actually copy the file. */
snprintf(ofullpath, MAXPGPATH, "%s/%s", ofulldir, de->d_name);
- copy_file(ifullpath, ofullpath, &checksum_ctx, opt->dry_run);
+ copy_file(ifullpath, ofullpath, &checksum_ctx, opt->dry_run,
+ opt->copy_method);
/*
* If copy_file() performed a checksum calculation for us, then
@@ -1003,8 +1001,7 @@ process_directory_recursively(Oid tsoid,
if (checksum_ctx.type != CHECKSUM_TYPE_NONE && !opt->dry_run)
{
checksum_payload = pg_malloc(PG_CHECKSUM_MAX_LENGTH);
- checksum_length = pg_checksum_final(&checksum_ctx,
- checksum_payload);
+ checksum_length = pg_checksum_final(&checksum_ctx, checksum_payload);
}
}
@@ -1030,10 +1027,8 @@ process_directory_recursively(Oid tsoid,
pg_fatal("could not stat file \"%s\": %m", ofullpath);
/* OK, now do the work. */
- add_file_to_manifest(mwriter, manifest_path,
- sb.st_size, sb.st_mtime,
- checksum_type, checksum_length,
- checksum_payload);
+ add_file_to_manifest(mwriter, manifest_path, sb.st_size, sb.st_mtime,
+ checksum_type, checksum_length, checksum_payload);
}
/* Avoid leaking memory. */
@@ -1141,7 +1136,8 @@ reset_directory_cleanup_list(void)
* final backup in the backup chain.
*/
static cb_tablespace *
-scan_for_existing_tablespaces(char *pathname, cb_options *opt)
+scan_for_existing_tablespaces(char *pathname,
+ cb_options *opt)
{
char pg_tblspc[MAXPGPATH];
DIR *dir;
@@ -1174,7 +1170,8 @@ scan_for_existing_tablespaces(char *pathname, cb_options *opt)
/* Ignore any file name that doesn't look like a proper OID. */
if (!parse_oid(de->d_name, &oid))
{
- pg_log_debug("skipping \"%s\" because the filename is not a legal tablespace OID",
+ pg_log_debug(
+ "skipping \"%s\" because the filename is not a legal tablespace OID",
tblspcdir);
continue;
}
@@ -1185,7 +1182,8 @@ scan_for_existing_tablespaces(char *pathname, cb_options *opt)
exit(1);
if (type != PGFILETYPE_LNK && type != PGFILETYPE_DIR)
{
- pg_log_debug("skipping \"%s\" because it is neither a symbolic link nor a directory",
+ pg_log_debug("skipping \"%s\" because it is neither a symbolic link nor "
+ "a directory",
tblspcdir);
continue;
}
@@ -1205,8 +1203,7 @@ scan_for_existing_tablespaces(char *pathname, cb_options *opt)
/* Read the link target. */
link_length = readlink(tblspcdir, link_target, sizeof(link_target));
if (link_length < 0)
- pg_fatal("could not read symbolic link \"%s\": %m",
- tblspcdir);
+ pg_fatal("could not read symbolic link \"%s\": %m", tblspcdir);
if (link_length >= sizeof(link_target))
pg_fatal("symbolic link \"%s\" is too long", tblspcdir);
link_target[link_length] = '\0';
@@ -1233,8 +1230,7 @@ scan_for_existing_tablespaces(char *pathname, cb_options *opt)
/* Every non-in-place tablespace must be mapped. */
if (tsmap == NULL)
- pg_fatal("tablespace at \"%s\" has no tablespace mapping",
- link_target);
+ pg_fatal("tablespace at \"%s\" has no tablespace mapping", link_target);
}
else
{
diff --git a/src/bin/pg_combinebackup/reconstruct.c b/src/bin/pg_combinebackup/reconstruct.c
index 41f06bb26b5..4daff9c77be 100644
--- a/src/bin/pg_combinebackup/reconstruct.c
+++ b/src/bin/pg_combinebackup/reconstruct.c
@@ -46,20 +46,16 @@ typedef struct rfile
off_t highest_offset_read;
} rfile;
-static void debug_reconstruction(int n_source,
- rfile **sources,
- bool dry_run);
+static void debug_reconstruction(int n_source, rfile **sources, bool dry_run);
static unsigned find_reconstructed_block_length(rfile *s);
static rfile *make_incremental_rfile(char *filename);
static rfile *make_rfile(char *filename, bool missing_ok);
static void write_reconstructed_file(char *input_filename,
char *output_filename,
- unsigned block_length,
- rfile **sourcemap,
+ unsigned block_length, rfile **sourcemap,
off_t *offsetmap,
pg_checksum_context *checksum_ctx,
- bool debug,
- bool dry_run);
+ bool debug, bool dry_run);
static void read_bytes(rfile *rf, void *buffer, unsigned length);
/*
@@ -78,19 +74,13 @@ static void read_bytes(rfile *rf, void *buffer, unsigned length);
* an array of pathnames where those backups can be found.
*/
void
-reconstruct_from_incremental_file(char *input_filename,
- char *output_filename,
- char *relative_path,
- char *bare_file_name,
- int n_prior_backups,
- char **prior_backup_dirs,
- manifest_data **manifests,
- char *manifest_path,
- pg_checksum_type checksum_type,
- int *checksum_length,
- uint8 **checksum_payload,
- bool debug,
- bool dry_run)
+reconstruct_from_incremental_file(
+ char *input_filename, char *output_filename, char *relative_path,
+ char *bare_file_name, int n_prior_backups, char **prior_backup_dirs,
+ manifest_data **manifests, char *manifest_path,
+ pg_checksum_type checksum_type, int *checksum_length,
+ uint8 **checksum_payload, bool debug, bool dry_run,
+ CopyFileMethod copy_method)
{
rfile **source;
rfile *latest_source = NULL;
@@ -167,8 +157,8 @@ reconstruct_from_incremental_file(char *input_filename,
* Look for the full file in the previous backup. If not found, then
* look for an incremental file instead.
*/
- snprintf(source_filename, MAXPGPATH, "%s/%s/%s",
- prior_backup_dirs[sidx], relative_path, bare_file_name);
+ snprintf(source_filename, MAXPGPATH, "%s/%s/%s", prior_backup_dirs[sidx],
+ relative_path, bare_file_name);
if ((s = make_rfile(source_filename, true)) == NULL)
{
snprintf(source_filename, MAXPGPATH, "%s/%s/INCREMENTAL.%s",
@@ -231,8 +221,7 @@ reconstruct_from_incremental_file(char *input_filename,
{
uint64 expected_length;
- expected_length =
- (uint64) latest_source->truncation_block_length;
+ expected_length = (uint64) latest_source->truncation_block_length;
expected_length *= BLCKSZ;
if (expected_length == sb.st_size)
{
@@ -253,8 +242,7 @@ reconstruct_from_incremental_file(char *input_filename,
{
BlockNumber b = s->relative_block_numbers[i];
- if (b < latest_source->truncation_block_length &&
- sourcemap[b] == NULL)
+ if (b < latest_source->truncation_block_length && sourcemap[b] == NULL)
{
sourcemap[b] = s;
offsetmap[b] = s->header_length + (i * BLCKSZ);
@@ -283,16 +271,16 @@ reconstruct_from_incremental_file(char *input_filename,
manifest_path);
if (mfile == NULL)
{
- char *path = psprintf("%s/backup_manifest",
- prior_backup_dirs[copy_source_index]);
+ char *path =
+ psprintf("%s/backup_manifest", prior_backup_dirs[copy_source_index]);
/*
* The directory is out of sync with the backup_manifest, so emit
* a warning.
*/
- /*- translator: the first %s is a backup manifest file, the second is a file absent therein */
- pg_log_warning("\"%s\" contains no entry for \"%s\"",
- path,
+ /*- translator: the first %s is a backup manifest file, the second is a
+ * file absent therein */
+ pg_log_warning("\"%s\" contains no entry for \"%s\"", path,
manifest_path);
pfree(path);
}
@@ -300,8 +288,7 @@ reconstruct_from_incremental_file(char *input_filename,
{
*checksum_length = mfile->checksum_length;
*checksum_payload = pg_malloc(*checksum_length);
- memcpy(*checksum_payload, mfile->checksum_payload,
- *checksum_length);
+ memcpy(*checksum_payload, mfile->checksum_payload, *checksum_length);
checksum_type = CHECKSUM_TYPE_NONE;
}
}
@@ -318,13 +305,13 @@ reconstruct_from_incremental_file(char *input_filename,
* Otherwise, reconstruct.
*/
if (copy_source != NULL)
- copy_file(copy_source->filename, output_filename,
- &checksum_ctx, dry_run);
+ copy_file(copy_source->filename, output_filename, &checksum_ctx, dry_run,
+ copy_method);
else
{
- write_reconstructed_file(input_filename, output_filename,
- block_length, sourcemap, offsetmap,
- &checksum_ctx, debug, dry_run);
+ write_reconstructed_file(input_filename, output_filename, block_length,
+ sourcemap, offsetmap, &checksum_ctx, debug,
+ dry_run);
debug_reconstruction(n_prior_backups + 1, source, dry_run);
}
@@ -332,8 +319,7 @@ reconstruct_from_incremental_file(char *input_filename,
if (checksum_type != CHECKSUM_TYPE_NONE)
{
*checksum_payload = pg_malloc(PG_CHECKSUM_MAX_LENGTH);
- *checksum_length = pg_checksum_final(&checksum_ctx,
- *checksum_payload);
+ *checksum_length = pg_checksum_final(&checksum_ctx, *checksum_payload);
}
/*
@@ -378,11 +364,11 @@ debug_reconstruction(int n_source, rfile **sources, bool dry_run)
/* Debug logging. */
if (dry_run)
- pg_log_debug("would have read %u blocks from \"%s\"",
- s->num_blocks_read, s->filename);
+ pg_log_debug("would have read %u blocks from \"%s\"", s->num_blocks_read,
+ s->filename);
else
- pg_log_debug("read %u blocks from \"%s\"",
- s->num_blocks_read, s->filename);
+ pg_log_debug("read %u blocks from \"%s\"", s->num_blocks_read,
+ s->filename);
/*
* In dry-run mode, we don't actually try to read data from the file,
@@ -401,8 +387,7 @@ debug_reconstruction(int n_source, rfile **sources, bool dry_run)
pg_fatal("could not stat \"%s\": %m", s->filename);
if (sb.st_size < s->highest_offset_read)
pg_fatal("file \"%s\" is too short: expected %llu, found %llu",
- s->filename,
- (unsigned long long) s->highest_offset_read,
+ s->filename, (unsigned long long) s->highest_offset_read,
(unsigned long long) sb.st_size);
}
}
@@ -455,7 +440,8 @@ make_incremental_rfile(char *filename)
read_bytes(rf, &rf->truncation_block_length,
sizeof(rf->truncation_block_length));
if (rf->truncation_block_length > RELSEG_SIZE)
- pg_fatal("file \"%s\" has truncation block length %u in excess of segment size %u",
+ pg_fatal("file \"%s\" has truncation block length %u in excess of segment "
+ "size %u",
filename, rf->truncation_block_length, RELSEG_SIZE);
/* Read block numbers if there are any. */
@@ -522,12 +508,10 @@ read_bytes(rfile *rf, void *buffer, unsigned length)
static void
write_reconstructed_file(char *input_filename,
char *output_filename,
- unsigned block_length,
- rfile **sourcemap,
+ unsigned block_length, rfile **sourcemap,
off_t *offsetmap,
pg_checksum_context *checksum_ctx,
- bool debug,
- bool dry_run)
+ bool debug, bool dry_run)
{
int wfd = -1;
unsigned i;
@@ -570,8 +554,8 @@ write_reconstructed_file(char *input_filename,
if (current_block == start_of_range)
appendStringInfo(&debug_buf, " %u:zero", current_block);
else
- appendStringInfo(&debug_buf, " %u-%u:zero",
- start_of_range, current_block);
+ appendStringInfo(&debug_buf, " %u-%u:zero", start_of_range,
+ current_block);
}
else
{
@@ -603,8 +587,7 @@ write_reconstructed_file(char *input_filename,
/* Open the output file, except in dry_run mode. */
if (!dry_run &&
- (wfd = open(output_filename,
- O_RDWR | PG_BINARY | O_CREAT | O_EXCL,
+ (wfd = open(output_filename, O_RDWR | PG_BINARY | O_CREAT | O_EXCL,
pg_file_create_mode)) < 0)
pg_fatal("could not open file \"%s\": %m", output_filename);
@@ -621,8 +604,8 @@ write_reconstructed_file(char *input_filename,
else
{
s->num_blocks_read++;
- s->highest_offset_read = Max(s->highest_offset_read,
- offsetmap[i] + BLCKSZ);
+ s->highest_offset_read =
+ Max(s->highest_offset_read, offsetmap[i] + BLCKSZ);
}
/* Skip the rest of this in dry-run mode. */
@@ -649,9 +632,9 @@ write_reconstructed_file(char *input_filename,
if (rb < 0)
pg_fatal("could not read file \"%s\": %m", s->filename);
else
- pg_fatal("could not read file \"%s\": read only %d of %d bytes at offset %llu",
- s->filename, rb, BLCKSZ,
- (unsigned long long) offsetmap[i]);
+ pg_fatal("could not read file \"%s\": read only %d of %d bytes at "
+ "offset %llu",
+ s->filename, rb, BLCKSZ, (unsigned long long) offsetmap[i]);
}
}
@@ -667,8 +650,7 @@ write_reconstructed_file(char *input_filename,
/* Update the checksum computation. */
if (pg_checksum_update(checksum_ctx, buffer, BLCKSZ) < 0)
- pg_fatal("could not update checksum of file \"%s\"",
- output_filename);
+ pg_fatal("could not update checksum of file \"%s\"", output_filename);
}
/* Debugging output. */
diff --git a/src/bin/pg_combinebackup/reconstruct.h b/src/bin/pg_combinebackup/reconstruct.h
index 8e33a8a95a0..1fa734011bd 100644
--- a/src/bin/pg_combinebackup/reconstruct.h
+++ b/src/bin/pg_combinebackup/reconstruct.h
@@ -13,21 +13,17 @@
#ifndef RECONSTRUCT_H
#define RECONSTRUCT_H
+#include "c.h"
#include "common/checksum_helper.h"
+#include "common/file_utils.h"
#include "load_manifest.h"
-extern void reconstruct_from_incremental_file(char *input_filename,
- char *output_filename,
- char *relative_path,
- char *bare_file_name,
- int n_prior_backups,
- char **prior_backup_dirs,
- manifest_data **manifests,
- char *manifest_path,
- pg_checksum_type checksum_type,
- int *checksum_length,
- uint8 **checksum_payload,
- bool debug,
- bool dry_run);
+extern void reconstruct_from_incremental_file(
+ char *input_filename, char *output_filename, char *relative_path,
+ char *bare_file_name, int n_prior_backups, char **prior_backup_dirs,
+ manifest_data **manifests, char *manifest_path,
+ pg_checksum_type checksum_type, int *checksum_length,
+ uint8 **checksum_payload, bool debug, bool dry_run,
+ CopyFileMethod copy_method);
#endif
--
2.44.0
Hi Tomas,
I took a quick look at the remaining part adding copy_file_range to
pg_combinebackup. The patch no longer applies, so I had to rebase it.
Most of the issues were trivial, but I had to fix a couple missing
prototypes - I added them to copy_file.h/c, mostly.

0001 is the minimal rebase + those fixes
0002 has a couple review comments in copy_file, and it also undoes a lot
of unnecessary formatting changes (already pointed out by Peter a couple
days ago).
Thank you very much for this! As discussed privately, I'm not in
position right now to pursue this further at this late stage (at least
for v17, which would require an aggressive schedule). My plan was
more for v18 after Peter's email, due to other obligations. But if you
have cycles and want to continue, please do so without hesitation -
I'll try to chime in along the way to test and review for sure.
A couple review comments:
1) AFAIK opt_errinfo() returns pointer to the local "buf" variable.
2) I wonder if we even need opt_errinfo(). I'm not sure it actually
makes anything simpler.
Yes, as it stands it's broken (somehow I missed the gcc warning),
should be pg_malloc(). I hardly remember, but I wanted to avoid code
duplication. No strong opinion, maybe that's a different style, I'll
adapt as necessary.
3) I think it'd be nice to make CopyFileMethod more consistent with
transferMode in pg_upgrade.h (I mean, it seems wise to make the naming
more consistent, it's probably not worth unifying this somehow).4) I wonder how we came up with copying the files by 50 blocks, but I
now realize it's been like this before this patch. I only noticed
because the patch adds a comment before buffer_size calculation.
It looks like it was like that before pg_upgrade even was moved into
the core. 400kB is indeed bit strange value, so we can leave it as it
is or make the COPY_BUF_SIZ 128kb - see [1]https://eklitzke.org/efficient-file-copying-on-linux (i've double checked cp(1)
uses still 128kB today), or maybe just stick to something like 256 or
512 kBs.
5) I dislike the renaming of copy_file_blocks to pg_copyfile. The new
name is way more generic / less descriptive - it's clear it copies the
file block by block (well, in chunks). pg_copyfile is pretty vague.

6) This leaves behind copy_file_copyfile, which is now unused.
7) The patch reworks how combinebackup deals with alternative copy
implementations - instead of setting strategy_implementation and calling
that, the decisions now happen in pg_copyfile_offload with a lot of
conditions / ifdef / defined ... I find it pretty hard to understand and
reason about. I liked the strategy_implementation approach, as it forces
us to keep each method in a separate function.
Well, some context (maybe it was my mistake to continue in this
thread rather than starting a new one): my plan was 3-in-1: in the
original proposal (from Jan) to provide CoW as generic facility for
other to use - in src/common/file_utils.c as per
v3-0002-Confine-various-OS-copy-on-write-and-other-copy-a.patch - to
unify & confine CoW methods and their quirkiness between
pg_combinebackup and pg_upgrade and other potential CoW uses too. That
was before Thomas M. pushed CoW just for pg_upgrade as
d93627bcbe5001750e7611f0e637200e2d81dcff. I had this idea back then to
have pg_copyfile() [normal blocks copy] and
pg_copyfile_offload_supported(),
pg_copyfile_offload(PG_COPYFILE_IOCTL_FICLONE ,
PG_COPYFILE_COPY_FILE_RANGE,
PG_COPYFILE_who_has_idea_what_they_come_up_with_in_future). In your
version of the patch it's local to pg_combinebackup, so it might make
no sense after all. If you look at the pg_upgrade and pg_combinebackup
they both have code duplication with lots of those ifs/IFs (assuming
user wants to have it as drop-in [--clone/--copy/--copyfile] and
platform may / may not have it). I've even considered
--cow=ficlone|copy_file_range to sync both tools from CLI arguments
point of view, but that would break backwards compatibility, so I did
not do that.
Also there's a problem with pg_combinebackup's strategy_implementation
that it actually cannot on its own decide (I think) which CoW to use
or not. There were some longer discussions that settled on one thing
(for pg_upgrade): it's the user who is in control HOW the copy gets
done (due to potential issues in OS CoW() implementations where e.g.
if NFS would be involved on one side). See pg_upgrade
--clone/--copy/--copy-file-range/--sync-method options. I wanted to
stick to that, so pg_combinebackup also needs to give the same options
to the user.
That's was for the historical context, now you wrote "it's probably
not worth unifying this somehow" few sentences earlier, so my take is
the following: we can just concentrate on getting the
copy_file_range() and ioctl_ficlone to pg_combinebackup at the price
of duplicating some complexity for now (in short, to start with a clean
slate, it doesn't necessarily need to be my patch as base if we think
it's worthwhile for v17 - or stick to your reworked patch of mine).
Later (v18?) some bigger than this refactor could unify and move the
copy methods to some more central place (so then we would have sync as
there would be no doubling like you mentioned e.g.: pg_upgrade's enum
transferMode <-> patch enum CopyFileMethod.
So for now I'm +1 to renaming all the things as you want -- indeed
pg_copy* might not be a good fit in a localized version.
-J.
Here's a patch reworked along the lines from a couple days ago.
The primary goals were to add clone/copy_file_range while minimizing
unnecessary disruption, and overall cleanup of the patch. I'm not saying
it's committable, but I think the patch is much easier to understand.
The main change is that this abandons the idea of handling all possible
cases in a single function that looks like a maze of ifdefs, and instead
separates each case into its own function and the decision happens much
earlier. This is pretty much exactly what pg_upgrade does, BTW.
There's maybe an argument that these functions could be unified and
moved to a library in src/common - I can imagine doing that, but I don't
think it's required. The functions are pretty trivial wrappers, and it's
not like we expect many more callers. And there's probably stuff we'd
need to keep out of that library (e.g. the decision which copy/clone
methods are available / should be used or error reporting). So it
doesn't seem worth it, at least for now.
There's one question, though. As it stands, there's a bit of asymmetry
between handling CopyFile() on WIN32 and the clone/copy_file_range on
other platforms). On WIN32, we simply switch to CopyFile
automatically, if we don't need to calculate a checksum. But with the
other methods, we error out if the user requests those and we need to
calculate the checksum.
The asymmetry comes from the fact there's no option to request CopyFile
on WIN32, and we feel confident it's always the right thing to do (safe,
faster). We don't seem to know that for the other methods, so the user
has to explicitly request those. And if the user requests --clone, for
example, it'd be wrong to silently fallback to plain copy.
Still, I wonder if this might cause some undesirable issues during
restores. But I guess that's why we have --dry-run.
This asymmetry also shows a bit in the code - the CopyFile is coded and
called a bit differently from the other methods. FWIW I abandoned the
approach with "strategy" and just use a switch on CopyMode enum, just
like pg_upgrade does.
There's a couple more smaller changes:
- Addition of docs for --clone/--copy-file-range (shameless copy from
pg_upgrade docs).
- Removal of opt_errinfo - not only was it buggy, I think the code is
actually cleaner without it.
- Removal of EINTR retry condition from copy_file_range handling (this
is what Thomas ended up for pg_upgrade while committing that part).
Put together, this cuts the patch from ~40kB to ~15kB (most of this is
due to the cleanup of unnecessary whitespace changes, though).
I think to make this committable, this requires some review and testing,
ideally on a range of platforms.
One open question is how to allow testing this. For pg_upgrade we now
have PG_TEST_PG_UPGRADE_MODE, which can be set to e.g. "--clone". I
wonder if we should add PG_TEST_PG_COMBINEBACKUP_MODE ...
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
Attachments:
v20240322-0001-pg_combinebackup-allow-using-clone-copy_fi.patchtext/x-patch; charset=UTF-8; name=v20240322-0001-pg_combinebackup-allow-using-clone-copy_fi.patchDownload
From 7b6a6fe1b555647109caec2817f9200ac8fe9db9 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Tue, 19 Mar 2024 15:16:29 +0100
Subject: [PATCH v20240322] pg_combinebackup - allow using
clone/copy_file_range
---
doc/src/sgml/ref/pg_combinebackup.sgml | 34 +++++
src/bin/pg_combinebackup/copy_file.c | 157 +++++++++++++++-----
src/bin/pg_combinebackup/copy_file.h | 18 ++-
src/bin/pg_combinebackup/pg_combinebackup.c | 26 +++-
src/bin/pg_combinebackup/reconstruct.c | 5 +-
src/bin/pg_combinebackup/reconstruct.h | 5 +-
6 files changed, 202 insertions(+), 43 deletions(-)
diff --git a/doc/src/sgml/ref/pg_combinebackup.sgml b/doc/src/sgml/ref/pg_combinebackup.sgml
index 8a0a600c2b2..60a60e3fae6 100644
--- a/doc/src/sgml/ref/pg_combinebackup.sgml
+++ b/doc/src/sgml/ref/pg_combinebackup.sgml
@@ -185,6 +185,40 @@ PostgreSQL documentation
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><option>--clone</option></term>
+ <listitem>
+ <para>
+ Use efficient file cloning (also known as <quote>reflinks</quote> on
+ some systems) instead of copying files to the new cluster. This can
+ result in near-instantaneous copying of the data files, giving the
+ speed advantages of <option>-k</option>/<option>--link</option> while
+ leaving the old cluster untouched.
+ </para>
+
+ <para>
+ File cloning is only supported on some operating systems and file
+ systems. If it is selected but not supported, the
+ <application>pg_combinebackup</application> run will error. At present,
+ it is supported on Linux (kernel 4.5 or later) with Btrfs and XFS (on
+ file systems created with reflink support), and on macOS with APFS.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><option>--copy-file-range</option></term>
+ <listitem>
+ <para>
+ Use the <function>copy_file_range</function> system call for efficient
+ copying. On some file systems this gives results similar to
+ <option>--clone</option>, sharing physical disk blocks, while on others
+ it may still copy blocks, but do so via an optimized path. At present,
+ it is supported on Linux and FreeBSD.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry>
<term><option>-V</option></term>
<term><option>--version</option></term>
diff --git a/src/bin/pg_combinebackup/copy_file.c b/src/bin/pg_combinebackup/copy_file.c
index e6d2423278a..cd3b8447308 100644
--- a/src/bin/pg_combinebackup/copy_file.c
+++ b/src/bin/pg_combinebackup/copy_file.c
@@ -14,6 +14,7 @@
#include <copyfile.h>
#endif
#include <fcntl.h>
+#include <limits.h>
#include <sys/stat.h>
#include <unistd.h>
@@ -24,6 +25,10 @@
static void copy_file_blocks(const char *src, const char *dst,
pg_checksum_context *checksum_ctx);
+static void copy_file_clone(const char *src, const char *dst);
+
+static void copy_file_by_range(const char *src, const char *dst);
+
#ifdef WIN32
static void copy_file_copyfile(const char *src, const char *dst);
#endif
@@ -35,7 +40,8 @@ static void copy_file_copyfile(const char *src, const char *dst);
*/
void
copy_file(const char *src, const char *dst,
- pg_checksum_context *checksum_ctx, bool dry_run)
+ pg_checksum_context *checksum_ctx, bool dry_run,
+ CopyMode copy_mode)
{
/*
* In dry-run mode, we don't actually copy anything, nor do we read any
@@ -55,54 +61,70 @@ copy_file(const char *src, const char *dst,
* If we don't need to compute a checksum, then we can use any special
* operating system primitives that we know about to copy the file; this
* may be quicker than a naive block copy.
+ *
+ * On the other hand, if we need to compute a checksum, but the user
+ * requested a special copy method that does not support this, report
+ * this and fail.
+ *
+ * XXX Why do do this only for WIN32 and not for the other systems? Are
+ * there some reliability concerns/issues?
+ *
+ * XXX Maybe this should simply fall back to the basic copy method, and
+ * not fail the whole command?
*/
if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
{
- char *strategy_name = NULL;
- void (*strategy_implementation) (const char *, const char *) = NULL;
-
#ifdef WIN32
- strategy_name = "CopyFile";
- strategy_implementation = copy_file_copyfile;
+ copy_method = COPY_MODE_COPYFILE;
#endif
-
- if (strategy_name != NULL)
- {
- if (dry_run)
- pg_log_debug("would copy \"%s\" to \"%s\" using strategy %s",
- src, dst, strategy_name);
- else
- {
- pg_log_debug("copying \"%s\" to \"%s\" using strategy %s",
- src, dst, strategy_name);
- (*strategy_implementation) (src, dst);
- }
- return;
- }
+ }
+ else if (copy_mode != COPY_MODE_COPY)
+ {
+ /* XXX maybe silently fall back to simple copy? */
+ pg_fatal("copy method does not support checksums");
}
- /*
- * Fall back to the simple approach of reading and writing all the blocks,
- * feeding them into the checksum context as we go.
- */
+
if (dry_run)
{
- if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
- pg_log_debug("would copy \"%s\" to \"%s\"",
- src, dst);
- else
- pg_log_debug("would copy \"%s\" to \"%s\" and checksum with %s",
- src, dst, pg_checksum_type_name(checksum_ctx->type));
+ switch (copy_mode)
+ {
+ case COPY_MODE_CLONE:
+ pg_log_debug("would copy \"%s\" to \"%s\" (clone)", src, dst);
+ break;
+ case COPY_MODE_COPY:
+ pg_log_debug("would copy \"%s\" to \"%s\"", src, dst);
+ break;
+ case COPY_MODE_COPY_FILE_RANGE:
+ pg_log_debug("would copy \"%s\" to \"%s\" (copy_file_range)",
+ src, dst);
+#ifdef WIN32
+ case COPY_MODE_COPYFILE:
+ pg_log_debug("would copy \"%s\" to \"%s\" (copyfile)",
+ src, dst);
+ break;
+#endif
+ }
}
else
{
- if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
- pg_log_debug("copying \"%s\" to \"%s\"",
- src, dst);
- else
- pg_log_debug("copying \"%s\" to \"%s\" and checksumming with %s",
- src, dst, pg_checksum_type_name(checksum_ctx->type));
- copy_file_blocks(src, dst, checksum_ctx);
+ switch (copy_mode)
+ {
+ case COPY_MODE_CLONE:
+ copy_file_clone(src, dst);
+ break;
+ case COPY_MODE_COPY:
+ copy_file_blocks(src, dst, checksum_ctx);
+ break;
+ case COPY_MODE_COPY_FILE_RANGE:
+ copy_file_by_range(src, dst);
+ break;
+#ifdef WIN32
+ case COPY_MODE_COPYFILE:
+ copy_file_copyfile(src, dst);
+ break;
+#endif
+ }
}
}
@@ -156,6 +178,67 @@ copy_file_blocks(const char *src, const char *dst,
close(dest_fd);
}
+static void
+copy_file_clone(const char *src, const char *dest)
+{
+#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
+ if (copyfile(src, dest, NULL, COPYFILE_CLONE_FORCE) < 0)
+ pg_fatal("error while cloning file \"%s\" to \"%s\": %m", src, dest);
+#elif defined(__linux__) && defined(FICLONE)
+ {
+ if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
+ pg_fatal("could not open file \"%s\": %m", src);
+
+ if ((dest_fd = open(dest, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+ pg_file_create_mode)) < 0)
+ pg_fatal("could not create file \"%s\": %m", dest);
+
+ if (ioctl(dest_fd, FICLONE, src_fd) < 0)
+ {
+ int save_errno = errno;
+
+ unlink(dest);
+
+ pg_fatal("error while cloning file \"%s\" to \"%s\": %s",
+ src, dest);
+ }
+ }
+#else
+ pg_fatal("file cloning not supported on this platform");
+#endif
+}
+
+static void
+copy_file_by_range(const char *src, const char *dest)
+{
+#if defined(HAVE_COPY_FILE_RANGE)
+ int src_fd;
+ int dest_fd;
+ ssize_t nbytes;
+
+ if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
+ pg_fatal("could not open file \"%s\": %m", src);
+
+ if ((dest_fd = open(dest, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+ pg_file_create_mode)) < 0)
+ pg_fatal("could not create file \"%s\": %m", dest);
+
+ do
+ {
+ nbytes = copy_file_range(src_fd, NULL, dest_fd, NULL, SSIZE_MAX, 0);
+ if (nbytes < 0)
+ pg_fatal("error while copying file range from \"%s\" to \"%s\": %m",
+ src, dest);
+ } while (nbytes > 0);
+
+ close(src_fd);
+ close(dest_fd);
+#else
+ pg_fatal("copy_file_range not supported on this platform");
+#endif
+}
+
+/* XXX maybe this should do the check internally, same as the other functions? */
#ifdef WIN32
static void
copy_file_copyfile(const char *src, const char *dst)
diff --git a/src/bin/pg_combinebackup/copy_file.h b/src/bin/pg_combinebackup/copy_file.h
index 0f6bc09403f..3a1c5eb764f 100644
--- a/src/bin/pg_combinebackup/copy_file.h
+++ b/src/bin/pg_combinebackup/copy_file.h
@@ -11,9 +11,25 @@
#ifndef COPY_FILE_H
#define COPY_FILE_H
+#include "c.h"
#include "common/checksum_helper.h"
+#include "common/file_utils.h"
+
+/*
+ * Enumeration to denote copy modes
+ */
+typedef enum CopyMode
+{
+ COPY_MODE_CLONE,
+ COPY_MODE_COPY,
+ COPY_MODE_COPY_FILE_RANGE,
+#ifdef WIN32
+ COPY_MODE_COPYFILE,
+#endif
+} CopyMode;
extern void copy_file(const char *src, const char *dst,
- pg_checksum_context *checksum_ctx, bool dry_run);
+ pg_checksum_context *checksum_ctx, bool dry_run,
+ CopyMode copy_mode);
#endif /* COPY_FILE_H */
diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c
index 74f8be9eeac..fb5e51811bd 100644
--- a/src/bin/pg_combinebackup/pg_combinebackup.c
+++ b/src/bin/pg_combinebackup/pg_combinebackup.c
@@ -69,6 +69,7 @@ typedef struct cb_options
pg_checksum_type manifest_checksums;
bool no_manifest;
DataDirSyncMethod sync_method;
+ CopyMode copy_method;
} cb_options;
/*
@@ -129,6 +130,8 @@ main(int argc, char *argv[])
{"manifest-checksums", required_argument, NULL, 1},
{"no-manifest", no_argument, NULL, 2},
{"sync-method", required_argument, NULL, 3},
+ {"clone", no_argument, NULL, 4},
+ {"copy-file-range", no_argument, NULL, 5},
{NULL, 0, NULL, 0}
};
@@ -156,6 +159,7 @@ main(int argc, char *argv[])
memset(&opt, 0, sizeof(opt));
opt.manifest_checksums = CHECKSUM_TYPE_CRC32C;
opt.sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
+ opt.copy_method = COPY_MODE_COPY;
/* process command-line options */
while ((c = getopt_long(argc, argv, "dnNPo:T:",
@@ -192,6 +196,12 @@ main(int argc, char *argv[])
if (!parse_sync_method(optarg, &opt.sync_method))
exit(1);
break;
+ case 4:
+ opt.copy_method = COPY_MODE_CLONE;
+ break;
+ case 5:
+ opt.copy_method = COPY_MODE_COPY_FILE_RANGE;
+ break;
default:
/* getopt_long already emitted a complaint */
pg_log_error_hint("Try \"%s --help\" for more information.", progname);
@@ -213,6 +223,14 @@ main(int argc, char *argv[])
if (opt.no_manifest)
opt.manifest_checksums = CHECKSUM_TYPE_NONE;
+ /*
+ * We cannot provide file copy/clone offload in case when we need to
+ * calculate checksums
+ */
+ if (opt.copy_method != COPY_MODE_COPY && opt.manifest_checksums != CHECKSUM_TYPE_NONE)
+ pg_fatal("unable to use accelerated copy when manifest checksums "
+ "are to be calculated. Use --no-manifest");
+
/* Read the server version from the final backup. */
version = read_pg_version_file(argv[argc - 1]);
@@ -696,6 +714,8 @@ help(const char *progname)
" use algorithm for manifest checksums\n"));
printf(_(" --no-manifest suppress generation of backup manifest\n"));
printf(_(" --sync-method=METHOD set method for syncing files to disk\n"));
+ printf(_(" --clone clone (reflink) instead of copying files\n"));
+ printf(_(" --copy-file-range copy using copy_file_range() syscall\n"));
printf(_(" -?, --help show this help, then exit\n"));
printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
@@ -937,7 +957,8 @@ process_directory_recursively(Oid tsoid,
&checksum_length,
&checksum_payload,
opt->debug,
- opt->dry_run);
+ opt->dry_run,
+ opt->copy_method);
}
else
{
@@ -993,7 +1014,8 @@ process_directory_recursively(Oid tsoid,
/* Actually copy the file. */
snprintf(ofullpath, MAXPGPATH, "%s/%s", ofulldir, de->d_name);
- copy_file(ifullpath, ofullpath, &checksum_ctx, opt->dry_run);
+ copy_file(ifullpath, ofullpath, &checksum_ctx, opt->dry_run,
+ opt->copy_method);
/*
* If copy_file() performed a checksum calculation for us, then
diff --git a/src/bin/pg_combinebackup/reconstruct.c b/src/bin/pg_combinebackup/reconstruct.c
index 41f06bb26b5..f5c7af8a23c 100644
--- a/src/bin/pg_combinebackup/reconstruct.c
+++ b/src/bin/pg_combinebackup/reconstruct.c
@@ -90,7 +90,8 @@ reconstruct_from_incremental_file(char *input_filename,
int *checksum_length,
uint8 **checksum_payload,
bool debug,
- bool dry_run)
+ bool dry_run,
+ CopyMode copy_method)
{
rfile **source;
rfile *latest_source = NULL;
@@ -319,7 +320,7 @@ reconstruct_from_incremental_file(char *input_filename,
*/
if (copy_source != NULL)
copy_file(copy_source->filename, output_filename,
- &checksum_ctx, dry_run);
+ &checksum_ctx, dry_run, copy_method);
else
{
write_reconstructed_file(input_filename, output_filename,
diff --git a/src/bin/pg_combinebackup/reconstruct.h b/src/bin/pg_combinebackup/reconstruct.h
index 8e33a8a95a0..726f94389f3 100644
--- a/src/bin/pg_combinebackup/reconstruct.h
+++ b/src/bin/pg_combinebackup/reconstruct.h
@@ -13,7 +13,9 @@
#ifndef RECONSTRUCT_H
#define RECONSTRUCT_H
+#include "c.h"
#include "common/checksum_helper.h"
+#include "common/file_utils.h"
#include "load_manifest.h"
extern void reconstruct_from_incremental_file(char *input_filename,
@@ -28,6 +30,7 @@ extern void reconstruct_from_incremental_file(char *input_filename,
int *checksum_length,
uint8 **checksum_payload,
bool debug,
- bool dry_run);
+ bool dry_run,
+ CopyMode copy_mode);
#endif
--
2.44.0
On Fri, Mar 22, 2024 at 10:40 AM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:
There's one question, though. As it stands, there's a bit of asymmetry
between handling CopyFile() on WIN32 and the clone/copy_file_range on
other platforms). On WIN32, we simply automatically switch to CopyFile
automatically, if we don't need to calculate checksum. But with the
other methods, error out if the user requests those and we need to
calculate the checksum.
That seems completely broken. copy_file() needs to have the ability to
calculate a checksum if one is required; when one isn't required, it
can do whatever it likes. So we should always fall back to the
block-by-block method if we need a checksum. Whatever option the user
specified should only be applied when we don't need a checksum.
Consider, for example:
pg_basebackup -D sunday -c fast --manifest-checksums=CRC32C
pg_basebackup -D monday -c fast --manifest-checksums=SHA224
--incremental sunday/backup_manifest
pg_combinebackup sunday monday -o tuesday --manifest-checksums=CRC32C --clone
Any files that are copied in their entirety from Sunday's backup can
be cloned, if we have support for cloning. But any files copied from
Monday's backup will need to be re-checksummed, since the checksum
algorithms don't match. With what you're describing, it sounds like
pg_combinebackup would just fail in this case; I don't think that's
the behavior that the user is going to want.
--
Robert Haas
EDB: http://www.enterprisedb.com
On 3/22/24 17:42, Robert Haas wrote:
On Fri, Mar 22, 2024 at 10:40 AM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:There's one question, though. As it stands, there's a bit of asymmetry
between handling CopyFile() on WIN32 and the clone/copy_file_range on
other platforms). On WIN32, we simply automatically switch to CopyFile
automatically, if we don't need to calculate checksum. But with the
other methods, error out if the user requests those and we need to
calculate the checksum.That seems completely broken. copy_file() needs to have the ability to
calculate a checksum if one is required; when one isn't required, it
can do whatever it likes. So we should always fall back to the
block-by-block method if we need a checksum. Whatever option the user
specified should only be applied when we don't need a checksum.Consider, for example:
pg_basebackup -D sunday -c fast --manifest-checksums=CRC32C
pg_basebackup -D monday -c fast --manifest-checksums=SHA224
--incremental sunday/backup_manifest
pg_combinebackup sunday monday -o tuesday --manifest-checksums=CRC32C --cloneAny files that are copied in their entirety from Sunday's backup can
be cloned, if we have support for cloning. But any files copied from
Monday's backup will need to be re-checksummed, since the checksum
algorithms don't match. With what you're describing, it sounds like
pg_combinebackup would just fail in this case; I don't think that's
the behavior that the user is going to want.
Right, this will happen:
pg_combinebackup: error: unable to use accelerated copy when manifest
checksums are to be calculated. Use --no-manifest
Are you saying we should just silently override the copy method and do
the copy block by block? I'm not strongly opposed to that, but it feels
wrong to just ignore that the user explicitly requested cloning, and I'm
not sure why this should be different from any other case when the user
requests incompatible combination of options and/or options that are not
supported on the current configuration.
Why not just to tell the user to use the correct parameters, i.e. either
remove --clone or add --no-manifest?
FWIW I now realize it actually fails a bit earlier than I thought - when
parsing the options, not in copy_file. But then some checks (if a given
copy method is supported) happen in the copy functions. I wonder if it'd
be better/possible to do all of that in one place, not sure.
Also, the message only suggests to use --no-manifest. It probably should
suggest removing --clone too.
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Fri, Mar 22, 2024 at 1:22 PM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:
Right, this will happen:
pg_combinebackup: error: unable to use accelerated copy when manifest
checksums are to be calculated. Use --no-manifestAre you saying we should just silently override the copy method and do
the copy block by block?
Yes.
I'm not strongly opposed to that, but it feels
wrong to just ignore that the user explicitly requested cloning, and I'm
not sure why should this be different from any other case when the user
requests incompatible combination of options and/or options that are not
supported on the current configuration.
I don't feel like copying block-by-block when that's needed to compute
a checksum is ignoring what the user requested. I mean, if we'd had to
perform reconstruction rather than copying an entire file, we would
have done that regardless of whether --clone had been there, and I
don't see why the need-to-compute-a-checksum case is any different. I
think we should view a flag like --clone as specifying how to copy a
file when we don't need to do anything but copy it. I don't think it
should dictate that we're not allowed to perform other processing when
that other processing is required.
From my point of view, this is not a case of incompatible options
having been specified. If you run pg_basebackup with both
--format=p and --format=t, those are incompatible options; the backup
can be done one way or the other, but not both at once. But here
there's no such conflict. Block-by-block copying and fast-copying can
happen as part of the same operation, as in the example that I showed,
where some files need the block-by-block copying and some can be
fast-copied. The user is entitled to specify which fast-copying method
they would like to have used for the files where fast-copying is
possible without getting a failure just because it isn't possible for
every single file.
Or to say it the other way around, if there's 1 file that needs to be
copied block by block and 99 files that can be fast-copied, you want
to force the user to the block-by-block method for all 100 files. I
want to force the use of the block-by-block method for the 1 file
where that's the only valid method, and let them choose what they want
to do for the other 99.
--
Robert Haas
EDB: http://www.enterprisedb.com
Hmm, this discussion seems to assume that we only use
copy_file_range() to copy/clone whole segment files, right? That's
great and may even get most of the available benefit given typical
databases with many segments of old data that never changes, but... I
think copy_file_range() allows us to go further than the other
whole-file clone techniques: we can stitch together parts of an old
backup segment file and an incremental backup to create a new file.
If you're interested in minimising disk use while also removing
dependencies on the preceding chain of backups, then it might make
sense to do that even if you *also* have to read the data to compute
the checksums, I think? That's why I mentioned it: if
copy_file_range() (ie sub-file-level block sharing) is a solution in
search of a problem, has the world ever seen a better problem than
pg_combinebackup?
On Fri, Mar 22, 2024 at 8:26 PM Thomas Munro <thomas.munro@gmail.com> wrote:
Hmm, this discussion seems to assume that we only use
copy_file_range() to copy/clone whole segment files, right? That's
great and may even get most of the available benefit given typical
databases with many segments of old data that never changes, but... I
think copy_file_range() allows us to go further than the other
whole-file clone techniques: we can stitch together parts of an old
backup segment file and an incremental backup to create a new file.
If you're interested in minimising disk use while also removing
dependencies on the preceding chain of backups, then it might make
sense to do that even if you *also* have to read the data to compute
the checksums, I think? That's why I mentioned it: if
copy_file_range() (ie sub-file-level block sharing) is a solution in
search of a problem, has the world ever seen a better problem than
pg_combinebackup?
That makes sense; it's just a different part of the code than I
thought we were talking about.
--
Robert Haas
EDB: http://www.enterprisedb.com
On 3/22/24 19:40, Robert Haas wrote:
On Fri, Mar 22, 2024 at 1:22 PM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:Right, this will happen:
pg_combinebackup: error: unable to use accelerated copy when manifest
checksums are to be calculated. Use --no-manifestAre you saying we should just silently override the copy method and do
the copy block by block?Yes.
I'm not strongly opposed to that, but it feels
wrong to just ignore that the user explicitly requested cloning, and I'm
not sure why should this be different from any other case when the user
requests incompatible combination of options and/or options that are not
supported on the current configuration.I don't feel like copying block-by-block when that's needed to compute
a checksum is ignoring what the user requested. I mean, if we'd had to
perform reconstruction rather than copying an entire file, we would
have done that regardless of whether --clone had been there, and I
don't see why the need-to-compute-a-checksum case is any different. I
think we should view a flag like --clone as specifying how to copy a
file when we don't need to do anything but copy it. I don't think it
should dictate that we're not allowed to perform other processing when
that other processing is required.From my point of view, this is not a case of incompatible options
having been specified. If you specify run pg_basebackup with both
--format=p and --format=t, those are incompatible options; the backup
can be done one way or the other, but not both at once. But here
there's no such conflict. Block-by-block copying and fast-copying can
happen as part of the same operation, as in the example that I showed,
where some files need the block-by-block copying and some can be
fast-copied. The user is entitled to specify which fast-copying method
they would like to have used for the files where fast-copying is
possible without getting a failure just because it isn't possible for
every single file.Or to say it the other way around, if there's 1 file that needs to be
copied block by block and 99 files that can be fast-copied, you want
to force the user to the block-by-block method for all 100 files. I
want to force the use of the block-by-block method for the 1 file
where that's the only valid method, and let them choose what they want
to do for the other 99.
OK, that makes sense. Here's a patch that should work like this - in
copy_file we check if we need to calculate checksums, and either use the
requested copy method, or fall back to the block-by-block copy.
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
Attachments:
v20240323-0001-pg_combinebackup-allow-using-clone-copy_fi.patchtext/x-patch; charset=UTF-8; name=v20240323-0001-pg_combinebackup-allow-using-clone-copy_fi.patchDownload
From 558321b7ee10fa3902aaed1d1a08857865a232bb Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Tue, 19 Mar 2024 15:16:29 +0100
Subject: [PATCH v20240323] pg_combinebackup - allow using
clone/copy_file_range
---
doc/src/sgml/ref/pg_combinebackup.sgml | 34 +++++
src/bin/pg_combinebackup/copy_file.c | 156 +++++++++++++++-----
src/bin/pg_combinebackup/copy_file.h | 18 ++-
src/bin/pg_combinebackup/pg_combinebackup.c | 18 ++-
src/bin/pg_combinebackup/reconstruct.c | 5 +-
src/bin/pg_combinebackup/reconstruct.h | 5 +-
6 files changed, 192 insertions(+), 44 deletions(-)
diff --git a/doc/src/sgml/ref/pg_combinebackup.sgml b/doc/src/sgml/ref/pg_combinebackup.sgml
index 8a0a600c2b2..60a60e3fae6 100644
--- a/doc/src/sgml/ref/pg_combinebackup.sgml
+++ b/doc/src/sgml/ref/pg_combinebackup.sgml
@@ -185,6 +185,40 @@ PostgreSQL documentation
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><option>--clone</option></term>
+ <listitem>
+ <para>
+ Use efficient file cloning (also known as <quote>reflinks</quote> on
+ some systems) instead of copying files to the new cluster. This can
+ result in near-instantaneous copying of the data files, giving the
+ speed advantages of <option>-k</option>/<option>--link</option> while
+ leaving the old cluster untouched.
+ </para>
+
+ <para>
+ File cloning is only supported on some operating systems and file
+ systems. If it is selected but not supported, the
+ <application>pg_combinebackup</application> run will error. At present,
+ it is supported on Linux (kernel 4.5 or later) with Btrfs and XFS (on
+ file systems created with reflink support), and on macOS with APFS.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><option>--copy-file-range</option></term>
+ <listitem>
+ <para>
+ Use the <function>copy_file_range</function> system call for efficient
+ copying. On some file systems this gives results similar to
+ <option>--clone</option>, sharing physical disk blocks, while on others
+ it may still copy blocks, but do so via an optimized path. At present,
+ it is supported on Linux and FreeBSD.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry>
<term><option>-V</option></term>
<term><option>--version</option></term>
diff --git a/src/bin/pg_combinebackup/copy_file.c b/src/bin/pg_combinebackup/copy_file.c
index e6d2423278a..0874679e53a 100644
--- a/src/bin/pg_combinebackup/copy_file.c
+++ b/src/bin/pg_combinebackup/copy_file.c
@@ -14,6 +14,7 @@
#include <copyfile.h>
#endif
#include <fcntl.h>
+#include <limits.h>
#include <sys/stat.h>
#include <unistd.h>
@@ -24,6 +25,10 @@
static void copy_file_blocks(const char *src, const char *dst,
pg_checksum_context *checksum_ctx);
+static void copy_file_clone(const char *src, const char *dst);
+
+static void copy_file_by_range(const char *src, const char *dst);
+
#ifdef WIN32
static void copy_file_copyfile(const char *src, const char *dst);
#endif
@@ -35,7 +40,8 @@ static void copy_file_copyfile(const char *src, const char *dst);
*/
void
copy_file(const char *src, const char *dst,
- pg_checksum_context *checksum_ctx, bool dry_run)
+ pg_checksum_context *checksum_ctx, bool dry_run,
+ CopyMode copy_mode)
{
/*
* In dry-run mode, we don't actually copy anything, nor do we read any
@@ -54,55 +60,68 @@ copy_file(const char *src, const char *dst,
/*
* If we don't need to compute a checksum, then we can use any special
* operating system primitives that we know about to copy the file; this
- * may be quicker than a naive block copy.
+ * may be quicker than a naive block copy. We only do this for WIN32.
+ * On other operating systems the user has to explicitly specify one of
+ * the available primitives - there may be multiple, we don't know which
+ * are reliable/preferred.
+ *
+ * On the other hand, if we need to compute a checksum, but the user
+ * requested a special copy method that does not support this, fallback
+ * to the default block-by-block copy. We don't want to fail if just
+ * one of many files requires checksum, etc.
*/
if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
{
- char *strategy_name = NULL;
- void (*strategy_implementation) (const char *, const char *) = NULL;
-
#ifdef WIN32
- strategy_name = "CopyFile";
- strategy_implementation = copy_file_copyfile;
+ copy_mode = COPY_MODE_COPYFILE;
#endif
-
- if (strategy_name != NULL)
- {
- if (dry_run)
- pg_log_debug("would copy \"%s\" to \"%s\" using strategy %s",
- src, dst, strategy_name);
- else
- {
- pg_log_debug("copying \"%s\" to \"%s\" using strategy %s",
- src, dst, strategy_name);
- (*strategy_implementation) (src, dst);
- }
- return;
- }
+ }
+ else
+ {
+ /* fallback to block-by-block copy */
+ copy_mode = COPY_MODE_COPY;
}
- /*
- * Fall back to the simple approach of reading and writing all the blocks,
- * feeding them into the checksum context as we go.
- */
if (dry_run)
{
- if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
- pg_log_debug("would copy \"%s\" to \"%s\"",
- src, dst);
- else
- pg_log_debug("would copy \"%s\" to \"%s\" and checksum with %s",
- src, dst, pg_checksum_type_name(checksum_ctx->type));
+ switch (copy_mode)
+ {
+ case COPY_MODE_CLONE:
+ pg_log_debug("would copy \"%s\" to \"%s\" (clone)", src, dst);
+ break;
+ case COPY_MODE_COPY:
+ pg_log_debug("would copy \"%s\" to \"%s\"", src, dst);
+ break;
+ case COPY_MODE_COPY_FILE_RANGE:
+ pg_log_debug("would copy \"%s\" to \"%s\" (copy_file_range)",
+ src, dst);
+#ifdef WIN32
+ case COPY_MODE_COPYFILE:
+ pg_log_debug("would copy \"%s\" to \"%s\" (copyfile)",
+ src, dst);
+ break;
+#endif
+ }
}
else
{
- if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
- pg_log_debug("copying \"%s\" to \"%s\"",
- src, dst);
- else
- pg_log_debug("copying \"%s\" to \"%s\" and checksumming with %s",
- src, dst, pg_checksum_type_name(checksum_ctx->type));
- copy_file_blocks(src, dst, checksum_ctx);
+ switch (copy_mode)
+ {
+ case COPY_MODE_CLONE:
+ copy_file_clone(src, dst);
+ break;
+ case COPY_MODE_COPY:
+ copy_file_blocks(src, dst, checksum_ctx);
+ break;
+ case COPY_MODE_COPY_FILE_RANGE:
+ copy_file_by_range(src, dst);
+ break;
+#ifdef WIN32
+ case COPY_MODE_COPYFILE:
+ copy_file_copyfile(src, dst);
+ break;
+#endif
+ }
}
}
@@ -156,6 +175,67 @@ copy_file_blocks(const char *src, const char *dst,
close(dest_fd);
}
+static void
+copy_file_clone(const char *src, const char *dest)
+{
+#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
+ if (copyfile(src, dest, NULL, COPYFILE_CLONE_FORCE) < 0)
+ pg_fatal("error while cloning file \"%s\" to \"%s\": %m", src, dest);
+#elif defined(__linux__) && defined(FICLONE)
+ {
+ if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
+ pg_fatal("could not open file \"%s\": %m", src);
+
+ if ((dest_fd = open(dest, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+ pg_file_create_mode)) < 0)
+ pg_fatal("could not create file \"%s\": %m", dest);
+
+ if (ioctl(dest_fd, FICLONE, src_fd) < 0)
+ {
+ int save_errno = errno;
+
+ unlink(dest);
+
+ pg_fatal("error while cloning file \"%s\" to \"%s\": %s",
+ src, dest);
+ }
+ }
+#else
+ pg_fatal("file cloning not supported on this platform");
+#endif
+}
+
+static void
+copy_file_by_range(const char *src, const char *dest)
+{
+#if defined(HAVE_COPY_FILE_RANGE)
+ int src_fd;
+ int dest_fd;
+ ssize_t nbytes;
+
+ if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
+ pg_fatal("could not open file \"%s\": %m", src);
+
+ if ((dest_fd = open(dest, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+ pg_file_create_mode)) < 0)
+ pg_fatal("could not create file \"%s\": %m", dest);
+
+ do
+ {
+ nbytes = copy_file_range(src_fd, NULL, dest_fd, NULL, SSIZE_MAX, 0);
+ if (nbytes < 0)
+ pg_fatal("error while copying file range from \"%s\" to \"%s\": %m",
+ src, dest);
+ } while (nbytes > 0);
+
+ close(src_fd);
+ close(dest_fd);
+#else
+ pg_fatal("copy_file_range not supported on this platform");
+#endif
+}
+
+/* XXX maybe this should do the check internally, same as the other functions? */
#ifdef WIN32
static void
copy_file_copyfile(const char *src, const char *dst)
diff --git a/src/bin/pg_combinebackup/copy_file.h b/src/bin/pg_combinebackup/copy_file.h
index 0f6bc09403f..3a1c5eb764f 100644
--- a/src/bin/pg_combinebackup/copy_file.h
+++ b/src/bin/pg_combinebackup/copy_file.h
@@ -11,9 +11,25 @@
#ifndef COPY_FILE_H
#define COPY_FILE_H
+#include "c.h"
#include "common/checksum_helper.h"
+#include "common/file_utils.h"
+
+/*
+ * Enumeration to denote copy modes
+ */
+typedef enum CopyMode
+{
+ COPY_MODE_CLONE,
+ COPY_MODE_COPY,
+ COPY_MODE_COPY_FILE_RANGE,
+#ifdef WIN32
+ COPY_MODE_COPYFILE,
+#endif
+} CopyMode;
extern void copy_file(const char *src, const char *dst,
- pg_checksum_context *checksum_ctx, bool dry_run);
+ pg_checksum_context *checksum_ctx, bool dry_run,
+ CopyMode copy_mode);
#endif /* COPY_FILE_H */
diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c
index 74f8be9eeac..b6e1e62e160 100644
--- a/src/bin/pg_combinebackup/pg_combinebackup.c
+++ b/src/bin/pg_combinebackup/pg_combinebackup.c
@@ -69,6 +69,7 @@ typedef struct cb_options
pg_checksum_type manifest_checksums;
bool no_manifest;
DataDirSyncMethod sync_method;
+ CopyMode copy_method;
} cb_options;
/*
@@ -129,6 +130,8 @@ main(int argc, char *argv[])
{"manifest-checksums", required_argument, NULL, 1},
{"no-manifest", no_argument, NULL, 2},
{"sync-method", required_argument, NULL, 3},
+ {"clone", no_argument, NULL, 4},
+ {"copy-file-range", no_argument, NULL, 5},
{NULL, 0, NULL, 0}
};
@@ -156,6 +159,7 @@ main(int argc, char *argv[])
memset(&opt, 0, sizeof(opt));
opt.manifest_checksums = CHECKSUM_TYPE_CRC32C;
opt.sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
+ opt.copy_method = COPY_MODE_COPY;
/* process command-line options */
while ((c = getopt_long(argc, argv, "dnNPo:T:",
@@ -192,6 +196,12 @@ main(int argc, char *argv[])
if (!parse_sync_method(optarg, &opt.sync_method))
exit(1);
break;
+ case 4:
+ opt.copy_method = COPY_MODE_CLONE;
+ break;
+ case 5:
+ opt.copy_method = COPY_MODE_COPY_FILE_RANGE;
+ break;
default:
/* getopt_long already emitted a complaint */
pg_log_error_hint("Try \"%s --help\" for more information.", progname);
@@ -696,6 +706,8 @@ help(const char *progname)
" use algorithm for manifest checksums\n"));
printf(_(" --no-manifest suppress generation of backup manifest\n"));
printf(_(" --sync-method=METHOD set method for syncing files to disk\n"));
+ printf(_(" --clone clone (reflink) instead of copying files\n"));
+ printf(_(" --copy-file-range copy using copy_file_range() syscall\n"));
printf(_(" -?, --help show this help, then exit\n"));
printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
@@ -937,7 +949,8 @@ process_directory_recursively(Oid tsoid,
&checksum_length,
&checksum_payload,
opt->debug,
- opt->dry_run);
+ opt->dry_run,
+ opt->copy_method);
}
else
{
@@ -993,7 +1006,8 @@ process_directory_recursively(Oid tsoid,
/* Actually copy the file. */
snprintf(ofullpath, MAXPGPATH, "%s/%s", ofulldir, de->d_name);
- copy_file(ifullpath, ofullpath, &checksum_ctx, opt->dry_run);
+ copy_file(ifullpath, ofullpath, &checksum_ctx, opt->dry_run,
+ opt->copy_method);
/*
* If copy_file() performed a checksum calculation for us, then
diff --git a/src/bin/pg_combinebackup/reconstruct.c b/src/bin/pg_combinebackup/reconstruct.c
index 41f06bb26b5..f5c7af8a23c 100644
--- a/src/bin/pg_combinebackup/reconstruct.c
+++ b/src/bin/pg_combinebackup/reconstruct.c
@@ -90,7 +90,8 @@ reconstruct_from_incremental_file(char *input_filename,
int *checksum_length,
uint8 **checksum_payload,
bool debug,
- bool dry_run)
+ bool dry_run,
+ CopyMode copy_method)
{
rfile **source;
rfile *latest_source = NULL;
@@ -319,7 +320,7 @@ reconstruct_from_incremental_file(char *input_filename,
*/
if (copy_source != NULL)
copy_file(copy_source->filename, output_filename,
- &checksum_ctx, dry_run);
+ &checksum_ctx, dry_run, copy_method);
else
{
write_reconstructed_file(input_filename, output_filename,
diff --git a/src/bin/pg_combinebackup/reconstruct.h b/src/bin/pg_combinebackup/reconstruct.h
index 8e33a8a95a0..726f94389f3 100644
--- a/src/bin/pg_combinebackup/reconstruct.h
+++ b/src/bin/pg_combinebackup/reconstruct.h
@@ -13,7 +13,9 @@
#ifndef RECONSTRUCT_H
#define RECONSTRUCT_H
+#include "c.h"
#include "common/checksum_helper.h"
+#include "common/file_utils.h"
#include "load_manifest.h"
extern void reconstruct_from_incremental_file(char *input_filename,
@@ -28,6 +30,7 @@ extern void reconstruct_from_incremental_file(char *input_filename,
int *checksum_length,
uint8 **checksum_payload,
bool debug,
- bool dry_run);
+ bool dry_run,
+ CopyMode copy_mode);
#endif
--
2.44.0
On 3/23/24 13:38, Robert Haas wrote:
On Fri, Mar 22, 2024 at 8:26 PM Thomas Munro <thomas.munro@gmail.com> wrote:
Hmm, this discussion seems to assume that we only use
copy_file_range() to copy/clone whole segment files, right? That's
great and may even get most of the available benefit given typical
databases with many segments of old data that never changes, but... I
think copy_file_range() allows us to go further than the other
whole-file clone techniques: we can stitch together parts of an old
backup segment file and an incremental backup to create a new file.
If you're interested in minimising disk use while also removing
dependencies on the preceding chain of backups, then it might make
sense to do that even if you *also* have to read the data to compute
the checksums, I think? That's why I mentioned it: if
copy_file_range() (ie sub-file-level block sharing) is a solution in
search of a problem, has the world ever seen a better problem than
pg_combinebackup?That makes sense; it's just a different part of the code than I
thought we were talking about.
Yeah, that's in write_reconstructed_file() and the patch does not touch
that at all. I agree it would be nice to use copy_file_range() in this
part too, and it doesn't seem it'd be that hard to do, I think.
It seems we'd just need a "fork" that either calls pread/pwrite or
copy_file_range, depending on checksums and what was requested.
BTW is there a reason why the code calls "write" and not "pg_pwrite"?
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On 3/23/24 14:47, Tomas Vondra wrote:
On 3/23/24 13:38, Robert Haas wrote:
On Fri, Mar 22, 2024 at 8:26 PM Thomas Munro <thomas.munro@gmail.com> wrote:
Hmm, this discussion seems to assume that we only use
copy_file_range() to copy/clone whole segment files, right? That's
great and may even get most of the available benefit given typical
databases with many segments of old data that never changes, but... I
think copy_file_range() allows us to go further than the other
whole-file clone techniques: we can stitch together parts of an old
backup segment file and an incremental backup to create a new file.
If you're interested in minimising disk use while also removing
dependencies on the preceding chain of backups, then it might make
sense to do that even if you *also* have to read the data to compute
the checksums, I think? That's why I mentioned it: if
copy_file_range() (ie sub-file-level block sharing) is a solution in
search of a problem, has the world ever seen a better problem than
pg_combinebackup?That makes sense; it's just a different part of the code than I
thought we were talking about.Yeah, that's in write_reconstructed_file() and the patch does not touch
that at all. I agree it would be nice to use copy_file_range() in this
part too, and it doesn't seem it'd be that hard to do, I think.It seems we'd just need a "fork" that either calls pread/pwrite or
copy_file_range, depending on checksums and what was requested.
Here's a patch to use copy_file_range in write_reconstructed_file too,
when requested/possible. One thing that I'm not sure about is whether to
do pg_fatal() if --copy-file-range but the platform does not support it.
This is more like what pg_upgrade does, but maybe we should just ignore
what the user requested and fallback to the regular copy (a bit like
when having to do a checksum for some files). Or maybe the check should
just happen earlier ...
I've been thinking about what Thomas wrote - that maybe it'd be good to
do copy_file_range() even when calculating the checksum, and I think he
may be right. But the current patch does not do that, and while it
doesn't seem very difficult to do (at least when reconstructing the file
from incremental backups) I don't have a very good intuition whether
it'd be a win or not in typical cases.
I have a naive question about the checksumming - if we used a
merkle-tree-like scheme, i.e. hashing blocks and not hashes of blocks,
wouldn't that allow calculating the hashes even without having to read
the blocks, making copy_file_range more efficient? Sure, it's more
complex, but a well known scheme. (OK, now I realized it'd mean we can't
use tools like sha224sum to hash the files and compare to manifest. I
guess that's why we don't do this ...)
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
Attachments:
v20240323-2-0002-write_reconstructed_file.patchtext/x-patch; charset=UTF-8; name=v20240323-2-0002-write_reconstructed_file.patchDownload
From d2b717d14638632c25d0e6919f5cd40333e9ebd0 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Sat, 23 Mar 2024 18:26:21 +0100
Subject: [PATCH v20240323-2 2/2] write_reconstructed_file
---
src/bin/pg_combinebackup/reconstruct.c | 32 +++++++++++++++++++++++---
1 file changed, 29 insertions(+), 3 deletions(-)
diff --git a/src/bin/pg_combinebackup/reconstruct.c b/src/bin/pg_combinebackup/reconstruct.c
index f5c7af8a23c..4de92894bed 100644
--- a/src/bin/pg_combinebackup/reconstruct.c
+++ b/src/bin/pg_combinebackup/reconstruct.c
@@ -59,7 +59,8 @@ static void write_reconstructed_file(char *input_filename,
off_t *offsetmap,
pg_checksum_context *checksum_ctx,
bool debug,
- bool dry_run);
+ bool dry_run,
+ CopyMode copy_mode);
static void read_bytes(rfile *rf, void *buffer, unsigned length);
/*
@@ -325,7 +326,8 @@ reconstruct_from_incremental_file(char *input_filename,
{
write_reconstructed_file(input_filename, output_filename,
block_length, sourcemap, offsetmap,
- &checksum_ctx, debug, dry_run);
+ &checksum_ctx, debug, dry_run,
+ copy_method);
debug_reconstruction(n_prior_backups + 1, source, dry_run);
}
@@ -528,7 +530,8 @@ write_reconstructed_file(char *input_filename,
off_t *offsetmap,
pg_checksum_context *checksum_ctx,
bool debug,
- bool dry_run)
+ bool dry_run,
+ CopyMode copy_mode)
{
int wfd = -1;
unsigned i;
@@ -630,6 +633,29 @@ write_reconstructed_file(char *input_filename,
if (dry_run)
continue;
+ /*
+ * If requested, copy the block using copy_file_range.
+ *
+ * We can't do this if the block needs to be zero-filled or when we
+ * need to update checksum.
+ */
+ if ((copy_mode == COPY_MODE_COPY_FILE_RANGE) &&
+ (s != NULL) && (checksum_ctx->type == CHECKSUM_TYPE_NONE))
+ {
+#if defined(HAVE_COPY_FILE_RANGE)
+ do
+ {
+ wb = copy_file_range(s->fd, &offsetmap[i], wfd, NULL, BLCKSZ, 0);
+ if (wb < 0)
+ pg_fatal("error while copying file range from \"%s\" to \"%s\": %m",
+ input_filename, output_filename);
+ } while (wb > 0);
+#else
+ pg_fatal("copy_file_range not supported on this platform");
+#endif
+ continue;
+ }
+
/* Read or zero-fill the block as appropriate. */
if (s == NULL)
{
--
2.44.0
v20240323-2-0001-pg_combinebackup-allow-using-clone-copy_.patchtext/x-patch; charset=UTF-8; name=v20240323-2-0001-pg_combinebackup-allow-using-clone-copy_.patchDownload
From 558321b7ee10fa3902aaed1d1a08857865a232bb Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Tue, 19 Mar 2024 15:16:29 +0100
Subject: [PATCH v20240323-2 1/2] pg_combinebackup - allow using
clone/copy_file_range
---
doc/src/sgml/ref/pg_combinebackup.sgml | 34 +++++
src/bin/pg_combinebackup/copy_file.c | 156 +++++++++++++++-----
src/bin/pg_combinebackup/copy_file.h | 18 ++-
src/bin/pg_combinebackup/pg_combinebackup.c | 18 ++-
src/bin/pg_combinebackup/reconstruct.c | 5 +-
src/bin/pg_combinebackup/reconstruct.h | 5 +-
6 files changed, 192 insertions(+), 44 deletions(-)
diff --git a/doc/src/sgml/ref/pg_combinebackup.sgml b/doc/src/sgml/ref/pg_combinebackup.sgml
index 8a0a600c2b2..60a60e3fae6 100644
--- a/doc/src/sgml/ref/pg_combinebackup.sgml
+++ b/doc/src/sgml/ref/pg_combinebackup.sgml
@@ -185,6 +185,40 @@ PostgreSQL documentation
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><option>--clone</option></term>
+ <listitem>
+ <para>
+ Use efficient file cloning (also known as <quote>reflinks</quote> on
+ some systems) instead of copying files to the new cluster. This can
+ result in near-instantaneous copying of the data files, giving the
+ speed advantages of <option>-k</option>/<option>--link</option> while
+ leaving the old cluster untouched.
+ </para>
+
+ <para>
+ File cloning is only supported on some operating systems and file
+ systems. If it is selected but not supported, the
+ <application>pg_combinebackup</application> run will error. At present,
+ it is supported on Linux (kernel 4.5 or later) with Btrfs and XFS (on
+ file systems created with reflink support), and on macOS with APFS.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><option>--copy-file-range</option></term>
+ <listitem>
+ <para>
+ Use the <function>copy_file_range</function> system call for efficient
+ copying. On some file systems this gives results similar to
+ <option>--clone</option>, sharing physical disk blocks, while on others
+ it may still copy blocks, but do so via an optimized path. At present,
+ it is supported on Linux and FreeBSD.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry>
<term><option>-V</option></term>
<term><option>--version</option></term>
diff --git a/src/bin/pg_combinebackup/copy_file.c b/src/bin/pg_combinebackup/copy_file.c
index e6d2423278a..0874679e53a 100644
--- a/src/bin/pg_combinebackup/copy_file.c
+++ b/src/bin/pg_combinebackup/copy_file.c
@@ -14,6 +14,7 @@
#include <copyfile.h>
#endif
#include <fcntl.h>
+#include <limits.h>
#include <sys/stat.h>
#include <unistd.h>
@@ -24,6 +25,10 @@
static void copy_file_blocks(const char *src, const char *dst,
pg_checksum_context *checksum_ctx);
+static void copy_file_clone(const char *src, const char *dst);
+
+static void copy_file_by_range(const char *src, const char *dst);
+
#ifdef WIN32
static void copy_file_copyfile(const char *src, const char *dst);
#endif
@@ -35,7 +40,8 @@ static void copy_file_copyfile(const char *src, const char *dst);
*/
void
copy_file(const char *src, const char *dst,
- pg_checksum_context *checksum_ctx, bool dry_run)
+ pg_checksum_context *checksum_ctx, bool dry_run,
+ CopyMode copy_mode)
{
/*
* In dry-run mode, we don't actually copy anything, nor do we read any
@@ -54,55 +60,68 @@ copy_file(const char *src, const char *dst,
/*
* If we don't need to compute a checksum, then we can use any special
* operating system primitives that we know about to copy the file; this
- * may be quicker than a naive block copy.
+ * may be quicker than a naive block copy. We only do this for WIN32.
+ * On other operating systems the user has to explicitly specify one of
+ * the available primitives - there may be multiple, we don't know which
+ * are reliable/preferred.
+ *
+ * On the other hand, if we need to compute a checksum, but the user
+ * requested a special copy method that does not support this, fallback
+ * to the default block-by-block copy. We don't want to fail if just
+ * one of many files requires checksum, etc.
*/
if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
{
- char *strategy_name = NULL;
- void (*strategy_implementation) (const char *, const char *) = NULL;
-
#ifdef WIN32
- strategy_name = "CopyFile";
- strategy_implementation = copy_file_copyfile;
+ copy_mode = COPY_MODE_COPYFILE;
#endif
-
- if (strategy_name != NULL)
- {
- if (dry_run)
- pg_log_debug("would copy \"%s\" to \"%s\" using strategy %s",
- src, dst, strategy_name);
- else
- {
- pg_log_debug("copying \"%s\" to \"%s\" using strategy %s",
- src, dst, strategy_name);
- (*strategy_implementation) (src, dst);
- }
- return;
- }
+ }
+ else
+ {
+ /* fallback to block-by-block copy */
+ copy_mode = COPY_MODE_COPY;
}
- /*
- * Fall back to the simple approach of reading and writing all the blocks,
- * feeding them into the checksum context as we go.
- */
if (dry_run)
{
- if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
- pg_log_debug("would copy \"%s\" to \"%s\"",
- src, dst);
- else
- pg_log_debug("would copy \"%s\" to \"%s\" and checksum with %s",
- src, dst, pg_checksum_type_name(checksum_ctx->type));
+ switch (copy_mode)
+ {
+ case COPY_MODE_CLONE:
+ pg_log_debug("would copy \"%s\" to \"%s\" (clone)", src, dst);
+ break;
+ case COPY_MODE_COPY:
+ pg_log_debug("would copy \"%s\" to \"%s\"", src, dst);
+ break;
+ case COPY_MODE_COPY_FILE_RANGE:
+ pg_log_debug("would copy \"%s\" to \"%s\" (copy_file_range)",
+ src, dst);
+#ifdef WIN32
+ case COPY_MODE_COPYFILE:
+ pg_log_debug("would copy \"%s\" to \"%s\" (copyfile)",
+ src, dst);
+ break;
+#endif
+ }
}
else
{
- if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
- pg_log_debug("copying \"%s\" to \"%s\"",
- src, dst);
- else
- pg_log_debug("copying \"%s\" to \"%s\" and checksumming with %s",
- src, dst, pg_checksum_type_name(checksum_ctx->type));
- copy_file_blocks(src, dst, checksum_ctx);
+ switch (copy_mode)
+ {
+ case COPY_MODE_CLONE:
+ copy_file_clone(src, dst);
+ break;
+ case COPY_MODE_COPY:
+ copy_file_blocks(src, dst, checksum_ctx);
+ break;
+ case COPY_MODE_COPY_FILE_RANGE:
+ copy_file_by_range(src, dst);
+ break;
+#ifdef WIN32
+ case COPY_MODE_COPYFILE:
+ copy_file_copyfile(src, dst);
+ break;
+#endif
+ }
}
}
@@ -156,6 +175,67 @@ copy_file_blocks(const char *src, const char *dst,
close(dest_fd);
}
+static void
+copy_file_clone(const char *src, const char *dest)
+{
+#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
+ if (copyfile(src, dest, NULL, COPYFILE_CLONE_FORCE) < 0)
+ pg_fatal("error while cloning file \"%s\" to \"%s\": %m", src, dest);
+#elif defined(__linux__) && defined(FICLONE)
+ {
+ if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
+ pg_fatal("could not open file \"%s\": %m", src);
+
+ if ((dest_fd = open(dest, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+ pg_file_create_mode)) < 0)
+ pg_fatal("could not create file \"%s\": %m", dest);
+
+ if (ioctl(dest_fd, FICLONE, src_fd) < 0)
+ {
+ int save_errno = errno;
+
+ unlink(dest);
+
+ pg_fatal("error while cloning file \"%s\" to \"%s\": %s",
+ src, dest);
+ }
+ }
+#else
+ pg_fatal("file cloning not supported on this platform");
+#endif
+}
+
+static void
+copy_file_by_range(const char *src, const char *dest)
+{
+#if defined(HAVE_COPY_FILE_RANGE)
+ int src_fd;
+ int dest_fd;
+ ssize_t nbytes;
+
+ if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
+ pg_fatal("could not open file \"%s\": %m", src);
+
+ if ((dest_fd = open(dest, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+ pg_file_create_mode)) < 0)
+ pg_fatal("could not create file \"%s\": %m", dest);
+
+ do
+ {
+ nbytes = copy_file_range(src_fd, NULL, dest_fd, NULL, SSIZE_MAX, 0);
+ if (nbytes < 0)
+ pg_fatal("error while copying file range from \"%s\" to \"%s\": %m",
+ src, dest);
+ } while (nbytes > 0);
+
+ close(src_fd);
+ close(dest_fd);
+#else
+ pg_fatal("copy_file_range not supported on this platform");
+#endif
+}
+
+/* XXX maybe this should do the check internally, same as the other functions? */
#ifdef WIN32
static void
copy_file_copyfile(const char *src, const char *dst)
diff --git a/src/bin/pg_combinebackup/copy_file.h b/src/bin/pg_combinebackup/copy_file.h
index 0f6bc09403f..3a1c5eb764f 100644
--- a/src/bin/pg_combinebackup/copy_file.h
+++ b/src/bin/pg_combinebackup/copy_file.h
@@ -11,9 +11,25 @@
#ifndef COPY_FILE_H
#define COPY_FILE_H
+#include "c.h"
#include "common/checksum_helper.h"
+#include "common/file_utils.h"
+
+/*
+ * Enumeration to denote copy modes
+ */
+typedef enum CopyMode
+{
+ COPY_MODE_CLONE,
+ COPY_MODE_COPY,
+ COPY_MODE_COPY_FILE_RANGE,
+#ifdef WIN32
+ COPY_MODE_COPYFILE,
+#endif
+} CopyMode;
extern void copy_file(const char *src, const char *dst,
- pg_checksum_context *checksum_ctx, bool dry_run);
+ pg_checksum_context *checksum_ctx, bool dry_run,
+ CopyMode copy_mode);
#endif /* COPY_FILE_H */
diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c
index 74f8be9eeac..b6e1e62e160 100644
--- a/src/bin/pg_combinebackup/pg_combinebackup.c
+++ b/src/bin/pg_combinebackup/pg_combinebackup.c
@@ -69,6 +69,7 @@ typedef struct cb_options
pg_checksum_type manifest_checksums;
bool no_manifest;
DataDirSyncMethod sync_method;
+ CopyMode copy_method;
} cb_options;
/*
@@ -129,6 +130,8 @@ main(int argc, char *argv[])
{"manifest-checksums", required_argument, NULL, 1},
{"no-manifest", no_argument, NULL, 2},
{"sync-method", required_argument, NULL, 3},
+ {"clone", no_argument, NULL, 4},
+ {"copy-file-range", no_argument, NULL, 5},
{NULL, 0, NULL, 0}
};
@@ -156,6 +159,7 @@ main(int argc, char *argv[])
memset(&opt, 0, sizeof(opt));
opt.manifest_checksums = CHECKSUM_TYPE_CRC32C;
opt.sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
+ opt.copy_method = COPY_MODE_COPY;
/* process command-line options */
while ((c = getopt_long(argc, argv, "dnNPo:T:",
@@ -192,6 +196,12 @@ main(int argc, char *argv[])
if (!parse_sync_method(optarg, &opt.sync_method))
exit(1);
break;
+ case 4:
+ opt.copy_method = COPY_MODE_CLONE;
+ break;
+ case 5:
+ opt.copy_method = COPY_MODE_COPY_FILE_RANGE;
+ break;
default:
/* getopt_long already emitted a complaint */
pg_log_error_hint("Try \"%s --help\" for more information.", progname);
@@ -696,6 +706,8 @@ help(const char *progname)
" use algorithm for manifest checksums\n"));
printf(_(" --no-manifest suppress generation of backup manifest\n"));
printf(_(" --sync-method=METHOD set method for syncing files to disk\n"));
+ printf(_(" --clone clone (reflink) instead of copying files\n"));
+ printf(_(" --copy-file-range copy using copy_file_range() syscall\n"));
printf(_(" -?, --help show this help, then exit\n"));
printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
@@ -937,7 +949,8 @@ process_directory_recursively(Oid tsoid,
&checksum_length,
&checksum_payload,
opt->debug,
- opt->dry_run);
+ opt->dry_run,
+ opt->copy_method);
}
else
{
@@ -993,7 +1006,8 @@ process_directory_recursively(Oid tsoid,
/* Actually copy the file. */
snprintf(ofullpath, MAXPGPATH, "%s/%s", ofulldir, de->d_name);
- copy_file(ifullpath, ofullpath, &checksum_ctx, opt->dry_run);
+ copy_file(ifullpath, ofullpath, &checksum_ctx, opt->dry_run,
+ opt->copy_method);
/*
* If copy_file() performed a checksum calculation for us, then
diff --git a/src/bin/pg_combinebackup/reconstruct.c b/src/bin/pg_combinebackup/reconstruct.c
index 41f06bb26b5..f5c7af8a23c 100644
--- a/src/bin/pg_combinebackup/reconstruct.c
+++ b/src/bin/pg_combinebackup/reconstruct.c
@@ -90,7 +90,8 @@ reconstruct_from_incremental_file(char *input_filename,
int *checksum_length,
uint8 **checksum_payload,
bool debug,
- bool dry_run)
+ bool dry_run,
+ CopyMode copy_method)
{
rfile **source;
rfile *latest_source = NULL;
@@ -319,7 +320,7 @@ reconstruct_from_incremental_file(char *input_filename,
*/
if (copy_source != NULL)
copy_file(copy_source->filename, output_filename,
- &checksum_ctx, dry_run);
+ &checksum_ctx, dry_run, copy_method);
else
{
write_reconstructed_file(input_filename, output_filename,
diff --git a/src/bin/pg_combinebackup/reconstruct.h b/src/bin/pg_combinebackup/reconstruct.h
index 8e33a8a95a0..726f94389f3 100644
--- a/src/bin/pg_combinebackup/reconstruct.h
+++ b/src/bin/pg_combinebackup/reconstruct.h
@@ -13,7 +13,9 @@
#ifndef RECONSTRUCT_H
#define RECONSTRUCT_H
+#include "c.h"
#include "common/checksum_helper.h"
+#include "common/file_utils.h"
#include "load_manifest.h"
extern void reconstruct_from_incremental_file(char *input_filename,
@@ -28,6 +30,7 @@ extern void reconstruct_from_incremental_file(char *input_filename,
int *checksum_length,
uint8 **checksum_payload,
bool debug,
- bool dry_run);
+ bool dry_run,
+ CopyMode copy_mode);
#endif
--
2.44.0
On Sat, Mar 23, 2024 at 9:37 AM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:
OK, that makes sense. Here's a patch that should work like this - in
copy_file we check if we need to calculate checksums, and either use the
requested copy method, or fall back to the block-by-block copy.
+ Use efficient file cloning (also known as <quote>reflinks</quote> on
+ some systems) instead of copying files to the new cluster. This can
new cluster -> output directory
I think your version kind of messes up the debug logging. In my
version, every call to copy_file() would emit either "would copy
\"%s\" to \"%s\" using strategy %s" and "copying \"%s\" to \"%s\"
using strategy %s". In your version, the dry_run mode emits a string
similar to the former, but creates separate translatable strings for
each copy method instead of using the same one with a different value
of %s. In non-dry-run mode, I think your version loses the debug
logging altogether.
--
Robert Haas
EDB: http://www.enterprisedb.com
On Sat, Mar 23, 2024 at 9:47 AM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:
BTW is there a reason why the code calls "write" and not "pg_pwrite"?
I think it's mostly because I learned to code a really long time ago.
--
Robert Haas
EDB: http://www.enterprisedb.com
On Sat, Mar 23, 2024 at 6:57 PM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:
On 3/23/24 14:47, Tomas Vondra wrote:
On 3/23/24 13:38, Robert Haas wrote:
On Fri, Mar 22, 2024 at 8:26 PM Thomas Munro <thomas.munro@gmail.com> wrote:
[..]
Yeah, that's in write_reconstructed_file() and the patch does not touch
that at all. I agree it would be nice to use copy_file_range() in this
part too, and it doesn't seem it'd be that hard to do, I think.

It seems we'd just need a "fork" that either calls pread/pwrite or
copy_file_range, depending on checksums and what was requested.

Here's a patch to use copy_file_range in write_reconstructed_file too,
when requested/possible. One thing that I'm not sure about is whether to
do pg_fatal() if --copy-file-range but the platform does not support it.
[..]
Hi Tomas, so I gave a go to the below patches today:
- v20240323-2-0001-pg_combinebackup-allow-using-clone-copy_.patch
- v20240323-2-0002-write_reconstructed_file.patch
My assessment:
v20240323-2-0001-pg_combinebackup-allow-using-clone-copy_.patch -
looks like more or less good to go
v20240323-2-0002-write_reconstructed_file.patch - needs work and
without that clone/copy_file_range() good effects are unlikely
Given Debian 12, ~100GB DB, (pgbench -i -s 7000 , and some additional
tables with GiST and GIN indexes , just to see more WAL record types)
and with backups sizes in MB like that:
106831 full
2823 incr.1 # captured after some time with pgbench -R 100
165 incr.2 # captured after some time with pgbench -R 100
Test cmd: rm -rf outtest; sync; sync ; sync; echo 3 | sudo tee
/proc/sys/vm/drop_caches ; time /usr/pgsql17/bin/pg_combinebackup -o
outtest full incr.1 incr.2
Test results of various copies on small I/O constrained XFS device:
normal copy: 31m47.407s
--clone copy: error: file cloning not supported on this platform (it's
due #ifdef of having COPY_FILE_RANGE available)
--copy-file-range: aborted, as it was taking too long , I was
expecting it to accelerate, but it did not... obviously this is the
transparent failover in case of calculating checksums...
--manifest-checksums=NONE --copy-file-range: BUG, it keeps on appending
to just one file e.g. outtest/base/5/16427.29 with 200GB+ ?? and ended
up with ENOSPC [more on this later]
--manifest-checksums=NONE --copy-file-range without v20240323-2-0002: 27m23.887s
--manifest-checksums=NONE --copy-file-range with v20240323-2-0002 and
loop-fix: 5m1.986s but it creates corruption as it stands
Issues:
1. https://cirrus-ci.com/task/5937513653600256?logs=mingw_cross_warning#L327
complains about win32/mingw:
[15:47:27.184] In file included from copy_file.c:22:
[15:47:27.184] copy_file.c: In function ‘copy_file’:
[15:47:27.184] ../../../src/include/common/logging.h:134:6: error:
this statement may fall through [-Werror=implicit-fallthrough=]
[15:47:27.184] 134 | if (unlikely(__pg_log_level <= PG_LOG_DEBUG)) \
[15:47:27.184] | ^
[15:47:27.184] copy_file.c:96:5: note: in expansion of macro ‘pg_log_debug’
[15:47:27.184] 96 | pg_log_debug("would copy \"%s\" to \"%s\"
(copy_file_range)",
[15:47:27.184] | ^~~~~~~~~~~~
[15:47:27.184] copy_file.c:99:4: note: here
[15:47:27.184] 99 | case COPY_MODE_COPYFILE:
[15:47:27.184] | ^~~~
[15:47:27.184] cc1: all warnings being treated as errors
2. I do not know what's the consensus between --clone and
--copy-file-range , but if we have #ifdef FICLONE clone_works() #elif
HAVE_COPY_FILE_RANGE copy_file_range_only_works() then we should also
apply the same logic to the --help so that --clone is not visible
there (for consistency?). Also the "error: file cloning not supported
on this platform " is technically incorrect, Linux does support
ioctl(FICLONE) and copy_file_range(), but we are just not choosing one
over another (so technically it is "available"). Nitpicking I know.
3. [v20240323-2-0002-write_reconstructed_file.patch]: The mentioned
ENOSPACE spiral-of-death-bug symptoms are like that:
strace:
copy_file_range(8, [697671680], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697679872], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697688064], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697696256], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697704448], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697712640], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697720832], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697729024], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697737216], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697745408], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697753600], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697761792], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697769984], 9, NULL, 8192, 0) = 8192
Notice that dest_off_t (poutoff) is NULL.
(gdb) where
#0 0x00007f2cd56f6733 in copy_file_range (infd=8,
pinoff=pinoff@entry=0x7f2cd53f54e8, outfd=outfd@entry=9,
poutoff=poutoff@entry=0x0,
length=length@entry=8192, flags=flags@entry=0) at
../sysdeps/unix/sysv/linux/copy_file_range.c:28
#1 0x00005555ecd077f4 in write_reconstructed_file
(copy_mode=COPY_MODE_COPY_FILE_RANGE, dry_run=false, debug=true,
checksum_ctx=0x7ffc4cdb7700,
offsetmap=<optimized out>, sourcemap=0x7f2cd54f6010,
block_length=<optimized out>, output_filename=0x7ffc4cdba910
"outtest/base/5/16427.29",
input_filename=0x7ffc4cdba510
"incr.2/base/5/INCREMENTAL.16427.29") at reconstruct.c:648
#2 reconstruct_from_incremental_file
(input_filename=input_filename@entry=0x7ffc4cdba510
"incr.2/base/5/INCREMENTAL.16427.29",
output_filename=output_filename@entry=0x7ffc4cdba910
"outtest/base/5/16427.29",
relative_path=relative_path@entry=0x7ffc4cdbc670 "base/5",
bare_file_name=bare_file_name@entry=0x5555ee2056ef "16427.29",
n_prior_backups=n_prior_backups@entry=2,
prior_backup_dirs=prior_backup_dirs@entry=0x7ffc4cdbf248,
manifests=0x5555ee137a10, manifest_path=0x7ffc4cdbad10
"base/5/16427.29",
checksum_type=CHECKSUM_TYPE_NONE, checksum_length=0x7ffc4cdb9864,
checksum_payload=0x7ffc4cdb9868, debug=true, dry_run=false,
copy_method=COPY_MODE_COPY_FILE_RANGE) at reconstruct.c:327
.. it's a spiral of death till ENOSPC. Reverting the
v20240323-2-0002-write_reconstructed_file.patch helps. The problem
lies in that do-wb-infinity-loop (?) along with NULL for destination
off_t. This seems to solve that thingy(?):
- do
- {
- wb = copy_file_range(s->fd,
&offsetmap[i], wfd, NULL, BLCKSZ, 0);
+ //do
+ //{
+ wb = copy_file_range(s->fd,
&offsetmap[i], wfd, &offsetmap[i], BLCKSZ, 0);
if (wb < 0)
pg_fatal("error while copying
file range from \"%s\" to \"%s\": %m",
input_filename, output_filename);
- } while (wb > 0);
+ //} while (wb > 0);
#else
...so that way I've got it down to 5mins.
3. .. but on startup I've got this after trying psql login: invalid
page in block 0 of relation base/5/1259 . I've again reverted the
v20240323-2-0002 to see if that helped for next-round of
pg_combinebackup --manifest-checksums=NONE --copy-file-range and after
32mins of waiting it did help indeed: I was able to login and select
counts worked and matched properly the data. I've reapplied the
v20240323-2-0002 (with my fix to prevent that endless loop) and the
issue was again(!) there. Probably it's related to the destination
offset. I couldn't find more time to look on it today and the setup
was big 100GB on slow device, so just letting You know as fast as
possible.
4. More efficiency is on the table option (optional patch node ; just
for completeness; I dont think we have time for that? ): even if
v20240323-2-0002 would work, the problem is that it would be sending
syscall for every 8kB. We seem to be performing lots of per-8KB
syscalls which hinder performance (both in copy_file_range and in
normal copy):
pread64(8, ""..., 8192, 369115136) = 8192 // 369115136 + 8192 =
369123328 (matches next pread offset)
write(9, ""..., 8192) = 8192
pread64(8, ""..., 8192, 369123328) = 8192 // 369123328 + 8192 = 369131520
write(9, ""..., 8192) = 8192
pread64(8, ""..., 8192, 369131520) = 8192 // and so on
write(9, ""..., 8192) = 8192
Apparently there's no merging of adjacent IO/s, so pg_combinebackup
wastes lots of time on issuing instead small syscalls but it could
let's say do single pread/write (or even copy_file_range()). I think
it was not evident in my earlier testing (200GB; 39min vs ~40s) as I
had much smaller modifications in my incremental (think of 99% of
static data).
5. I think we should change the subject with new patch revision, so
that such functionality for incremental backups is not buried down in
the pg_upgrade thread ;)
-J.
On 3/26/24 15:09, Jakub Wartak wrote:
On Sat, Mar 23, 2024 at 6:57 PM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:On 3/23/24 14:47, Tomas Vondra wrote:
On 3/23/24 13:38, Robert Haas wrote:
On Fri, Mar 22, 2024 at 8:26 PM Thomas Munro <thomas.munro@gmail.com> wrote:
[..]
Yeah, that's in write_reconstructed_file() and the patch does not touch
that at all. I agree it would be nice to use copy_file_range() in this
part too, and it doesn't seem it'd be that hard to do, I think.

It seems we'd just need a "fork" that either calls pread/pwrite or
copy_file_range, depending on checksums and what was requested.

Here's a patch to use copy_file_range in write_reconstructed_file too,
when requested/possible. One thing that I'm not sure about is whether to
do pg_fatal() if --copy-file-range but the platform does not support it.

[..]
Hi Tomas, so I gave a go to the below patches today:
- v20240323-2-0001-pg_combinebackup-allow-using-clone-copy_.patch
- v20240323-2-0002-write_reconstructed_file.patch

My assessment:
v20240323-2-0001-pg_combinebackup-allow-using-clone-copy_.patch -
looks like more or less good to go
There's some issues with the --dry-run, pointed out by Robert. Should be
fixed in the attached version.
v20240323-2-0002-write_reconstructed_file.patch - needs work and
without that clone/copy_file_range() good effects are unlikely

Given Debian 12, ~100GB DB, (pgbench -i -s 7000 , and some additional
tables with GiST and GIN indexes , just to see more WAL record types)
and with backups sizes in MB like that:

106831 full
2823 incr.1 # captured after some time with pgbench -R 100
165 incr.2 # captured after some time with pgbench -R 100

Test cmd: rm -rf outtest; sync; sync ; sync; echo 3 | sudo tee
/proc/sys/vm/drop_caches ; time /usr/pgsql17/bin/pg_combinebackup -o
outtest full incr.1 incr.2

Test results of various copies on small I/O constrained XFS device:
normal copy: 31m47.407s
--clone copy: error: file cloning not supported on this platform (it's
due #ifdef of having COPY_FILE_RANGE available)
--copy-file-range: aborted, as it was taking too long , I was
expecting it to accelerate, but it did not... obviously this is the
transparent failover in case of calculating checksums...
--manifest-checksums=NONE --copy-file-range: BUG, it keeps on appending
to just one file e.g. outtest/base/5/16427.29 with 200GB+ ?? and ended
up with ENOSPC [more on this later]
That's really strange.
--manifest-checksums=NONE --copy-file-range without v20240323-2-0002: 27m23.887s
--manifest-checksums=NONE --copy-file-range with v20240323-2-0002 and
loop-fix: 5m1.986s but it creates corruption as it stands
Thanks. I plan to do more similar tests, once my machines get done with
some other stuff.
Issues:
1. https://cirrus-ci.com/task/5937513653600256?logs=mingw_cross_warning#L327
complains about win32/mingw:

[15:47:27.184] In file included from copy_file.c:22:
[15:47:27.184] copy_file.c: In function ‘copy_file’:
[15:47:27.184] ../../../src/include/common/logging.h:134:6: error:
this statement may fall through [-Werror=implicit-fallthrough=]
[15:47:27.184] 134 | if (unlikely(__pg_log_level <= PG_LOG_DEBUG)) \
[15:47:27.184] | ^
[15:47:27.184] copy_file.c:96:5: note: in expansion of macro ‘pg_log_debug’
[15:47:27.184] 96 | pg_log_debug("would copy \"%s\" to \"%s\"
(copy_file_range)",
[15:47:27.184] | ^~~~~~~~~~~~
[15:47:27.184] copy_file.c:99:4: note: here
[15:47:27.184] 99 | case COPY_MODE_COPYFILE:
[15:47:27.184] | ^~~~
[15:47:27.184] cc1: all warnings being treated as errors
Yup, missing break.
2. I do not know what's the consensus between --clone and
--copy-file-range , but if we have #ifdef FICLONE clone_works() #elif
HAVE_COPY_FILE_RANGE copy_file_range_only_works() then we should also
apply the same logic to the --help so that --clone is not visible
there (for consistency?). Also the "error: file cloning not supported
on this platform " is technically incorrect, Linux does support
ioctl(FICLONE) and copy_file_range(), but we are just not choosing one
over another (so technically it is "available"). Nitpicking I know.
That's a good question, I'm not sure. But whatever we do, we should do
the same thing in pg_upgrade. Maybe there's some sort of precedent?
3. [v20240323-2-0002-write_reconstructed_file.patch]: The mentioned
ENOSPACE spiral-of-death-bug symptoms are like that:

strace:
copy_file_range(8, [697671680], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697679872], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697688064], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697696256], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697704448], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697712640], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697720832], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697729024], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697737216], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697745408], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697753600], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697761792], 9, NULL, 8192, 0) = 8192
copy_file_range(8, [697769984], 9, NULL, 8192, 0) = 8192

Notice that dest_off_t (poutoff) is NULL.
(gdb) where
#0 0x00007f2cd56f6733 in copy_file_range (infd=8,
pinoff=pinoff@entry=0x7f2cd53f54e8, outfd=outfd@entry=9,
poutoff=poutoff@entry=0x0,
length=length@entry=8192, flags=flags@entry=0) at
../sysdeps/unix/sysv/linux/copy_file_range.c:28
#1 0x00005555ecd077f4 in write_reconstructed_file
(copy_mode=COPY_MODE_COPY_FILE_RANGE, dry_run=false, debug=true,
checksum_ctx=0x7ffc4cdb7700,
offsetmap=<optimized out>, sourcemap=0x7f2cd54f6010,
block_length=<optimized out>, output_filename=0x7ffc4cdba910
"outtest/base/5/16427.29",
input_filename=0x7ffc4cdba510
"incr.2/base/5/INCREMENTAL.16427.29") at reconstruct.c:648
#2 reconstruct_from_incremental_file
(input_filename=input_filename@entry=0x7ffc4cdba510
"incr.2/base/5/INCREMENTAL.16427.29",
output_filename=output_filename@entry=0x7ffc4cdba910
"outtest/base/5/16427.29",
relative_path=relative_path@entry=0x7ffc4cdbc670 "base/5",
bare_file_name=bare_file_name@entry=0x5555ee2056ef "16427.29",
n_prior_backups=n_prior_backups@entry=2,
prior_backup_dirs=prior_backup_dirs@entry=0x7ffc4cdbf248,
manifests=0x5555ee137a10, manifest_path=0x7ffc4cdbad10
"base/5/16427.29",
checksum_type=CHECKSUM_TYPE_NONE, checksum_length=0x7ffc4cdb9864,
checksum_payload=0x7ffc4cdb9868, debug=true, dry_run=false,
copy_method=COPY_MODE_COPY_FILE_RANGE) at reconstruct.c:327

.. it's a spiral of death till ENOSPC. Reverting the
v20240323-2-0002-write_reconstructed_file.patch helps. The problem
lies in that do-wb-infinity-loop (?) along with NULL for destination
off_t. This seems to solve that thingy(?):

-        do
-        {
-            wb = copy_file_range(s->fd, &offsetmap[i], wfd, NULL, BLCKSZ, 0);
+        //do
+        //{
+            wb = copy_file_range(s->fd, &offsetmap[i], wfd, &offsetmap[i], BLCKSZ, 0);
             if (wb < 0)
                 pg_fatal("error while copying file range from \"%s\" to \"%s\": %m",
                          input_filename, output_filename);
-        } while (wb > 0);
+        //} while (wb > 0);
 #else

...so that way I've got it down to 5mins.
Yeah, that retry logic is wrong. I ended up copying the check from the
"regular" copy branch, which simply bails out if copy_file_range returns
anything but the expected 8192.
I wonder if this should deal with partial writes, though. I mean, it's
allowed copy_file_range() copies only some of the bytes - I don't know
how often / in what situations that happens, though ... And if we want
to handle that for copy_file_range(), pwrite() needs the same treatment.
3. .. but on startup I've got this after trying psql login: invalid
page in block 0 of relation base/5/1259 . I've again reverted the
v20240323-2-0002 to see if that helped for next-round of
pg_combinebackup --manifest-checksums=NONE --copy-file-range and after
32mins of waiting it did help indeed: I was able to login and select
counts worked and matched properly the data. I've reapplied the
v20240323-2-0002 (with my fix to prevent that endless loop) and the
issue was again(!) there. Probably it's related to the destination
offset. I couldn't find more time to look on it today and the setup
was big 100GB on slow device, so just letting You know as fast as
possible.
Can you see if you can still reproduce this with the attached version?
4. More efficiency is on the table option (optional patch node ; just
for completeness; I dont think we have time for that? ): even if
v20240323-2-0002 would work, the problem is that it would be sending
syscall for every 8kB. We seem to be performing lots of per-8KB
syscalls which hinder performance (both in copy_file_range and in
normal copy):

pread64(8, ""..., 8192, 369115136) = 8192 // 369115136 + 8192 =
369123328 (matches next pread offset)
write(9, ""..., 8192) = 8192
pread64(8, ""..., 8192, 369123328) = 8192 // 369123328 + 8192 = 369131520
write(9, ""..., 8192) = 8192
pread64(8, ""..., 8192, 369131520) = 8192 // and so on
write(9, ""..., 8192) = 8192

Apparently there's no merging of adjacent IO/s, so pg_combinebackup
wastes lots of time on issuing instead small syscalls but it could
let's say do single pread/write (or even copy_file_range()). I think
it was not evident in my earlier testing (200GB; 39min vs ~40s) as I
had much smaller modifications in my incremental (think of 99% of
static data).
Yes, I've been thinking about exactly this optimization, but I think
we're way past proposing this for PG17. The changes that would require
in reconstruct_from_incremental_file are way too significant. Has to
wait for PG18 ;-)
I do think there's more on the table, as mentioned by Thomas a couple
days ago - maybe we shouldn't approach clone/copy_file_range merely as
an optimization to save time, it might be entirely reasonable to do this
simply to allow the filesystem to do CoW magic and save space (even if
we need to read the data and recalculate the checksum, which now
disables these copy methods).
5. I think we should change the subject with new patch revision, so
that such functionality for incremental backups is not buried down in
the pg_upgrade thread ;)
OK.
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
Attachments:
v20240326-0001-pg_combinebackup-allow-using-clone-copy_fi.patchtext/x-patch; charset=UTF-8; name=v20240326-0001-pg_combinebackup-allow-using-clone-copy_fi.patchDownload
From 13387b49b33cdb2a16c3d336368cd48c79f4dc76 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Tue, 19 Mar 2024 15:16:29 +0100
Subject: [PATCH v20240326 1/2] pg_combinebackup - allow using
clone/copy_file_range
---
doc/src/sgml/ref/pg_combinebackup.sgml | 34 ++++
src/bin/pg_combinebackup/copy_file.c | 166 ++++++++++++++++----
src/bin/pg_combinebackup/copy_file.h | 18 ++-
src/bin/pg_combinebackup/pg_combinebackup.c | 18 ++-
src/bin/pg_combinebackup/reconstruct.c | 5 +-
src/bin/pg_combinebackup/reconstruct.h | 5 +-
6 files changed, 206 insertions(+), 40 deletions(-)
diff --git a/doc/src/sgml/ref/pg_combinebackup.sgml b/doc/src/sgml/ref/pg_combinebackup.sgml
index 8a0a600c2b2..60a60e3fae6 100644
--- a/doc/src/sgml/ref/pg_combinebackup.sgml
+++ b/doc/src/sgml/ref/pg_combinebackup.sgml
@@ -185,6 +185,40 @@ PostgreSQL documentation
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><option>--clone</option></term>
+ <listitem>
+ <para>
+ Use efficient file cloning (also known as <quote>reflinks</quote> on
+ some systems) instead of copying files to the new cluster. This can
+ result in near-instantaneous copying of the data files, giving the
+ speed advantages of <option>-k</option>/<option>--link</option> while
+ leaving the old cluster untouched.
+ </para>
+
+ <para>
+ File cloning is only supported on some operating systems and file
+ systems. If it is selected but not supported, the
+ <application>pg_combinebackup</application> run will error. At present,
+ it is supported on Linux (kernel 4.5 or later) with Btrfs and XFS (on
+ file systems created with reflink support), and on macOS with APFS.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term><option>--copy-file-range</option></term>
+ <listitem>
+ <para>
+ Use the <function>copy_file_range</function> system call for efficient
+ copying. On some file systems this gives results similar to
+ <option>--clone</option>, sharing physical disk blocks, while on others
+ it may still copy blocks, but do so via an optimized path. At present,
+ it is supported on Linux and FreeBSD.
+ </para>
+ </listitem>
+ </varlistentry>
+
<varlistentry>
<term><option>-V</option></term>
<term><option>--version</option></term>
diff --git a/src/bin/pg_combinebackup/copy_file.c b/src/bin/pg_combinebackup/copy_file.c
index e6d2423278a..a690ecb8e12 100644
--- a/src/bin/pg_combinebackup/copy_file.c
+++ b/src/bin/pg_combinebackup/copy_file.c
@@ -14,6 +14,7 @@
#include <copyfile.h>
#endif
#include <fcntl.h>
+#include <limits.h>
#include <sys/stat.h>
#include <unistd.h>
@@ -24,6 +25,10 @@
static void copy_file_blocks(const char *src, const char *dst,
pg_checksum_context *checksum_ctx);
+static void copy_file_clone(const char *src, const char *dst);
+
+static void copy_file_by_range(const char *src, const char *dst);
+
#ifdef WIN32
static void copy_file_copyfile(const char *src, const char *dst);
#endif
@@ -35,8 +40,11 @@ static void copy_file_copyfile(const char *src, const char *dst);
*/
void
copy_file(const char *src, const char *dst,
- pg_checksum_context *checksum_ctx, bool dry_run)
+ pg_checksum_context *checksum_ctx, bool dry_run,
+ CopyMode copy_mode)
{
+ char *strategy_name = NULL;
+
/*
* In dry-run mode, we don't actually copy anything, nor do we read any
* data from the source file, but we do verify that we can open it.
@@ -52,57 +60,86 @@ copy_file(const char *src, const char *dst,
}
/*
+ *
+ * If we need to compute a checksum, but the user perhaps requested
+ * a special copy method that does not support this, fallback to the
+ * default block-by-block copy. We don't want to fail if just one of
+ * many files requires checksum, etc.
+ *
* If we don't need to compute a checksum, then we can use any special
* operating system primitives that we know about to copy the file; this
- * may be quicker than a naive block copy.
+ * may be quicker than a naive block copy. We only do this for WIN32.
+ * On other operating systems the user has to explicitly specify one of
+ * the available primitives - there may be multiple, we don't know which
+ * are reliable/preferred.
*/
- if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
+ if (checksum_ctx->type != CHECKSUM_TYPE_NONE)
{
- char *strategy_name = NULL;
- void (*strategy_implementation) (const char *, const char *) = NULL;
-
+ /* fallback to block-by-block copy */
+ copy_mode = COPY_MODE_COPY;
+ }
#ifdef WIN32
- strategy_name = "CopyFile";
- strategy_implementation = copy_file_copyfile;
+ else
+ {
+ copy_mode = COPY_MODE_COPYFILE;
+ }
#endif
- if (strategy_name != NULL)
- {
- if (dry_run)
- pg_log_debug("would copy \"%s\" to \"%s\" using strategy %s",
- src, dst, strategy_name);
- else
- {
- pg_log_debug("copying \"%s\" to \"%s\" using strategy %s",
- src, dst, strategy_name);
- (*strategy_implementation) (src, dst);
- }
- return;
- }
+ /* Determine the name of the copy strategy for use in log messages. */
+ switch (copy_mode)
+ {
+ case COPY_MODE_CLONE:
+ strategy_name = "clone";
+ break;
+ case COPY_MODE_COPY:
+ /* leave NULL for simple block-by-block copy */
+ break;
+ case COPY_MODE_COPY_FILE_RANGE:
+ strategy_name = "copy_file_range";
+ break;
+#ifdef WIN32
+ case COPY_MODE_COPYFILE:
+ strategy_name = "CopyFile";
+ break;
+#endif
}
- /*
- * Fall back to the simple approach of reading and writing all the blocks,
- * feeding them into the checksum context as we go.
- */
if (dry_run)
{
- if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
+ if (strategy_name)
+ pg_log_debug("would copy \"%s\" to \"%s\" using strategy %s",
+ src, dst, strategy_name);
+ else
pg_log_debug("would copy \"%s\" to \"%s\"",
src, dst);
- else
- pg_log_debug("would copy \"%s\" to \"%s\" and checksum with %s",
- src, dst, pg_checksum_type_name(checksum_ctx->type));
}
else
{
- if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
+
+ if (strategy_name)
+ pg_log_debug("copying \"%s\" to \"%s\" using strategy %s",
+ src, dst, strategy_name);
+ else
pg_log_debug("copying \"%s\" to \"%s\"",
src, dst);
- else
- pg_log_debug("copying \"%s\" to \"%s\" and checksumming with %s",
- src, dst, pg_checksum_type_name(checksum_ctx->type));
- copy_file_blocks(src, dst, checksum_ctx);
+
+ switch (copy_mode)
+ {
+ case COPY_MODE_CLONE:
+ copy_file_clone(src, dst);
+ break;
+ case COPY_MODE_COPY:
+ copy_file_blocks(src, dst, checksum_ctx);
+ break;
+ case COPY_MODE_COPY_FILE_RANGE:
+ copy_file_by_range(src, dst);
+ break;
+#ifdef WIN32
+ case COPY_MODE_COPYFILE:
+ copy_file_copyfile(src, dst);
+ break;
+#endif
+ }
}
}
@@ -156,6 +193,67 @@ copy_file_blocks(const char *src, const char *dst,
close(dest_fd);
}
+/*
+ * copy_file_clone
+ *		Clone (reflink) file "src" into "dest" using OS-specific primitives.
+ *
+ * Uses copyfile(COPYFILE_CLONE_FORCE) on macOS and ioctl(FICLONE) on Linux.
+ * Errors out (pg_fatal) if cloning is unsupported on this platform or fails;
+ * on a failed clone the partially created "dest" is removed first.
+ */
+static void
+copy_file_clone(const char *src, const char *dest)
+{
+#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
+	if (copyfile(src, dest, NULL, COPYFILE_CLONE_FORCE) < 0)
+		pg_fatal("error while cloning file \"%s\" to \"%s\": %m", src, dest);
+#elif defined(__linux__) && defined(FICLONE)
+	{
+		int			src_fd;
+		int			dest_fd;
+
+		if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
+			pg_fatal("could not open file \"%s\": %m", src);
+
+		if ((dest_fd = open(dest, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+							pg_file_create_mode)) < 0)
+			pg_fatal("could not create file \"%s\": %m", dest);
+
+		if (ioctl(dest_fd, FICLONE, src_fd) < 0)
+		{
+			/* unlink() may clobber errno, so save and restore it */
+			int			save_errno = errno;
+
+			unlink(dest);
+
+			errno = save_errno;
+			pg_fatal("error while cloning file \"%s\" to \"%s\": %m",
+					 src, dest);
+		}
+
+		close(src_fd);
+		close(dest_fd);
+	}
+#else
+	pg_fatal("file cloning not supported on this platform");
+#endif
+}
+
+/*
+ * copy_file_by_range
+ *		Copy file "src" into "dest" using the copy_file_range() system call.
+ *
+ * Errors out (pg_fatal) if copy_file_range() is not available on this
+ * platform or if the copy fails.  NOTE(review): relies on pg_fatal() not
+ * returning, so the open descriptors are not leaked on the error paths.
+ */
+static void
+copy_file_by_range(const char *src, const char *dest)
+{
+#if defined(HAVE_COPY_FILE_RANGE)
+ int src_fd;
+ int dest_fd;
+ ssize_t nbytes;
+
+ if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
+ pg_fatal("could not open file \"%s\": %m", src);
+
+ if ((dest_fd = open(dest, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+ pg_file_create_mode)) < 0)
+ pg_fatal("could not create file \"%s\": %m", dest);
+
+ /*
+ * Ask the kernel to copy as much as possible per call; a short copy is
+ * fine because we simply loop, and a return of 0 means EOF was reached.
+ */
+ do
+ {
+ nbytes = copy_file_range(src_fd, NULL, dest_fd, NULL, SSIZE_MAX, 0);
+ if (nbytes < 0)
+ pg_fatal("error while copying file range from \"%s\" to \"%s\": %m",
+ src, dest);
+ } while (nbytes > 0);
+
+ close(src_fd);
+ close(dest_fd);
+#else
+ pg_fatal("copy_file_range not supported on this platform");
+#endif
+}
+
+/* XXX maybe this should do the check internally, same as the other functions? */
#ifdef WIN32
static void
copy_file_copyfile(const char *src, const char *dst)
diff --git a/src/bin/pg_combinebackup/copy_file.h b/src/bin/pg_combinebackup/copy_file.h
index 0f6bc09403f..3a1c5eb764f 100644
--- a/src/bin/pg_combinebackup/copy_file.h
+++ b/src/bin/pg_combinebackup/copy_file.h
@@ -11,9 +11,25 @@
#ifndef COPY_FILE_H
#define COPY_FILE_H
+#include "c.h"
#include "common/checksum_helper.h"
+#include "common/file_utils.h"
+
+/*
+ * Enumeration to denote copy modes
+ */
+typedef enum CopyMode
+{
+ COPY_MODE_CLONE,
+ COPY_MODE_COPY,
+ COPY_MODE_COPY_FILE_RANGE,
+#ifdef WIN32
+ COPY_MODE_COPYFILE,
+#endif
+} CopyMode;
extern void copy_file(const char *src, const char *dst,
- pg_checksum_context *checksum_ctx, bool dry_run);
+ pg_checksum_context *checksum_ctx, bool dry_run,
+ CopyMode copy_mode);
#endif /* COPY_FILE_H */
diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c
index 74f8be9eeac..b6e1e62e160 100644
--- a/src/bin/pg_combinebackup/pg_combinebackup.c
+++ b/src/bin/pg_combinebackup/pg_combinebackup.c
@@ -69,6 +69,7 @@ typedef struct cb_options
pg_checksum_type manifest_checksums;
bool no_manifest;
DataDirSyncMethod sync_method;
+ CopyMode copy_method;
} cb_options;
/*
@@ -129,6 +130,8 @@ main(int argc, char *argv[])
{"manifest-checksums", required_argument, NULL, 1},
{"no-manifest", no_argument, NULL, 2},
{"sync-method", required_argument, NULL, 3},
+ {"clone", no_argument, NULL, 4},
+ {"copy-file-range", no_argument, NULL, 5},
{NULL, 0, NULL, 0}
};
@@ -156,6 +159,7 @@ main(int argc, char *argv[])
memset(&opt, 0, sizeof(opt));
opt.manifest_checksums = CHECKSUM_TYPE_CRC32C;
opt.sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
+ opt.copy_method = COPY_MODE_COPY;
/* process command-line options */
while ((c = getopt_long(argc, argv, "dnNPo:T:",
@@ -192,6 +196,12 @@ main(int argc, char *argv[])
if (!parse_sync_method(optarg, &opt.sync_method))
exit(1);
break;
+ case 4:
+ opt.copy_method = COPY_MODE_CLONE;
+ break;
+ case 5:
+ opt.copy_method = COPY_MODE_COPY_FILE_RANGE;
+ break;
default:
/* getopt_long already emitted a complaint */
pg_log_error_hint("Try \"%s --help\" for more information.", progname);
@@ -696,6 +706,8 @@ help(const char *progname)
" use algorithm for manifest checksums\n"));
printf(_(" --no-manifest suppress generation of backup manifest\n"));
printf(_(" --sync-method=METHOD set method for syncing files to disk\n"));
+ printf(_(" --clone clone (reflink) instead of copying files\n"));
+ printf(_(" --copy-file-range copy using copy_file_range() syscall\n"));
printf(_(" -?, --help show this help, then exit\n"));
printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
@@ -937,7 +949,8 @@ process_directory_recursively(Oid tsoid,
&checksum_length,
&checksum_payload,
opt->debug,
- opt->dry_run);
+ opt->dry_run,
+ opt->copy_method);
}
else
{
@@ -993,7 +1006,8 @@ process_directory_recursively(Oid tsoid,
/* Actually copy the file. */
snprintf(ofullpath, MAXPGPATH, "%s/%s", ofulldir, de->d_name);
- copy_file(ifullpath, ofullpath, &checksum_ctx, opt->dry_run);
+ copy_file(ifullpath, ofullpath, &checksum_ctx, opt->dry_run,
+ opt->copy_method);
/*
* If copy_file() performed a checksum calculation for us, then
diff --git a/src/bin/pg_combinebackup/reconstruct.c b/src/bin/pg_combinebackup/reconstruct.c
index 41f06bb26b5..f5c7af8a23c 100644
--- a/src/bin/pg_combinebackup/reconstruct.c
+++ b/src/bin/pg_combinebackup/reconstruct.c
@@ -90,7 +90,8 @@ reconstruct_from_incremental_file(char *input_filename,
int *checksum_length,
uint8 **checksum_payload,
bool debug,
- bool dry_run)
+ bool dry_run,
+ CopyMode copy_method)
{
rfile **source;
rfile *latest_source = NULL;
@@ -319,7 +320,7 @@ reconstruct_from_incremental_file(char *input_filename,
*/
if (copy_source != NULL)
copy_file(copy_source->filename, output_filename,
- &checksum_ctx, dry_run);
+ &checksum_ctx, dry_run, copy_method);
else
{
write_reconstructed_file(input_filename, output_filename,
diff --git a/src/bin/pg_combinebackup/reconstruct.h b/src/bin/pg_combinebackup/reconstruct.h
index 8e33a8a95a0..726f94389f3 100644
--- a/src/bin/pg_combinebackup/reconstruct.h
+++ b/src/bin/pg_combinebackup/reconstruct.h
@@ -13,7 +13,9 @@
#ifndef RECONSTRUCT_H
#define RECONSTRUCT_H
+#include "c.h"
#include "common/checksum_helper.h"
+#include "common/file_utils.h"
#include "load_manifest.h"
extern void reconstruct_from_incremental_file(char *input_filename,
@@ -28,6 +30,7 @@ extern void reconstruct_from_incremental_file(char *input_filename,
int *checksum_length,
uint8 **checksum_payload,
bool debug,
- bool dry_run);
+ bool dry_run,
+ CopyMode copy_mode);
#endif
--
2.44.0
v20240326-0002-write_reconstructed_file.patchtext/x-patch; charset=UTF-8; name=v20240326-0002-write_reconstructed_file.patchDownload
From ccd879c90da8c5383d997a4d0a5188d2497313f9 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Sat, 23 Mar 2024 18:26:21 +0100
Subject: [PATCH v20240326 2/2] write_reconstructed_file
---
src/bin/pg_combinebackup/reconstruct.c | 33 +++++++++++++++++++++++---
1 file changed, 30 insertions(+), 3 deletions(-)
diff --git a/src/bin/pg_combinebackup/reconstruct.c b/src/bin/pg_combinebackup/reconstruct.c
index f5c7af8a23c..eec7860f0e3 100644
--- a/src/bin/pg_combinebackup/reconstruct.c
+++ b/src/bin/pg_combinebackup/reconstruct.c
@@ -59,7 +59,8 @@ static void write_reconstructed_file(char *input_filename,
off_t *offsetmap,
pg_checksum_context *checksum_ctx,
bool debug,
- bool dry_run);
+ bool dry_run,
+ CopyMode copy_mode);
static void read_bytes(rfile *rf, void *buffer, unsigned length);
/*
@@ -325,7 +326,8 @@ reconstruct_from_incremental_file(char *input_filename,
{
write_reconstructed_file(input_filename, output_filename,
block_length, sourcemap, offsetmap,
- &checksum_ctx, debug, dry_run);
+ &checksum_ctx, debug, dry_run,
+ copy_method);
debug_reconstruction(n_prior_backups + 1, source, dry_run);
}
@@ -528,7 +530,8 @@ write_reconstructed_file(char *input_filename,
off_t *offsetmap,
pg_checksum_context *checksum_ctx,
bool debug,
- bool dry_run)
+ bool dry_run,
+ CopyMode copy_mode)
{
int wfd = -1;
unsigned i;
@@ -630,6 +633,30 @@ write_reconstructed_file(char *input_filename,
if (dry_run)
continue;
+ /*
+ * If requested, copy the block using copy_file_range.
+ *
+ * We can't do this if the block needs to be zero-filled or when we
+ * need to update the checksum.
+ */
+ if ((copy_mode == COPY_MODE_COPY_FILE_RANGE) &&
+ (s != NULL) && (checksum_ctx->type == CHECKSUM_TYPE_NONE))
+ {
+#if defined(HAVE_COPY_FILE_RANGE)
+ wb = copy_file_range(s->fd, &offsetmap[i], wfd, NULL, BLCKSZ, 0);
+
+ if (wb < 0)
+ pg_fatal("error while copying file range from \"%s\" to \"%s\": %m",
+ input_filename, output_filename);
+ else if (wb != BLCKSZ)
+ pg_fatal("could not write file \"%s\": wrote only %d of %d bytes",
+ output_filename, wb, BLCKSZ);
+#else
+ pg_fatal("copy_file_range not supported on this platform");
+#endif
+ continue;
+ }
+
/* Read or zero-fill the block as appropriate. */
if (s == NULL)
{
--
2.44.0
On 3/25/24 15:31, Robert Haas wrote:
On Sat, Mar 23, 2024 at 9:37 AM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:
OK, that makes sense. Here's a patch that should work like this - in
copy_file we check if we need to calculate checksums, and either use the
requested copy method, or fall back to the block-by-block copy.
> + Use efficient file cloning (also known as <quote>reflinks</quote> on
> + some systems) instead of copying files to the new cluster. This can
new cluster -> output directory
Ooops, forgot to fix this. Will do in next version.
I think your version kind of messes up the debug logging. In my
version, every call to copy_file() would emit either "would copy
\"%s\" to \"%s\" using strategy %s" and "copying \"%s\" to \"%s\"
using strategy %s". In your version, the dry_run mode emits a string
similar to the former, but creates separate translatable strings for
each copy method instead of using the same one with a different value
of %s. In non-dry-run mode, I think your version loses the debug
logging altogether.
Yeah. Sorry for not being careful enough about that, I was focusing on
the actual copy logic and forgot about this.
The patch I shared a couple minutes ago should fix this, effectively
restoring the original debug behavior. I liked the approach with calling
strategy_implementation a bit more, I wonder if it'd be better to go
back to that for the "accelerated" copy methods, somehow.
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Tue, Mar 26, 2024 at 7:03 PM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:
[..]
That's really strange.
Hi Tomas, but it looks like it's fixed now :)
--manifest-checksums=NONE --copy-file-range without v20240323-2-0002: 27m23.887s
--manifest-checksums=NONE --copy-file-range with v20240323-2-0002 and
loop-fix: 5m1.986s but it creates corruption as it stands
Thanks. I plan to do more similar tests, once my machines get done with
some other stuff.
Please do so as I do not trust my fingers :-)
Issues:
1. https://cirrus-ci.com/task/5937513653600256?logs=mingw_cross_warning#L327
compains about win32/mingw:
[..]
Yup, missing break.
Now it's https://cirrus-ci.com/task/4997185324974080?logs=headers_headerscheck#L10
, reproducible with "make -s headerscheck
EXTRAFLAGS='-fmax-errors=10'":
/tmp/postgres/src/bin/pg_combinebackup/reconstruct.h:34:91: error:
unknown type name ‘CopyMode’ / CopyMode copy_mode);
to me it looks like reconstruct.h needs to include definition of
CopyMode which is in "#include "copy_file.h"
2. I do not know what's the consensus between --clone and
--copy-file-range , but if we have #ifdef FICLONE clone_works() #elif
HAVE_COPY_FILE_RANGE copy_file_range_only_works() then we should also
apply the same logic to the --help so that --clone is not visible
there (for consistency?). Also the "error: file cloning not supported
on this platform " is technically incorrect, Linux does support
ioctl(FICLONE) and copy_file_range(), but we are just not choosing one
over another (so technically it is "available"). Nitpicking I know.
That's a good question, I'm not sure. But whatever we do, we should do
the same thing in pg_upgrade. Maybe there's some sort of precedent?
Sigh, you are right... It's consistent hell.
3. [v20240323-2-0002-write_reconstructed_file.patch]: The mentioned
ENOSPACE spiral-of-death-bug symptoms are like that:
[..]
Yeah, that retry logic is wrong. I ended up copying the check from the
"regular" copy branch, which simply bails out if copy_file_range returns
anything but the expected 8192.I wonder if this should deal with partial writes, though. I mean, it's
allowed copy_file_range() copies only some of the bytes - I don't know
how often / in what situations that happens, though ... And if we want
to handle that for copy_file_range(), pwrite() needs the same treatment.
Maybe that helps?
https://github.com/coreutils/coreutils/blob/606f54d157c3d9d558bdbe41da8d108993d86aeb/src/copy.c#L1427
, it's harder than I anticipated (we can ignore the sparse logic
though, I think)
3. .. but on startup I've got this after trying psql login: invalid
page in block 0 of relation base/5/1259 .
[..]
Can you see if you can still reproduce this with the attached version?
Looks like it's fixed now and it works great (~3min, multiple times)!
BTW: I've tried to also try it over NFSv4 over loopback with XFS as
copy_file_range() does server-side optimization probably, but somehow
it was so slow there that's it is close to being unusable (~9GB out of
104GB reconstructed after 45mins) - maybe it's due to NFS mount opts,
i don't think we should worry too much. I think it's related to
missing the below optimization if that matters. I think it's too early
to warn users about NFS (I've spent on it just 10 mins), but on the
other hand people might complain it's broken...
Apparently there's no merging of adjacent IO/s, so pg_combinebackup
wastes lots of time on issuing instead small syscalls but it could
let's say do single pread/write (or even copy_file_range()). I think
it was not evident in my earlier testing (200GB; 39min vs ~40s) as I
had much smaller modifications in my incremental (think of 99% of
static data).
Yes, I've been thinking about exactly this optimization, but I think
we're way past proposing this for PG17. The changes that would require
in reconstruct_from_incremental_file are way too significant. Has to
wait for PG18 ;-)
Sure thing!
I do think there's more on the table, as mentioned by Thomas a couple
days ago - maybe we shouldn't approach clone/copy_file_range merely as
an optimization to save time, it might be entirely reasonable to do this
simply to allow the filesystem to do CoW magic and save space (even if
we need to read the data and recalculate the checksum, which now
disables these copy methods).
Sure ! I think time will still be a priority though, as
pg_combinebackup duration impacts RTO while disk space is relatively
cheap.
One could argue that reconstructing 50TB will be a challenge though.
Now my tests indicate space saving is already happening with 0002
patch - 100GB DB / full backup stats look like that (so we are good I
think when using CoW - not so without using CoW) -- or i misunderstood
something?:
root@jw-test-1:/backups# du -sm /backups/
214612 /backups/
root@jw-test-1:/backups# du -sm *
106831 full
2823 incr.1
165 incr.2
104794 outtest
root@jw-test-1:/backups# df -h . # note this double confirms that just
114GB is used (XFS), great!
Filesystem Size Used Avail Use% Mounted on
/dev/sdb1 500G 114G 387G 23% /backups
root@jw-test-1:/backups# # https://github.com/pwaller/sharedextents
root@jw-test-1:/backups# ./sharedextents-linux-amd64
full/base/5/16427.68 outtest/base/5/16427.68
1056915456 / 1073741824 bytes (98.43%) # extents reuse
Now I was wondering a little bit if the huge XFS extent allocation
won't hurt read performance (probably they were created due many
independent copy_file_range() calls):
root@jw-test-1:/backups# filefrag full/base/5/16427.68
full/base/5/16427.68: 1 extent found
root@jw-test-1:/backups# filefrag outtest/base/5/16427.68
outtest/base/5/16427.68: 3979 extents found
However in first look on seq reads of such CoW file it's still good
(I'm assuming such backup after reconstruction would be copied back to
the proper DB server from this backup server):
root@jw-test-1:/backups# echo 3 > /proc/sys/vm/drop_caches
root@jw-test-1:/backups# time cat outtest/base/5/16427.68 > /dev/null
real 0m4.286s
root@jw-test-1:/backups# echo 3 > /proc/sys/vm/drop_caches
root@jw-test-1:/backups# time cat full/base/5/16427.68 > /dev/null
real 0m4.325s
Now Thomas wrote there "then it might make sense to do that even if
you *also* have to read the data to compute the checksums, I think? "
... sounds like read(), checksum() and still do copy_file_range()
instead of pwrite? PG 18 ? :D
-J.
On Tue, Mar 26, 2024 at 2:09 PM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:
The patch I shared a couple minutes ago should fix this, effectively
restoring the original debug behavior. I liked the approach with calling
strategy_implementation a bit more, I wonder if it'd be better to go
back to that for the "accelerated" copy methods, somehow.
Somehow I don't see this patch?
--
Robert Haas
EDB: http://www.enterprisedb.com
On 3/28/24 21:45, Robert Haas wrote:
On Tue, Mar 26, 2024 at 2:09 PM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:
The patch I shared a couple minutes ago should fix this, effectively
restoring the original debug behavior. I liked the approach with calling
strategy_implementation a bit more, I wonder if it'd be better to go
back to that for the "accelerated" copy methods, somehow.
Somehow I don't see this patch?
It's here:
/messages/by-id/90866c27-265a-4adb-89d0-18c8dd22bc19@enterprisedb.com
I did change the subject to reflect that it's no longer about
pg_upgrade, maybe that breaks the threading for you somehow?
regards
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On Tue, Mar 26, 2024 at 2:03 PM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:
[ new patches ]
Tomas, thanks for pointing me to this email; as you speculated, gmail
breaks threading if the subject line changes.
The documentation still needs work here:
- It refers to --link mode, which is not a thing.
- It should talk about the fact that in some cases block-by-block
copying will still be needed, possibly mentioning that it specifically
happens when the old backup manifest is not available or does not
contain checksums or does not contain checksums of the right type, or
maybe just being a bit vague.
In copy_file.c:
- You added an unnecessary blank line to the beginning of a comment block.
- You could keep the strategy_implementation variable here. I don't
think that's 100% necessary, but now that you've got the code
structured this way, there's no compelling reason to remove it.
- I don't know what the +/* XXX maybe this should do the check
internally, same as the other functions? */ comment is talking about.
- Maybe these functions should have header comments.
--
Robert Haas
EDB: http://www.enterprisedb.com
On 3/29/24 15:23, Robert Haas wrote:
On Tue, Mar 26, 2024 at 2:03 PM Tomas Vondra
<tomas.vondra@enterprisedb.com> wrote:
[ new patches ]
Tomas, thanks for pointing me to this email; as you speculated, gmail
breaks threading if the subject line changes.
The documentation still needs work here:
- It refers to --link mode, which is not a thing.
- It should talk about the fact that in some cases block-by-block
copying will still be needed, possibly mentioning that it specifically
happens when the old backup manifest is not available or does not
contain checksums or does not contain checksums of the right type, or
maybe just being a bit vague.In copy_file.c:
- You added an unnecessary blank line to the beginning of a comment block.
Thanks, should be all cleaned up now, I think.
- You could keep the strategy_implementation variable here. I don't
think that's 100% necessary, but now that you've got the code
structured this way, there's no compelling reason to remove it.
Yeah, I think you're right. The strategy_implementation seemed a bit
weird to me because we now have 4 functions with different signatures.
Most only take srd/dst, but copy_file_blocks() also takes checksum. And
it seemed better to handle everything the same way, rather than treating
copy_file_blocks as an exception.
But it's not that bad, so 0001 has strategy_implementation again. But
I'll get back to this in a minute.
- I don't know what the +/* XXX maybe this should do the check
internally, same as the other functions? */ comment is talking about.
I think this is stale. The XXX is about how the various functions
detect/report support. In most we have the ifdefs/pg_fatal() inside the
function, but CopyFile() has nothing like that, because the detection
happens earlier. I wasn't sure if maybe we should make all these
functions more alike, but I don't think we should.
- Maybe these functions should have header comments.
Right, added.
I was thinking about the comment [1]/messages/by-id/CA+hUKG+8KDk+pM6vZHWT6XtZzh-sdieUDohcjj0fia6aqK3Oxg@mail.gmail.com from a couple a days ago, where
Thomas suggested that maybe we should try doing the CoW stuff
(clone/copy_file_range) even in cases when we need to read the block,
say to calculate checksum, or even reconstruct from incremental backups.
I wasn't sure how big the benefits of the patches shared so far might
be, so I decided to do some tests. I did a fairly simple thing:
1) initialize a cluster with pgbench scale 5000 (~75GB)
2) create a full backup
3) do a run that updates ~1%, 10% and 20% of the blocks
4) create an incremental backup after each run
5) do pg_combinebackup for for each of the increments, with
block-by-block copy and copy_file_range, measure how long it takes and
how much disk space it consumes
I did this on xfs and btrfs, and it quickly became obvious that there's
very little benefit unless --no-manifest is used. Which makes perfect
sense, because pgbench is uniform updates so all segments need to be
reconstructed from increments (so copy_file.c is mostly irrelevant), and
write_reconstructed_file only uses copy_file_range() without checksums.
I don't know how common --no-manifest is going to be, but I guess people
will want to keep manifests in at least some backup schemes (e.g. to
rebuild full backup instead of having to take a full backup regularly).
So I decided to take a stab at Thomas' idea, i.e. reading the data to
calculate checksums, but then using copy_file_range instead of just
writing the data onto disk. This is in 0003, which relaxes the
conditions in 0002 shared a couple days ago. And this helped a lot.
The attached PDF shows results for xfs/btrfs. Charts on the left are
disk space occupied by the reconstructed backup, measured as difference
between "df" before and after running pg_combinebackup. The duration of
the pg_combinebackup execution is on the right. First row is without
manifest (i.e. --no-manifest), the second row is with manifest.
The 1%, 10% and 20% groups are for the various increments, updating
different fractions of the database.
The database is ~75GB, so that's what we expect a plain copy to have. If
there are some CoW benefits of copy_file_range, allowing the fs to reuse
some of the space or, the disk space should be reduced. And similarly,
there could/should be some improvements in pg_combinebackup duration.
Each bar is a different copy method and patch:
* copy on master/0001/0002/0003 - we don't expect any difference between
these, it should all perform the same and use the "full" space
* copy_file_range on 0001/0002/0003 - 0001 should perform the same as
copy, because it's only about full-segment copies, and we don't any of
those here, 0002/0003 should help, depending on --no-manifest
And indeed, this is what we see. 0002/0003 use only a fraction of disk
space, roughly the same as the updated fraction (which matches the size
of the increment). This is nice.
For duration, the benefits seem to depend on the file system. For btrfs
it actually is faster, as expected. 0002/0003 saves maybe 30-50% of
time, compared to block-by-block copy. On XFS it's not that great, the
copy_file_range is actually slower by up to about 50%. And this is not
about the extra read - this affects the 0002/no-manifest case too, where
the read is not necessary.
I think this is fine. It's a tradeoff, where on some filesystems you can
save time or space, and on other filesystems you can save both. That's a
tradeoff for the users to decide, I think.
I'll see how this works on EXT4/ZFS next ...
But thinking about this a bit more, I realized there's no reason not to
apply the same logic to the copy_file part. I mean, if we need to copy a
file but also calculate a checksum, we can simply do the clone using
clone/copy_file_range, but then also read the file and calculate the
checksum ...
0004 does this, by simply passing the checksum_cxt around, which also
has the nice consequence that all the functions now have the same
signature, which makes the strategy_implementation work for all cases. I
need to do more testing of this, but like how this looks.
Of course, maybe there's not an agreement this is the right way to
approach this, and we should do the regular block-by-block copy?
There's one more change in 0003 - the checks if clone/copy_file_range
are supported by the platform now happen right at the beginning when
parsing the arguments, so that when a user specifies one of those
options, the error happens right away instead of sometime later when we
happen to hit one of those pg_fatal() places.
I think this is the right place to do these checks, as it makes the
write_reconstructed_file much easier to understand (without all the
ifdefs etc.).
But there's an argument whether this should fail with pg_fatal() or just
fallback to the default copy method.
BTW I wonder if it makes sense to only allow one of those methods? At
the moment the user can specify both --clone and --copy-file-range, and
which one "wins" depends on the order in which they are specified. Seems
confusing at best. But maybe it'd make sense to allow both, and e.g. use
clone() to copy whole segments and copy_file_range() for other places?
regards
[1]: /messages/by-id/CA+hUKG+8KDk+pM6vZHWT6XtZzh-sdieUDohcjj0fia6aqK3Oxg@mail.gmail.com
/messages/by-id/CA+hUKG+8KDk+pM6vZHWT6XtZzh-sdieUDohcjj0fia6aqK3Oxg@mail.gmail.com
--
Tomas Vondra
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
Attachments:
incremental-backup-test.pdfapplication/pdf; name=incremental-backup-test.pdfDownload
%PDF-1.4
%����
1 0 obj
<</Title (Untitled document)
/Producer (Skia/PDF m124 Google Docs Renderer)>>
endobj
3 0 obj
<</ca 1
/BM /Normal>>
endobj
5 0 obj
<</CA 1
/ca 1
/LC 0
/LJ 0
/LW 1.33333337
/ML 10
/SA true
/BM /Normal>>
endobj
7 0 obj
<</N 3
/Filter /FlateDecode
/Length 296>> stream
x�}��J�`�kA�A�\��h���X\[�V�4M����������&.���e(��%���o$����{x��/�@$�*�N�s���Q�S�L��eZ}�����K�}^�'7��v���!y�.�'��V��>�����s<����^�(���F�>�����7�V�=���f���WtV�%J��-���S��#LQ�"���'IB� EN��Py�zfISP�Y��H)��������@�&/C�~�{
�e�6�����1]shE�H� ��0W��'�9�]��Y�?����bM��4 �?��K�
endstream
endobj
6 0 obj
<</Type /XObject
/Subtype /Image
/Width 1682
/Height 922
/ColorSpace [/ICCBased 7 0 R]
/BitsPerComponent 8
/Filter /FlateDecode
/Length 54583>> stream
x��� �U��7�F���h��1��_�d��e&s,�Vr�[7�|���7'����T������ DPPQ�!*�$F��8�h�Q"�(( �����x������M7,���(�{���Z{���>��k�� �A������������Z�{��'/��/�\�b���������?��������O/9��������?<~���N;-�j���c������f���f
�n���� Y�����uzm%��J������?�������2 �7"];���Q���z��'�|������;_��O�����#���_|����%];z�����- x#����V�\���I'��a��6_��u������p�)��5j����I�M�����[o��s/���[��v;�� ��h��
w����W��vo <�@�R�����Uu��N�oc���I�5kV��b�6m���7�j��O��h����!�1
Z����k�������+�T�,;�����}�~]�8��flh �����\u�UiK�~�������v��{n�t�~���m�����_��!��=��W�����Wn�����[��
�#]�7���72$m�%K����~���[�.^�/]�����iyF���������;-F,O����W����;v�fN�� ��lh ����Q<���i3�~���7on�����{�������k����&�����������/�<-��9s*\�v���w�����O:�$����A�e��� ���{����;�f�<yr�^�/]���g�L]�M�6�z��i1^x����M=����k�d��G�9}�#|�Z�� `{Z�f�]w�5q����:��SN9���<����?��o\�`A�o����m��i1,=���
4d��K.����n[�zu��6_���3�h�����(��W������r=��s�^v�e#F�8���N=��s�9g����Nfl����7o~�������|�~�����=:���'��q����~:^>r���O>9�Ip������O�k�K�n����<^xa,[l�X���\{���2������������ko����;������qbe�w�}W\qE|�x� '��4^������k�S�Z�*6D�9��
�O�+�m��)�z<�-fq���_y��g�}v,@�|����Bn���������(wfbF�za���vI����;�7n\����.����3g�\������-�l.�!MV���������6V]�@l�����){
�b���o���>������:�S�n_|��������E����W�|l�y���l�'�|2����&+�%�{�a�����}#�t���k�F&�M���7�tS�&��t���bU�kc������x�E�3v��l���l��-wI���h�a���`������=�_?�g�(6eM.E����w?�M��K/����q���/�5k�� b��J�:�:v��_F���o\Z��(��������eH�����l���C�W��������/���+^k eodxD�C=TI���� @;bT���[��1������/��^�����_������>��Z�jU��F��pb���[;]�g�y&�[�nct���6{mQc�x�}��G��k#�x�K���l����P:O0o��5O�8��;��x�����!>B>n�����1�P��coi��V�X���s��=��3��6v��S����k�]�����z��r#��cU4[]�E~�������8���|���V�^=l���Ho�v.1bD����W_��Uz������2eJ�/���������3g�%4hP�vq��g>����v��'�x"����b[�j��s�
�]�z�TL�p�� ]�� �1fO?�t�U��eK�.=���N���V7n���b+�X�H���KrlN�>�{�����Nk�&�6Y�t���]�����{K������?^�J� ��o��W2|��?^~��^xa9��qS�C���rTC�x�s�9����?��_���]9bZ�x��g��~>���cF���r��i�c�.;e)>���������/��||��!�>�l�����k�e���8q�o~���#G�G����c��#�����1R��7n\����������k��g����I�&��c��1b���5���.�(���p�������.����x9����5;�/v�i����Il�����
��K/��<��d�]vY��c�5�:VW�Voi��y#����k;g��y���r�y��]7�e.w�5k�L�"��X9��[�oc�wx��Gjv���������rC���N�o���V���?�
�k��K/�X3��=m������7�;wn�/Y�|y�uM#"��f���v��8i����*=�0
�\��E�.�������86nl����#>K�R��b�r��:G\Dh�;�f������������L���a���c��F4E���=��-jb9��+����V�T��j����zby�)�I�I,|$ODG������`��L�:5o�xa�de���&k��k�[��1����I�v�����5P~����/Bmh �Gw�yg.�c@��SO���]�6�:�\����������A��1���JOE�>���sR�7��k���{����S1N��rXE:70y���n�������KM��u�.�������{�){V�&o�������Y������2�,��{��
�)Zc����m�A\a���5 �X���c� �y�l���8d��x �M�6=�����������Y�^x!�.cu�0�l��Z�={v^K1�o��+��1�\�hQ9�����������fP��?�9?�_����O�2�����n�)����^��������������K��l���1x����cx�]Z-o�;����O�_�����z�/�K�)������<��S���W��b�-������I�{y���5�r�:�U��`����c6|�����>�l�=�����0aB��3f�-�)eG1��l.�[��[��^�|y�Kg�c���K���l���"u����J���[n�G��[�.�L�E|�+��"}7W�Z�������ss/�5��Q�<z���l�����l)���2�&����,/��l���0��M��k��x]�>fW�1?������c������oh ��5k�����1"~m8Y�6���g�<{��7�g^�l���y����zm��v��%����������4iR�T�qzs�h��'�l�x�<�H~�[o����<H�wx��g�_|�9�����]���h�����
4�\Ee��sK����1�
�3�<��iv�
7�w�Ah�h���V�C��c�6<����/���_�>7�
�l�y�u����t���A�>�ho_[6W;���p���'�i�����g{l������Z\��E?���a��\y�Sl����������������������l]:���dO?�t���S�_]����|�[����Z�lh�_~y�C�6n�X������kk�\�����J�4����\�_����S�?�����
��*�R�Z�&{����$^�O���d}���=�"F������������k�� ����h�����f�����3�H��s�9�S�6m�W��0aB�w(;3���}ym�6����V~�|�����M����zV7�xc�%�C������U�V�w�1x���Gv��k-��[\xp��u�f��������G���5<*))�f5#�x����cXz��g7;Y2����3f��y6�<?���O7|�5k��Oz��W���6E�&U��<�������������+�v�$���[?���j���qT�{Xs"^����n���5�}���Ps��i}��fr,��r��W��[����1�HJi���`�)�����k���0�f�_8Z�G�%���%�[���O�h]��-7l?�{�vni�5�������4��sO����#��7r��7t~����;�_���d-"�L���~_�v}���W�2��v�m����;��.�v ����N=��fjO�"��)G�
�4Hb�8}���3g� =��/�����]M7�FyBV�j1N//7��
TYyS�E���z���x�����6m*/���]�s���nh��I���Q|��[�Z�`og�������x�����x��"��c3�
�tbog]��^��i���o����A�\pAo_����]�~Z>;���N��D����n���'�����5��|�i���G������<�"15�r��@����|}���v���=�r�[�.��:�WB��N��_;`�.���4�����4k:�7X�?���>�l~�{�����r����z�Bj�j_�v}��|]������k �TT�����'���<��<����@������k���k������A��k-��y�������(��&s������k�~��Y�������x������1cZLY~��W��Q��a�����O>9=�����������CZ��1�m�7whx9��]�f�'y��Y�.n�k��N���_��}�b]�(�:h����m��k����K���b��4���3������W���]|[�
������2���cP���Hy������e%������fo�w����X��^z��Z��,v�f��
>�`�M����]�d�����x������k �T�r�]?��c�<qy�Bn�� �����������v
�"�����]���|��K.�����/O4m���`��z�z<U�z��+����r�����ge���^\c^x����Y�f������X�Y�����9���+h�St*_[����t��<q�7-���[��B��kq�i�����:}�]��.�����>
�<�3��1��z����
���s=����D��t�;�v������
�� O:��6����M���-��/l��)o������_L���v��]�l`��(�y��5{�������c�����S�N�����;j^�7Y|�[�y�M����]�dW|��V��)���w���� �6U����������bt�<��Sz� }ymww�Z�s��_���8=��+���� P=��r���]���[��d�����z��f�(�~}����on[5���������t��~r����)�����(_�������{�����������������Zw��3�ow)&������u+!����u�������f�
�|tk��^�]o�V�w8��m�W���a��Z@y�:��1Q�<���$���k���]��+�e��v������[�eK[k��k}������Wv��^6���^��������K� ����@5W���f��<q9�������g�yfo�/�����]�{�&��/o��l��q���R�G��;�����K]mE��{�QFc��i�H1<����^h����x�G<��5�����a������O<�����Tk�k{��[#u������Y��k}$L�"��b�]�/r��k���������8��S"FZ_���|s�f7i�<��lp�o\���f���-Tr���Rm��<���ce��$o��]���������f�.i����M����]?f��/�8q���Ak�w�y>�`����� �6�O�l�k�z��<�5�\����!C��v����������{w��e�q��M�����.i���^��$_}���]9z���]�x���n��Y�.>x���e�������jN �<yr~�SN9%�������f^��v-�v-n�����kG���f���R���/��Iz������l����~�������K��0��H���J1��n���s���e+n��Fq����a�?��<Y^��
x1�|��S��@]�|o�������<]�����-���:k��9������<�z����{v�X�b���1AyK�l�����*�� �6�h��\r�x�lyQ�r����������������5�����<���N��[�������f������]C��O�2��;m(�������x��|�l9�-�o>|x����Y��|��oh�������M�r���S�)�v���o1ey�ly�cw]���/�8g��xa���e����}[���.�_~yz$���3�d�EH��$���5w_���k��3dx��<q��o��]yU��?Q�S�w������u����
#�>�������k��� lS�Z[���6T^/�����:1h������mw�Z\��|��\���(�3�Z��s����.��GK&N������(����<e�t�����>x�i���/oAXv�z�Z�Eypy�����7�VO��Q�x^g�+V����N��}��O?���f�(�v-z���J���]m�.�o�M7���>g�yf������������������y
�\$m����[�������T��W��������-����-�E�����������kW�;���;��m�P]���] ���u3�� �6�y��|<X���,�^^��#z{����6L�2%����bh�lpWv�n���o��?�9OY�b��O�����^��7�����_l1��q������K������������gWWW��<��<���Z���_^��+w� &�����d_�j- 9i���~��k��������Rl�u���A�vI|��r���//I��^�=:thz���:���;�\���/��MK���Vn�<��#5oXm���=A�'b���|}[�<o��%�^�OR�|�v��_Z����{��s���v��%���n�����,�� lk�����cy\\Y���1v�����/�|��G���������/oq(Z��������_��������b~��8�xn�5���gZ
�v��A��)�E�.6,�y�l���Qp�37��(���������]������]w��^�3f����.��q�)��������V�El��~��k����"�?�z����I��7��W��:t���7����.�EN���w�y5��?X�~����v1qWWW���v�"������'��\�0��/X�����?���v���k[$��M�;^�n;dW���_���� ���om���/��<���n�06�&r�����g�MN�y����6�z����f�qz�����V��]�_z���o�y��|'�X��������H��Z�|y�bP�����\ �)�1�o��r���������{��W�u�Y=���qT~��]�]�<��<�n�� ���O>��k_|��|xU��j����3V�l��5�A���1������m��8���u�����L�~}�.�E�SJW�X��<���r�h��
_���]��7�tSl��O?�<�����+�}1[�7O/�����|I3s��Mo5m���{�����X{�bbc����6�v�V~�[@��.������?���Qt�����fWK�}/�agO]��gs7��������d���,���H����1�,X0e����G�XI>���{Gwlh ���k��A��a�V�^�p�k��&�����g�5�%�+���&5���k��N���_�>��ppWv�:������G�����/�j=N//L��h�u���?>Ms�i������5��z����/[�����������b�_^��<A��kw�Yg���2�������t��p������������
��y�A����[�����K���]hv-�������ay�������]�������������<;i�����������X����o_�C��M%:{3���>��e;��&b��1cF����I������3'MV��JZ�A�+��-���v�e�5L�H���;/M�;�_�>B,F�w(|�l��+���;��l���������Y^U5����O��9�����v���]�R��b����������� ��2�JM��j���
H
/c��Q 1}��l��a�����������6F��X�y��e�����N����7��]��c�����������z(����N�i/��������4h���o������??]M+���w���8���b����b���7/�������]�_|1s�x��6K���G9��3�4� ����c���g�Zn���oy���������c��ai��|�Vl���|������\�����}8���gR^�0f3-�x�r�}���q��^k�/NX3����V��q����bw*�E,g�N�a~}�������c=��/]����\�pa��tuu�v�my�����Y>�����}['�nbIr�a�97�r���9��������P��/X�����&�b�M��f�]����%��3#���0��k��k����%�C���m�}��EJ��kW��(b�����5k��r�-�W��:���{W��v����]1e>�:��5kV�]9b���/w��%��� �Q��:������w�yy��&��]c��#�b�0l�������n��������N:i��Q��y�r���p,OPs M��{�������/���<^K��Y�z����_������?�� ���O��6K<�����r�)1^>��s�������4��lO<��!�{^r�%1�|�\z��6�e�.>~c�1";�ay&]���q��C
�,.�����'��k��c �aN�[��������Uf��j����k��m�~f��Y��c����KcU��/����Z��h���T{%w���/\�0��b��7.VE��|K��x}���-��-}�XE�V���~����(/&��g��������Xn������Qs��V���$v�����^����,�:
z\�������oP� Vu����}�-��w��]�C��KP�w��s�I�J����r3�~WY�zuy]���u�Y�O��;���*m����5k��&���X�4iR|��-H�_y��y��{c������v�}��E�� ��%��K_�.���uxg�;��}C m���;���^��L�R�P��^�w��|�|���7n,�T5.���tl[���3Xv��~������g��bl�����v��g����5��x�fwP]�n]�"V��X^���yR-<���5�z�F�Z�pa����m����bsL�<��1���J��s�5��C8���j��5#�X��f����b4��A��C��mC��
�iZ�h��U��5���V>�9�i��?���!�
2��'�h�&�6m*�����k����9c��rD�����^��������P��j��e���I����4�q��K��{�V�����G)������esyw���O�������$5't';T��{K��&R���N�R\�������S;B���o��{�lo���--�f����� ��f��������.�q����c�2t��q����9��FP���M�����c�c��/�x��Y��`n�_/�4iR,v,s:0~�AJ����x�����F:�m��
?�p��#F������s�������v�v�[J1�k��6_������&N�8g���'��������.�*^'>�����=�g|����b�7�h_�c��{�=:��O<1����O��oxpQ���-�\�`A,jl�X{��c����HI���?>��XQ�_}�MFb�>a���������[�jUl�K.�$�����>��������o��H��_+!v�xI�O8�������7��}����tZ9�w&���S���U|/n���/�0��ic�;���nk�;�;}��xI���������|��Cv��w�qGL�3�d���pg�yf|�c��;j�Ki�-.��+���G���N{N�G�{����d=�A���_]�$"+� ]tQ�=�[ �"B���s�=y;l�t����^z����o��vN�2�����f��0}��1}�{��>]��-������#����*�[���b��F���]p���?;H���o����t�u�E�2$>fZ���
c����]74 @e����h�a� �B���t� �w�v}�k @����#]; ���]�� ��t��H� �~�k�G�v �;]�>�� �J��3�+_�����������Gy��q�6l�P?��W^y�1����~{���>������+W�Ov��w��_>��v�}��:���������'{��W���}h���z�[�z���_x���6m��� ������0`@GG�1�����_���{��~���^�vm9�����x<&��G>r��G���>��{����K�����7��e�]�S�&tP����{�q��d�V����?O���Gy�G1p�����c���q�V� v����u�]p�M7��{���>���c��a��I�&�#�x�;����Y�z�q�~�_��-\�p��v�e�]�������M��}b�w������j�������<��^x!��}�{_<8f���� v#F�H���<~�������/��9��C��I�&���X�"��O����~���5ox�QG�����O�.]�t��������E���f���x���7o��| `�q�I'utt|�;��y|��i��������SO=�Ne}���j���7�O�92����={v�d�����?����_'N��~�3��_�w��]�������� �N���/�����'?Y���Q��%����^{m�z�G����������������;������l����D��k:$o��A�o��~������/ �L���8��������*?��s������g���9rd����}������x���x�|���������d��-K
�W^y%~���>?�7�~��������~���� ;�Gy$������~��/~��{���>����vK���?��-��O�����?���O�~����[�xq�z��G���'O��2�+]��� �N�����s��w�};
���wy��<�/��x�'?�I��g��Ot�A���3��C9����.X� ~>�����i���Ov�g�S������/ �LV�\���}��������_������g�yf���o{����c�����i��yK���s���%K��8�.������! ��Z�M���?������Qsx���k����_��W��a�������??����������^x����[�fM����>~���������Ouvv�?�vT�� ���l�����~��������y|���oy�[�����O����#F������o��+V�(o9Q���{S0���_�"�r��g?��xj�� �� v&o{��:::n�������k����%K���w�����f���;.����������zk�d�G������/�_'O��~�S���l��
���_<�����j� ��Hw�>|x��O>�d:j��'�H�~����l��%����������#=���Y��M�=��x����J����Koy�[�����S��^�����G��/ �4�����������>���+��s�U��������=���Y�l��G��d�'{��g��s�xp����7o��r�������a��<e:I������G�����w�3���+z;_ �i�_���_�bGG�.��r�QG}�;�9��c���x������VN��o��C��'?��s��GW�ZUN6m����S|p�gz����/���uuu}��H'�q�������5��| `��y�������������?p������#������-[V?���S�9��}��w�=�<��C�<��t��>������w����������h������[�n���zh��>����O}j��I
��� �����������������/����k���\?�M/-_y�,������<������<����/k���=���/l�}��Y����6>���u���v�����g�����/S�����m����������z�{���'��������m/=�{U�� �V[�z���vm�~]�/f�����w��e�����-�u�����U+_7�u�m��l�~]��������r��������m��+��,V�[�������S��M�u���E��>/ �VpT�N~T ��oxm[�����g�V�w�Y�n��]����!�|7�����e������_���{���-�����.|���[��e��M����G �7G���G� �����y$�1�r������������p����k3*��N�������^�����v1# `k9�d'?��K�=�*���*����;�'��r��'��r���e��_9������6��i�� �Vql�6���[��%�6G�l�;�Q%�zI�N�N� ����m�oG9��K�m��;��TB/I��<u� `����m�o9��K�m��;��TB/I��<u� `��S��� ���PBI*JE� �R����c��Cy(�$��T�j���c�1H��<��P��RQ*@��c�1�$�P�C(IE�(�Z�1��zy(�!���T�� P-��zL=�<���JRQ*JE ��zL=��D�Cy%�(�"