From 7b6a6fe1b555647109caec2817f9200ac8fe9db9 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Tue, 19 Mar 2024 15:16:29 +0100
Subject: [PATCH v20240322] pg_combinebackup - allow using
 clone/copy_file_range

---
 doc/src/sgml/ref/pg_combinebackup.sgml      |  34 +++++
 src/bin/pg_combinebackup/copy_file.c        | 157 +++++++++++++++-----
 src/bin/pg_combinebackup/copy_file.h        |  18 ++-
 src/bin/pg_combinebackup/pg_combinebackup.c |  26 +++-
 src/bin/pg_combinebackup/reconstruct.c      |   5 +-
 src/bin/pg_combinebackup/reconstruct.h      |   5 +-
 6 files changed, 202 insertions(+), 43 deletions(-)

diff --git a/doc/src/sgml/ref/pg_combinebackup.sgml b/doc/src/sgml/ref/pg_combinebackup.sgml
index 8a0a600c2b2..60a60e3fae6 100644
--- a/doc/src/sgml/ref/pg_combinebackup.sgml
+++ b/doc/src/sgml/ref/pg_combinebackup.sgml
@@ -185,6 +185,40 @@ PostgreSQL documentation
       </listitem>
      </varlistentry>
 
+     <varlistentry>
+      <term><option>--clone</option></term>
+      <listitem>
+       <para>
+        Use efficient file cloning (also known as <quote>reflinks</quote> on
+        some systems) instead of copying files to the new cluster.  This can
+        result in near-instantaneous copying of the data files, giving the
+        speed advantages of <option>-k</option>/<option>--link</option> while
+        leaving the old cluster untouched.
+       </para>
+
+       <para>
+        File cloning is only supported on some operating systems and file
+        systems.  If it is selected but not supported, the
+        <application>pg_combinebackup</application> run will error.  At present,
+        it is supported on Linux (kernel 4.5 or later) with Btrfs and XFS (on
+        file systems created with reflink support), and on macOS with APFS.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term><option>--copy-file-range</option></term>
+      <listitem>
+       <para>
+        Use the <function>copy_file_range</function> system call for efficient
+        copying.  On some file systems this gives results similar to
+        <option>--clone</option>, sharing physical disk blocks, while on others
+        it may still copy blocks, but do so via an optimized path.  At present,
+        it is supported on Linux and FreeBSD.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry>
        <term><option>-V</option></term>
        <term><option>--version</option></term>
diff --git a/src/bin/pg_combinebackup/copy_file.c b/src/bin/pg_combinebackup/copy_file.c
index e6d2423278a..cd3b8447308 100644
--- a/src/bin/pg_combinebackup/copy_file.c
+++ b/src/bin/pg_combinebackup/copy_file.c
@@ -14,6 +14,7 @@
 #include <copyfile.h>
 #endif
 #include <fcntl.h>
+#include <limits.h>
 #include <sys/stat.h>
 #include <unistd.h>
 
@@ -24,6 +25,10 @@
 static void copy_file_blocks(const char *src, const char *dst,
 							 pg_checksum_context *checksum_ctx);
 
+static void copy_file_clone(const char *src, const char *dst);
+
+static void copy_file_by_range(const char *src, const char *dst);
+
 #ifdef WIN32
 static void copy_file_copyfile(const char *src, const char *dst);
 #endif
@@ -35,7 +40,8 @@ static void copy_file_copyfile(const char *src, const char *dst);
  */
 void
 copy_file(const char *src, const char *dst,
-		  pg_checksum_context *checksum_ctx, bool dry_run)
+		  pg_checksum_context *checksum_ctx, bool dry_run,
+		  CopyMode copy_mode)
 {
 	/*
 	 * In dry-run mode, we don't actually copy anything, nor do we read any
@@ -55,54 +61,70 @@ copy_file(const char *src, const char *dst,
 	 * If we don't need to compute a checksum, then we can use any special
 	 * operating system primitives that we know about to copy the file; this
 	 * may be quicker than a naive block copy.
+	 *
+	 * On the other hand, if we need to compute a checksum, but the user
+	 * requested a special copy method that does not support this, report
+	 * this and fail.
+	 *
+	 * XXX Why do do this only for WIN32 and not for the other systems? Are
+	 * there some reliability concerns/issues?
+	 *
+	 * XXX Maybe this should simply fall back to the basic copy method, and
+	 * not fail the whole command?
 	 */
 	if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
 	{
-		char	   *strategy_name = NULL;
-		void		(*strategy_implementation) (const char *, const char *) = NULL;
-
 #ifdef WIN32
-		strategy_name = "CopyFile";
-		strategy_implementation = copy_file_copyfile;
+		copy_method = COPY_MODE_COPYFILE;
 #endif
-
-		if (strategy_name != NULL)
-		{
-			if (dry_run)
-				pg_log_debug("would copy \"%s\" to \"%s\" using strategy %s",
-							 src, dst, strategy_name);
-			else
-			{
-				pg_log_debug("copying \"%s\" to \"%s\" using strategy %s",
-							 src, dst, strategy_name);
-				(*strategy_implementation) (src, dst);
-			}
-			return;
-		}
+	}
+	else if (copy_mode != COPY_MODE_COPY)
+	{
+		/* XXX maybe silently fall back to simple copy? */
+		pg_fatal("copy method does not support checksums");
 	}
 
-	/*
-	 * Fall back to the simple approach of reading and writing all the blocks,
-	 * feeding them into the checksum context as we go.
-	 */
+
 	if (dry_run)
 	{
-		if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
-			pg_log_debug("would copy \"%s\" to \"%s\"",
-						 src, dst);
-		else
-			pg_log_debug("would copy \"%s\" to \"%s\" and checksum with %s",
-						 src, dst, pg_checksum_type_name(checksum_ctx->type));
+		switch (copy_mode)
+		{
+			case COPY_MODE_CLONE:
+				pg_log_debug("would copy \"%s\" to \"%s\" (clone)", src, dst);
+				break;
+			case COPY_MODE_COPY:
+				pg_log_debug("would copy \"%s\" to \"%s\"", src, dst);
+				break;
+			case COPY_MODE_COPY_FILE_RANGE:
+				pg_log_debug("would copy \"%s\" to \"%s\" (copy_file_range)",
+							 src, dst);
+#ifdef WIN32
+			case COPY_MODE_COPYFILE:
+				pg_log_debug("would copy \"%s\" to \"%s\" (copyfile)",
+							 src, dst);
+				break;
+#endif
+		}
 	}
 	else
 	{
-		if (checksum_ctx->type == CHECKSUM_TYPE_NONE)
-			pg_log_debug("copying \"%s\" to \"%s\"",
-						 src, dst);
-		else
-			pg_log_debug("copying \"%s\" to \"%s\" and checksumming with %s",
-						 src, dst, pg_checksum_type_name(checksum_ctx->type));
-		copy_file_blocks(src, dst, checksum_ctx);
+		switch (copy_mode)
+		{
+			case COPY_MODE_CLONE:
+				copy_file_clone(src, dst);
+				break;
+			case COPY_MODE_COPY:
+				copy_file_blocks(src, dst, checksum_ctx);
+				break;
+			case COPY_MODE_COPY_FILE_RANGE:
+				copy_file_by_range(src, dst);
+				break;
+#ifdef WIN32
+			case COPY_MODE_COPYFILE:
+				copy_file_copyfile(src, dst);
+				break;
+#endif
+		}
 	}
 }
 
@@ -156,6 +178,67 @@ copy_file_blocks(const char *src, const char *dst,
 	close(dest_fd);
 }
 
+static void
+copy_file_clone(const char *src, const char *dest)
+{
+#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
+	if (copyfile(src, dest, NULL, COPYFILE_CLONE_FORCE) < 0)
+		pg_fatal("error while cloning file \"%s\" to \"%s\": %m", src, dest);
+#elif defined(__linux__) && defined(FICLONE)
+	{
+		if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
+			pg_fatal("could not open file \"%s\": %m", src);
+
+		if ((dest_fd = open(dest, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+							pg_file_create_mode)) < 0)
+			pg_fatal("could not create file \"%s\": %m", dest);
+
+		if (ioctl(dest_fd, FICLONE, src_fd) < 0)
+		{
+			int			save_errno = errno;
+
+			unlink(dest);
+
+			pg_fatal("error while cloning file \"%s\" to \"%s\": %s",
+					 src, dest);
+		}
+	}
+#else
+	pg_fatal("file cloning not supported on this platform");
+#endif
+}
+
+static void
+copy_file_by_range(const char *src, const char *dest)
+{
+#if defined(HAVE_COPY_FILE_RANGE)
+	int			src_fd;
+	int			dest_fd;
+	ssize_t		nbytes;
+
+	if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
+		pg_fatal("could not open file \"%s\": %m", src);
+
+	if ((dest_fd = open(dest, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+						pg_file_create_mode)) < 0)
+		pg_fatal("could not create file \"%s\": %m", dest);
+
+	do
+	{
+		nbytes = copy_file_range(src_fd, NULL, dest_fd, NULL, SSIZE_MAX, 0);
+		if (nbytes < 0)
+			pg_fatal("error while copying file range from \"%s\" to \"%s\": %m",
+					 src, dest);
+	} while (nbytes > 0);
+
+	close(src_fd);
+	close(dest_fd);
+#else
+	pg_fatal("copy_file_range not supported on this platform");
+#endif
+}
+
+/* XXX maybe this should do the check internally, same as the other functions? */
 #ifdef WIN32
 static void
 copy_file_copyfile(const char *src, const char *dst)
diff --git a/src/bin/pg_combinebackup/copy_file.h b/src/bin/pg_combinebackup/copy_file.h
index 0f6bc09403f..3a1c5eb764f 100644
--- a/src/bin/pg_combinebackup/copy_file.h
+++ b/src/bin/pg_combinebackup/copy_file.h
@@ -11,9 +11,25 @@
 #ifndef COPY_FILE_H
 #define COPY_FILE_H
 
+#include "c.h"
 #include "common/checksum_helper.h"
+#include "common/file_utils.h"
+
+/*
+ * Enumeration to denote copy modes
+ */
+typedef enum CopyMode
+{
+	COPY_MODE_CLONE,
+	COPY_MODE_COPY,
+	COPY_MODE_COPY_FILE_RANGE,
+#ifdef WIN32
+	COPY_MODE_COPYFILE,
+#endif
+} CopyMode;
 
 extern void copy_file(const char *src, const char *dst,
-					  pg_checksum_context *checksum_ctx, bool dry_run);
+					  pg_checksum_context *checksum_ctx, bool dry_run,
+					  CopyMode copy_mode);
 
 #endif							/* COPY_FILE_H */
diff --git a/src/bin/pg_combinebackup/pg_combinebackup.c b/src/bin/pg_combinebackup/pg_combinebackup.c
index 74f8be9eeac..fb5e51811bd 100644
--- a/src/bin/pg_combinebackup/pg_combinebackup.c
+++ b/src/bin/pg_combinebackup/pg_combinebackup.c
@@ -69,6 +69,7 @@ typedef struct cb_options
 	pg_checksum_type manifest_checksums;
 	bool		no_manifest;
 	DataDirSyncMethod sync_method;
+	CopyMode	copy_method;
 } cb_options;
 
 /*
@@ -129,6 +130,8 @@ main(int argc, char *argv[])
 		{"manifest-checksums", required_argument, NULL, 1},
 		{"no-manifest", no_argument, NULL, 2},
 		{"sync-method", required_argument, NULL, 3},
+		{"clone", no_argument, NULL, 4},
+		{"copy-file-range", no_argument, NULL, 5},
 		{NULL, 0, NULL, 0}
 	};
 
@@ -156,6 +159,7 @@ main(int argc, char *argv[])
 	memset(&opt, 0, sizeof(opt));
 	opt.manifest_checksums = CHECKSUM_TYPE_CRC32C;
 	opt.sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
+	opt.copy_method = COPY_MODE_COPY;
 
 	/* process command-line options */
 	while ((c = getopt_long(argc, argv, "dnNPo:T:",
@@ -192,6 +196,12 @@ main(int argc, char *argv[])
 				if (!parse_sync_method(optarg, &opt.sync_method))
 					exit(1);
 				break;
+			case 4:
+				opt.copy_method = COPY_MODE_CLONE;
+				break;
+			case 5:
+				opt.copy_method = COPY_MODE_COPY_FILE_RANGE;
+				break;
 			default:
 				/* getopt_long already emitted a complaint */
 				pg_log_error_hint("Try \"%s --help\" for more information.", progname);
@@ -213,6 +223,14 @@ main(int argc, char *argv[])
 	if (opt.no_manifest)
 		opt.manifest_checksums = CHECKSUM_TYPE_NONE;
 
+	/*
+	 * We cannot provide file copy/clone offload in case when we need to
+	 * calculate checksums
+	 */
+	if (opt.copy_method != COPY_MODE_COPY && opt.manifest_checksums != CHECKSUM_TYPE_NONE)
+		pg_fatal("unable to use accelerated copy when manifest checksums "
+				 "are to be calculated. Use --no-manifest");
+
 	/* Read the server version from the final backup. */
 	version = read_pg_version_file(argv[argc - 1]);
 
@@ -696,6 +714,8 @@ help(const char *progname)
 			 "                            use algorithm for manifest checksums\n"));
 	printf(_("      --no-manifest         suppress generation of backup manifest\n"));
 	printf(_("      --sync-method=METHOD  set method for syncing files to disk\n"));
+	printf(_("      --clone               clone (reflink) instead of copying files\n"));
+	printf(_("      --copy-file-range     copy using copy_file_range() syscall\n"));
 	printf(_("  -?, --help                show this help, then exit\n"));
 
 	printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
@@ -937,7 +957,8 @@ process_directory_recursively(Oid tsoid,
 											  &checksum_length,
 											  &checksum_payload,
 											  opt->debug,
-											  opt->dry_run);
+											  opt->dry_run,
+											  opt->copy_method);
 		}
 		else
 		{
@@ -993,7 +1014,8 @@ process_directory_recursively(Oid tsoid,
 
 			/* Actually copy the file. */
 			snprintf(ofullpath, MAXPGPATH, "%s/%s", ofulldir, de->d_name);
-			copy_file(ifullpath, ofullpath, &checksum_ctx, opt->dry_run);
+			copy_file(ifullpath, ofullpath, &checksum_ctx, opt->dry_run,
+					  opt->copy_method);
 
 			/*
 			 * If copy_file() performed a checksum calculation for us, then
diff --git a/src/bin/pg_combinebackup/reconstruct.c b/src/bin/pg_combinebackup/reconstruct.c
index 41f06bb26b5..f5c7af8a23c 100644
--- a/src/bin/pg_combinebackup/reconstruct.c
+++ b/src/bin/pg_combinebackup/reconstruct.c
@@ -90,7 +90,8 @@ reconstruct_from_incremental_file(char *input_filename,
 								  int *checksum_length,
 								  uint8 **checksum_payload,
 								  bool debug,
-								  bool dry_run)
+								  bool dry_run,
+								  CopyMode copy_method)
 {
 	rfile	  **source;
 	rfile	   *latest_source = NULL;
@@ -319,7 +320,7 @@ reconstruct_from_incremental_file(char *input_filename,
 	 */
 	if (copy_source != NULL)
 		copy_file(copy_source->filename, output_filename,
-				  &checksum_ctx, dry_run);
+				  &checksum_ctx, dry_run, copy_method);
 	else
 	{
 		write_reconstructed_file(input_filename, output_filename,
diff --git a/src/bin/pg_combinebackup/reconstruct.h b/src/bin/pg_combinebackup/reconstruct.h
index 8e33a8a95a0..726f94389f3 100644
--- a/src/bin/pg_combinebackup/reconstruct.h
+++ b/src/bin/pg_combinebackup/reconstruct.h
@@ -13,7 +13,9 @@
 #ifndef RECONSTRUCT_H
 #define RECONSTRUCT_H
 
+#include "c.h"
 #include "common/checksum_helper.h"
+#include "common/file_utils.h"
 #include "load_manifest.h"
 
 extern void reconstruct_from_incremental_file(char *input_filename,
@@ -28,6 +30,7 @@ extern void reconstruct_from_incremental_file(char *input_filename,
 											  int *checksum_length,
 											  uint8 **checksum_payload,
 											  bool debug,
-											  bool dry_run);
+											  bool dry_run,
+											  CopyMode copy_mode);
 
 #endif
-- 
2.44.0

