From 404e301dbdb252c23ea9d451b817cf6e372d0d9a Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <byavuz81@gmail.com>
Date: Tue, 7 May 2024 14:16:09 +0300
Subject: [PATCH v5] Use CLONE method with GUC on CREATE DATABASE ...
 STRATEGY=FILE_COPY.

A new GUC, createdb_file_copy_method, is introduced. It can be set to
either COPY (the default) or CLONE (if the system supports it).

If CLONE is chosen, CREATE DATABASE ... STRATEGY=FILE_COPY works as
before, but attempts to use efficient file copying system calls.  The
kernel has the opportunity to share block ranges in copy-on-write file
systems, or maybe push down the copy to network file systems and storage
devices.

Currently works on Linux, FreeBSD and macOS.  More systems could be
supported.

Author: Thomas Munro <thomas.munro@gmail.com>
Author: Nazir Bilal Yavuz <byavuz81@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGLM%2Bt%2BSwBU-cHeMUXJCOgBxSHLGZutV5zCwY4qrCcE02w%40mail.gmail.com
---
 src/include/commands/dbcommands.h             |  9 ++
 src/include/storage/copydir.h                 |  3 +-
 src/backend/commands/dbcommands.c             | 21 ++++-
 src/backend/storage/file/copydir.c            | 83 ++++++++++++++++++-
 .../utils/activity/wait_event_names.txt       |  1 +
 src/backend/utils/misc/guc_tables.c           | 13 +++
 src/backend/utils/misc/postgresql.conf.sample |  5 ++
 doc/src/sgml/config.sgml                      | 17 ++++
 doc/src/sgml/ref/create_database.sgml         | 24 ++++--
 src/tools/pgindent/typedefs.list              |  1 +
 10 files changed, 162 insertions(+), 15 deletions(-)

diff --git a/src/include/commands/dbcommands.h b/src/include/commands/dbcommands.h
index 92e17c71158..2e1d3565edc 100644
--- a/src/include/commands/dbcommands.h
+++ b/src/include/commands/dbcommands.h
@@ -14,6 +14,15 @@
 #ifndef DBCOMMANDS_H
 #define DBCOMMANDS_H
 
+typedef enum CreateDBFileCopyMethod
+{
+	CREATEDB_FILE_COPY_METHOD_COPY, /* files are copied with copy_file() */
+	CREATEDB_FILE_COPY_METHOD_CLONE,	/* files are copied with clone_file() */
+} CreateDBFileCopyMethod;
+
+/* GUC parameters */
+extern PGDLLIMPORT int createdb_file_copy_method;
+
 #include "access/xlogreader.h"
 #include "catalog/objectaddress.h"
 #include "lib/stringinfo.h"
diff --git a/src/include/storage/copydir.h b/src/include/storage/copydir.h
index a25e258f479..9ff28f2eec9 100644
--- a/src/include/storage/copydir.h
+++ b/src/include/storage/copydir.h
@@ -13,7 +13,8 @@
 #ifndef COPYDIR_H
 #define COPYDIR_H
 
-extern void copydir(const char *fromdir, const char *todir, bool recurse);
+extern void copydir(const char *fromdir, const char *todir, bool recurse,
+					bool clone_files);
 extern void copy_file(const char *fromfile, const char *tofile);
 
 #endif							/* COPYDIR_H */
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index be629ea92cf..cf0dff65e69 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -69,6 +69,31 @@
 #include "utils/snapmgr.h"
 #include "utils/syscache.h"
 
+/*
+ * COPYFILE_CLONE_FORCE is defined by <copyfile.h>.  It must be visible here,
+ * otherwise the #if below silently drops the "clone" entry on systems where
+ * clone_file() in copydir.c would use copyfile().
+ */
+#ifdef HAVE_COPYFILE_H
+#include <copyfile.h>
+#endif
+
+/* GUCs */
+int			createdb_file_copy_method = CREATEDB_FILE_COPY_METHOD_COPY;
+
+/*
+ * GUC support: values accepted by createdb_file_copy_method.  "clone" is only
+ * offered where clone_file() has an implementation; keep this condition in
+ * sync with the one in copydir.c.
+ */
+const struct config_enum_entry createdb_file_copy_method_options[] = {
+	{"copy", CREATEDB_FILE_COPY_METHOD_COPY, false},
+#if (defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)) || defined(HAVE_COPY_FILE_RANGE)
+	{"clone", CREATEDB_FILE_COPY_METHOD_CLONE, false},
+#endif
+	{NULL, 0, false}
+};
+
 /*
  * Create database strategy.
  *
@@ -608,7 +622,8 @@ CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid,
 		 *
 		 * We don't need to copy subdirectories
 		 */
-		copydir(srcpath, dstpath, false);
+		copydir(srcpath, dstpath, false,
+				createdb_file_copy_method == CREATEDB_FILE_COPY_METHOD_CLONE);
 
 		/* Record the filesystem change in XLOG */
 		{
@@ -2146,7 +2161,7 @@ movedb(const char *dbname, const char *tblspcname)
 		/*
 		 * Copy files from the old tablespace to the new one
 		 */
-		copydir(src_dbpath, dst_dbpath, false);
+		copydir(src_dbpath, dst_dbpath, false, false);
 
 		/*
 		 * Record the filesystem change in XLOG
@@ -3321,7 +3336,7 @@ dbase_redo(XLogReaderState *record)
 		 *
 		 * We don't need to copy subdirectories
 		 */
-		copydir(src_path, dst_path, false);
+		copydir(src_path, dst_path, false, false);
 
 		pfree(src_path);
 		pfree(dst_path);
diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c
index d4fbe542077..6aae58d6683 100644
--- a/src/backend/storage/file/copydir.c
+++ b/src/backend/storage/file/copydir.c
@@ -21,12 +21,18 @@
 #include <fcntl.h>
 #include <unistd.h>
 
+#ifdef HAVE_COPYFILE_H
+#include <copyfile.h>
+#endif
+
 #include "common/file_utils.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/copydir.h"
 #include "storage/fd.h"
 
+static void clone_file(const char *fromfile, const char *tofile);
+
 /*
  * copydir: copy a directory
  *
@@ -34,7 +40,7 @@
  * a directory or a regular file is ignored.
  */
 void
-copydir(const char *fromdir, const char *todir, bool recurse)
+copydir(const char *fromdir, const char *todir, bool recurse, bool clone_files)
 {
 	DIR		   *xldir;
 	struct dirent *xlde;
@@ -68,10 +74,15 @@ copydir(const char *fromdir, const char *todir, bool recurse)
 		{
 			/* recurse to handle subdirectories */
 			if (recurse)
-				copydir(fromfile, tofile, true);
+				copydir(fromfile, tofile, true, clone_files);
 		}
 		else if (xlde_type == PGFILETYPE_REG)
-			copy_file(fromfile, tofile);
+		{
+			if (clone_files)
+				clone_file(fromfile, tofile);
+			else
+				copy_file(fromfile, tofile);
+		}
 	}
 	FreeDir(xldir);
 
@@ -214,3 +225,69 @@ copy_file(const char *fromfile, const char *tofile)
 
 	pfree(buffer);
 }
+
+/*
+ * clone_file: copy fromfile to tofile with copyfile() or copy_file_range()
+ */
+static void
+clone_file(const char *fromfile, const char *tofile)
+{
+#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
+	if (copyfile(fromfile, tofile, NULL, COPYFILE_CLONE_FORCE) < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not clone file \"%s\" to \"%s\": %m",
+						fromfile, tofile)));
+#elif defined(HAVE_COPY_FILE_RANGE)
+	int			srcfd;
+	int			dstfd;
+	ssize_t		nbytes;
+
+	srcfd = OpenTransientFile(fromfile, O_RDONLY | PG_BINARY);
+	if (srcfd < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", fromfile)));
+
+	dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+	if (dstfd < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not create file \"%s\": %m", tofile)));
+
+	do
+	{
+		/* If we got a cancel signal during the copy of the file, quit */
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * Copy at most 1MB per call so that interrupts get checked regularly
+		 * if this falls back to a slow copy; EINTR just retries the call.
+		 */
+		pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_COPY);
+		nbytes = copy_file_range(srcfd, NULL, dstfd, NULL, 1024 * 1024, 0);
+		if (nbytes < 0 && errno != EINTR)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not clone file \"%s\" to \"%s\": %m",
+							fromfile, tofile)));
+		pgstat_report_wait_end();
+	}
+	while (nbytes != 0);
+
+	if (CloseTransientFile(dstfd) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not close file \"%s\": %m", tofile)));
+
+	if (CloseTransientFile(srcfd) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not close file \"%s\": %m", fromfile)));
+#else
+	/*
+	 * Neither API exists, so "clone" cannot be selected and we're never called.
+	 */
+	pg_unreachable();
+#endif
+}
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index 87cbca28118..91c3d644bec 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -201,6 +201,7 @@ CONTROL_FILE_SYNC	"Waiting for the <filename>pg_control</filename> file to reach
 CONTROL_FILE_SYNC_UPDATE	"Waiting for an update to the <filename>pg_control</filename> file to reach durable storage."
 CONTROL_FILE_WRITE	"Waiting for a write to the <filename>pg_control</filename> file."
 CONTROL_FILE_WRITE_UPDATE	"Waiting for a write to update the <filename>pg_control</filename> file."
+COPY_FILE_COPY	"Waiting for a file copy operation."
 COPY_FILE_READ	"Waiting for a read during a file copy operation."
 COPY_FILE_WRITE	"Waiting for a write during a file copy operation."
 DATA_FILE_EXTEND	"Waiting for a relation data file to be extended."
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index ea2b0577bc6..532ceedbdab 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -39,6 +39,7 @@
 #include "catalog/storage.h"
 #include "commands/async.h"
 #include "commands/event_trigger.h"
+#include "commands/dbcommands.h"
 #include "commands/tablespace.h"
 #include "commands/trigger.h"
 #include "commands/user.h"
@@ -491,6 +492,7 @@ extern const struct config_enum_entry archive_mode_options[];
 extern const struct config_enum_entry recovery_target_action_options[];
 extern const struct config_enum_entry wal_sync_method_options[];
 extern const struct config_enum_entry dynamic_shared_memory_options[];
+extern const struct config_enum_entry createdb_file_copy_method_options[];
 
 /*
  * GUC option variables that are exported from this module
@@ -4974,6 +4976,17 @@ struct config_enum ConfigureNamesEnum[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"createdb_file_copy_method", PGC_POSTMASTER, RESOURCES_DISK,
+			gettext_noop("Selects the copy method implementation used for FILE_COPY strategy in"
+						 " CREATE DATABASE command."),
+			NULL
+		},
+		&createdb_file_copy_method,
+		CREATEDB_FILE_COPY_METHOD_COPY, createdb_file_copy_method_options,
+		NULL, NULL, NULL
+	},
+
 	{
 		{"wal_sync_method", PGC_SIGHUP, WAL_SETTINGS,
 			gettext_noop("Selects the method used for forcing WAL updates to disk."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 83d5df8e460..e270f11a4d8 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -178,6 +178,11 @@
 #max_notify_queue_pages = 1048576	# limits the number of SLRU pages allocated
 					# for NOTIFY / LISTEN queue
 
+#createdb_file_copy_method = copy	# the default is the first option
+					# 	copy
+					# 	clone (if your system supports it)
+					# (change requires restart)
+
 # - Kernel Resources -
 
 #max_files_per_process = 1000		# min 64
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index e93208b2e6a..ff1ae8743cd 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2279,6 +2279,23 @@ include_dir 'conf.d'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-createdb-file-copy-method" xreflabel="createdb_file_copy_method">
+      <term><varname>createdb_file_copy_method</varname> (<type>enum</type>)
+      <indexterm>
+       <primary><varname>createdb_file_copy_method</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies the copy method that the <literal>FILE_COPY</literal>
+        strategy of <command>CREATE DATABASE</command> will use. Possible
+        values are <literal>COPY</literal> (default) and
+        <literal>CLONE</literal> (if your system supports it). For more
+        information, see <xref linkend="create-database-strategy"/>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-max-notify-queue-pages" xreflabel="max_notify_queue_pages">
       <term><varname>max_notify_queue_pages</varname> (<type>integer</type>)
       <indexterm>
diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src/sgml/ref/create_database.sgml
index 7653cb902ee..fae7a1536ae 100644
--- a/doc/src/sgml/ref/create_database.sgml
+++ b/doc/src/sgml/ref/create_database.sgml
@@ -131,14 +131,22 @@ CREATE DATABASE <replaceable class="parameter">name</replaceable>
         to the write-ahead log. This is the most efficient strategy in
         cases where the template database is small, and therefore it is the
         default. The older <literal>FILE_COPY</literal> strategy is also
-        available. This strategy writes a small record to the write-ahead log
-        for each tablespace used by the target database. Each such record
-        represents copying an entire directory to a new location at the
-        filesystem level. While this does reduce the write-ahead
-        log volume substantially, especially if the template database is large,
-        it also forces the system to perform a checkpoint both before and
-        after the creation of the new database. In some situations, this may
-        have a noticeable negative impact on overall system performance.
+        available. This strategy has two methods, <literal>COPY</literal>
+        (the default) and <literal>CLONE</literal>, selected with the
+        <varname>createdb_file_copy_method</varname> setting. Both write a
+        small record to the write-ahead log for each tablespace used by the
+        target database. Each such record represents copying an entire
+        directory to a new location at the filesystem level. While this does
+        reduce the write-ahead log volume substantially, especially if the
+        template database is large, it also forces the system to perform a
+        checkpoint both before and after the creation of the new database. In
+        some situations, this may have a noticeable negative impact on overall
+        system performance. The <literal>CLONE</literal> method is available
+        only on some platforms and file systems.  It works the same way as
+        the <literal>COPY</literal> method, except that it uses fast file
+        cloning or copying system calls that might push down the work of
+        copying to the storage, or use copy-on-write techniques.  The effect on
+        disk space usage and execution time is file system-dependent.
        </para>
       </listitem>
      </varlistentry>
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 2311f82d81e..12820267353 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -500,6 +500,7 @@ CoverPos
 CreateAmStmt
 CreateCastStmt
 CreateConversionStmt
+CreateDBFileCopyMethod
 CreateDBRelInfo
 CreateDBStrategy
 CreateDomainStmt
-- 
2.43.0

