From d89cbae1851627be4e146efedc92ba9d0a67ad6a Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Sun, 30 Apr 2023 11:10:08 +1200
Subject: [PATCH 07/11] Use copy_file_range() to implement copy_file().

If copy_file_range() is available, use it to implement copy_file(), so
that the operating system has opportunities for efficient copying,
block cloning and pushdown.  This affects the commands CREATE DATABASE
STRATEGY=FILE_COPY and ALTER TABLE SET TABLESPACE, which perform bulk
file copies.

On Linux kernels older than 5.3, copy_file_range() fails with EXDEV for
cross-filesystem copies, so we watch for that error and fall back to the
traditional read/write loop.

XXX Should we also let the user opt out?
---
 doc/src/sgml/monitoring.sgml            |  4 ++
 src/backend/storage/file/copydir.c      | 94 +++++++++++++++++++------
 src/backend/utils/activity/wait_event.c |  3 +
 src/include/utils/wait_event.h          |  1 +
 4 files changed, 82 insertions(+), 20 deletions(-)

diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 99f7f95c39..2161b32b17 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -1317,6 +1317,10 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
       <entry>Waiting for a write to update the <filename>pg_control</filename>
        file.</entry>
      </row>
+     <row>
+      <entry><literal>CopyFileRange</literal></entry>
+      <entry>Waiting for a range to be copied during a file copy operation.</entry>
+     </row>
      <row>
       <entry><literal>CopyFileRead</literal></entry>
       <entry>Waiting for a read during a file copy operation.</entry>
diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c
index 82f77536b4..497d357d8c 100644
--- a/src/backend/storage/file/copydir.c
+++ b/src/backend/storage/file/copydir.c
@@ -126,6 +126,14 @@ copy_file(const char *fromfile, const char *tofile)
 	/* Size of copy buffer (read and write requests) */
 #define COPY_BUF_SIZE (8 * BLCKSZ)
 
+	/*
+	 * Size of each range passed to copy_file_range().  We could in theory
+	 * request the whole file at once, but we want to check for interrupts
+	 * periodically while copying.  The chunk must not be too small, though,
+	 * so that the operating system still has a chance to clone large extents.
+	 */
+#define COPY_FILE_RANGE_CHUNK_SIZE (1024 * 1024)
+
 	/*
 	 * Size of data flush requests.  It seems beneficial on most platforms to
 	 * do this every 1MB or so.  But macOS, at least with early releases of
@@ -138,8 +146,13 @@ copy_file(const char *fromfile, const char *tofile)
 #define FLUSH_DISTANCE (1024 * 1024)
 #endif
 
+#ifdef HAVE_COPY_FILE_RANGE
+	/* Don't allocate the buffer unless we have to fall back to read/write. */
+	buffer = NULL;
+#else
 	/* Use palloc to ensure we get a maxaligned buffer */
 	buffer = palloc(COPY_BUF_SIZE);
+#endif
 
 	/*
 	 * Open the files
@@ -176,27 +189,67 @@ copy_file(const char *fromfile, const char *tofile)
 			flush_offset = offset;
 		}
 
-		pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_READ);
-		nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
-		pgstat_report_wait_end();
-		if (nbytes < 0)
-			ereport(ERROR,
-					(errcode_for_file_access(),
-					 errmsg("could not read file \"%s\": %m", fromfile)));
-		if (nbytes == 0)
-			break;
-		errno = 0;
-		pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_WRITE);
-		if ((int) write(dstfd, buffer, nbytes) != nbytes)
+		nbytes = 0;			/* silence compiler */
+
+#ifdef HAVE_COPY_FILE_RANGE
+		if (buffer == NULL)
+		{
+			pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_RANGE);
+			nbytes = copy_file_range(srcfd, NULL, dstfd, NULL,
+									 COPY_FILE_RANGE_CHUNK_SIZE, 0);
+			pgstat_report_wait_end();
+
+			if (nbytes < 0)
+			{
+				if (errno == EXDEV)
+				{
+					/*
+					 * Linux < 5.3 fails like this for cross-filesystem copies.
+					 * Allocate the buffer to fall back to read/write mode.
+					 */
+					buffer = palloc(COPY_BUF_SIZE);
+				}
+				else
+					ereport(ERROR,
+							(errcode_for_file_access(),
+							 errmsg("could not copy to file \"%s\": %m", tofile)));
+			}
+		}
+#endif
+
+		if (buffer)
 		{
-			/* if write didn't set errno, assume problem is no disk space */
-			if (errno == 0)
-				errno = ENOSPC;
-			ereport(ERROR,
-					(errcode_for_file_access(),
-					 errmsg("could not write to file \"%s\": %m", tofile)));
+			pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_READ);
+			nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
+			pgstat_report_wait_end();
+
+			if (nbytes < 0)
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not read file \"%s\": %m", fromfile)));
+
+			if (nbytes > 0)
+			{
+				errno = 0;
+				pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_WRITE);
+				if ((int) write(dstfd, buffer, nbytes) != nbytes)
+				{
+					/*
+					 * If write didn't set errno, assume problem is no disk
+					 * space.
+					 */
+					if (errno == 0)
+						errno = ENOSPC;
+					ereport(ERROR,
+							(errcode_for_file_access(),
+							 errmsg("could not write to file \"%s\": %m", tofile)));
+				}
+				pgstat_report_wait_end();
+			}
 		}
-		pgstat_report_wait_end();
+
+		if (nbytes == 0)
+			break;
 	}
 
 	if (offset > flush_offset)
@@ -212,5 +265,6 @@ copy_file(const char *fromfile, const char *tofile)
 				(errcode_for_file_access(),
 				 errmsg("could not close file \"%s\": %m", fromfile)));
 
-	pfree(buffer);
+	if (buffer)
+		pfree(buffer);
 }
diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c
index 7940d64639..9c3cd088c0 100644
--- a/src/backend/utils/activity/wait_event.c
+++ b/src/backend/utils/activity/wait_event.c
@@ -567,6 +567,9 @@ pgstat_get_wait_io(WaitEventIO w)
 		case WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE:
 			event_name = "ControlFileWriteUpdate";
 			break;
+		case WAIT_EVENT_COPY_FILE_RANGE:
+			event_name = "CopyFileRange";
+			break;
 		case WAIT_EVENT_COPY_FILE_READ:
 			event_name = "CopyFileRead";
 			break;
diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h
index 518d3b0a1f..517de1544b 100644
--- a/src/include/utils/wait_event.h
+++ b/src/include/utils/wait_event.h
@@ -172,6 +172,7 @@ typedef enum
 	WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE,
 	WAIT_EVENT_CONTROL_FILE_WRITE,
 	WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE,
+	WAIT_EVENT_COPY_FILE_RANGE,
 	WAIT_EVENT_COPY_FILE_READ,
 	WAIT_EVENT_COPY_FILE_WRITE,
 	WAIT_EVENT_DATA_FILE_EXTEND,
-- 
2.40.1

