From 2f58f6af27c94d782925b7f38ea8845cdfb3e0d2 Mon Sep 17 00:00:00 2001 From: Jakub Wartak Date: Thu, 4 Jan 2024 19:45:30 +0100 Subject: [PATCH v3 2/4] Confine various OS copy-on-write (and other copy acceleration) methods via new internal pg_copyfile_* APIs in libpqcommon. Later refactor pg_upgrade to use those APIs for ioctl(FICLONE). Co-authored-by: Thomas Munro Discussion: https://www.postgresql.org/message-id/flat/CA%2BhUKGJvLLNQtzb%3DZWcTsYF8kv8cR_%3DH17CX-eL8qNixeC4DAw%40mail.gmail.com#ce606227e39df74c6b2abf80b8eab04a --- src/bin/pg_upgrade/file.c | 252 ++++++++----------------- src/bin/pg_upgrade/relfilenumber.c | 81 ++++---- src/common/file_utils.c | 288 +++++++++++++++++++++++------ src/include/common/file_utils.h | 37 +++- 4 files changed, 380 insertions(+), 278 deletions(-) diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c index d173602882..f91cc548ce 100644 --- a/src/bin/pg_upgrade/file.c +++ b/src/bin/pg_upgrade/file.c @@ -1,31 +1,23 @@ /* - * file.c + * file.c * - * file system operations + * file system operations * - * Copyright (c) 2010-2023, PostgreSQL Global Development Group - * src/bin/pg_upgrade/file.c + * Copyright (c) 2010-2023, PostgreSQL Global Development Group + * src/bin/pg_upgrade/file.c */ #include "postgres_fe.h" -#include -#include -#ifdef HAVE_COPYFILE_H -#include -#endif -#ifdef __linux__ -#include -#include -#endif - #include "access/visibilitymapdefs.h" #include "common/file_perm.h" +#include "common/file_utils.h" #include "pg_upgrade.h" #include "storage/bufpage.h" #include "storage/checksum.h" #include "storage/checksum_impl.h" - +#include +#include /* * cloneFile() @@ -35,127 +27,49 @@ * schemaName/relName are relation's SQL name (used for error messages only). */ void -cloneFile(const char *src, const char *dst, - const char *schemaName, const char *relName) +cloneFile(const char *src, const char *dst, const char *schemaName, + const char *relName) { -#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE) - if (copyfile(src, dst, NULL, COPYFILE_CLONE_FORCE) < 0) - pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s", - schemaName, relName, src, dst, strerror(errno)); -#elif defined(__linux__) && defined(FICLONE) - int src_fd; - int dest_fd; - - if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) - pg_fatal("error while cloning relation \"%s.%s\": could not open file \"%s\": %s", - schemaName, relName, src, strerror(errno)); - - if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, - pg_file_create_mode)) < 0) - pg_fatal("error while cloning relation \"%s.%s\": could not create file \"%s\": %s", - schemaName, relName, dst, strerror(errno)); - - if (ioctl(dest_fd, FICLONE, src_fd) < 0) - { - int save_errno = errno; - - unlink(dst); + char action[1024]; - pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s", - schemaName, relName, src, dst, strerror(save_errno)); - } - - close(src_fd); - close(dest_fd); -#endif + snprintf(action, sizeof(action) - 1, "relation \"%s.%s\"", schemaName, + relName); + pg_copyfile_offload(src, dst, action, PG_COPYFILE_IOCTL_FICLONE); } - /* * copyFile() * - * Copies a relation file from src to dst. - * schemaName/relName are relation's SQL name (used for error messages only). + * Copies a relation file from src to dst. schemaName/relName are relation's + * SQL name (used for error messages only). */ void -copyFile(const char *src, const char *dst, - const char *schemaName, const char *relName) +copyFile(const char *src, const char *dst, const char *schemaName, + const char *relName) { -#ifndef WIN32 - int src_fd; - int dest_fd; - char *buffer; - - if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) - pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s", - schemaName, relName, src, strerror(errno)); - - if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, - pg_file_create_mode)) < 0) - pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s", - schemaName, relName, dst, strerror(errno)); - - /* copy in fairly large chunks for best efficiency */ -#define COPY_BUF_SIZE (50 * BLCKSZ) - - buffer = (char *) pg_malloc(COPY_BUF_SIZE); - - /* perform data copying i.e read src source, write to destination */ - while (true) - { - ssize_t nbytes = read(src_fd, buffer, COPY_BUF_SIZE); + char action[128]; - if (nbytes < 0) - pg_fatal("error while copying relation \"%s.%s\": could not read file \"%s\": %s", - schemaName, relName, src, strerror(errno)); - - if (nbytes == 0) - break; - - errno = 0; - if (write(dest_fd, buffer, nbytes) != nbytes) - { - /* if write didn't set errno, assume problem is no disk space */ - if (errno == 0) - errno = ENOSPC; - pg_fatal("error while copying relation \"%s.%s\": could not write file \"%s\": %s", - schemaName, relName, dst, strerror(errno)); - } - } - - pg_free(buffer); - close(src_fd); - close(dest_fd); - -#else /* WIN32 */ - - if (CopyFile(src, dst, true) == 0) - { - _dosmaperr(GetLastError()); - pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s", - schemaName, relName, src, dst, strerror(errno)); - } - -#endif /* WIN32 */ + snprintf(action, sizeof(action) - 1, "relation \"%s.%s\"", schemaName, + relName); + pg_copyfile(src, dst, action, NULL); } - /* * linkFile() * - * Hard-links a relation file from src to dst. - * schemaName/relName are relation's SQL name (used for error messages only). + * Hard-links a relation file from src to dst. schemaName/relName are + * relation's SQL name (used for error messages only). */ void -linkFile(const char *src, const char *dst, - const char *schemaName, const char *relName) +linkFile(const char *src, const char *dst, const char *schemaName, + const char *relName) { if (link(src, dst) < 0) - pg_fatal("error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s", + pg_fatal("error while creating link for relation \"%s.%s\" (\"%s\" to " + "\"%s\"): %s", schemaName, relName, src, dst, strerror(errno)); } - /* * rewriteVisibilityMap() * @@ -163,14 +77,14 @@ linkFile(const char *src, const char *dst, * schemaName/relName are relation's SQL name (used for error messages only). * * In versions of PostgreSQL prior to catversion 201603011, PostgreSQL's - * visibility map included one bit per heap page; it now includes two. - * When upgrading a cluster from before that time to a current PostgreSQL - * version, we could refuse to copy visibility maps from the old cluster - * to the new cluster; the next VACUUM would recreate them, but at the - * price of scanning the entire table. So, instead, we rewrite the old - * visibility maps in the new format. That way, the all-visible bits - * remain set for the pages for which they were set previously. The - * all-frozen bits are never set by this conversion; we leave that to VACUUM. + * visibility map included one bit per heap page; it now includes two. When + * upgrading a cluster from before that time to a current PostgreSQL version, + * we could refuse to copy visibility maps from the old cluster to the new + * cluster; the next VACUUM would recreate them, but at the price of scanning + * the entire table. So, instead, we rewrite the old visibility maps in the + * new format. That way, the all-visible bits remain set for the pages for + * which they were set previously. The all-frozen bits are never set by this + * conversion; we leave that to VACUUM. */ void rewriteVisibilityMap(const char *fromfile, const char *tofile, @@ -190,16 +104,19 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile, rewriteVmBytesPerPage = (BLCKSZ - SizeOfPageHeaderData) / 2; if ((src_fd = open(fromfile, O_RDONLY | PG_BINARY, 0)) < 0) - pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s", + pg_fatal("error while copying relation \"%s.%s\": could not open file " + "\"%s\": %s", schemaName, relName, fromfile, strerror(errno)); if (fstat(src_fd, &statbuf) != 0) - pg_fatal("error while copying relation \"%s.%s\": could not stat file \"%s\": %s", + pg_fatal("error while copying relation \"%s.%s\": could not stat file " + "\"%s\": %s", schemaName, relName, fromfile, strerror(errno)); if ((dst_fd = open(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, pg_file_create_mode)) < 0) - pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s", + pg_fatal("error while copying relation \"%s.%s\": could not create file " + "\"%s\": %s", schemaName, relName, tofile, strerror(errno)); /* Save old file size */ @@ -223,10 +140,12 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile, if ((bytesRead = read(src_fd, buffer.data, BLCKSZ)) != BLCKSZ) { if (bytesRead < 0) - pg_fatal("error while copying relation \"%s.%s\": could not read file \"%s\": %s", + pg_fatal("error while copying relation \"%s.%s\": could not read file " + "\"%s\": %s", schemaName, relName, fromfile, strerror(errno)); else - pg_fatal("error while copying relation \"%s.%s\": partial page found in file \"%s\"", + pg_fatal("error while copying relation \"%s.%s\": partial page found " + "in file \"%s\"", schemaName, relName, fromfile); } @@ -260,25 +179,30 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile, new_cur = new_vmbuf.data + SizeOfPageHeaderData; - /* Process old page bytes one by one, and turn it into new page. */ + /* + * Process old page bytes one by one, and turn it into new page. + */ while (old_cur < old_break) { uint8 byte = *(uint8 *) old_cur; uint16 new_vmbits = 0; int i; - /* Generate new format bits while keeping old information */ + /* + * Generate new format bits while keeping old information + */ for (i = 0; i < BITS_PER_BYTE; i++) { if (byte & (1 << i)) { empty = false; - new_vmbits |= - VISIBILITYMAP_ALL_VISIBLE << (BITS_PER_HEAPBLOCK * i); + new_vmbits |= VISIBILITYMAP_ALL_VISIBLE << (BITS_PER_HEAPBLOCK * i); } } - /* Copy new visibility map bytes to new-format page */ + /* + * Copy new visibility map bytes to new-format page + */ new_cur[0] = (char) (new_vmbits & 0xFF); new_cur[1] = (char) (new_vmbits >> 8); @@ -286,11 +210,15 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile, new_cur += BITS_PER_HEAPBLOCK; } - /* If the last part of the last page is empty, skip writing it */ + /* + * If the last part of the last page is empty, skip writing it + */ if (old_lastpart && empty) break; - /* Set new checksum for visibility map page, if enabled */ + /* + * Set new checksum for visibility map page, if enabled + */ if (new_cluster.controldata.data_checksum_version != 0) ((PageHeader) new_vmbuf.data)->pd_checksum = pg_checksum_page(new_vmbuf.data, new_blkno); @@ -298,10 +226,13 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile, errno = 0; if (write(dst_fd, new_vmbuf.data, BLCKSZ) != BLCKSZ) { - /* if write didn't set errno, assume problem is no disk space */ + /* + * if write didn't set errno, assume problem is no disk space + */ if (errno == 0) errno = ENOSPC; - pg_fatal("error while copying relation \"%s.%s\": could not write file \"%s\": %s", + pg_fatal("error while copying relation \"%s.%s\": could not write file " + "\"%s\": %s", schemaName, relName, tofile, strerror(errno)); } @@ -322,40 +253,15 @@ check_file_clone(void) char existing_file[MAXPGPATH]; char new_link_file[MAXPGPATH]; - snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata); - snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.clonetest", new_cluster.pgdata); + snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", + old_cluster.pgdata); + snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.clonetest", + new_cluster.pgdata); unlink(new_link_file); /* might fail */ -#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE) - if (copyfile(existing_file, new_link_file, NULL, COPYFILE_CLONE_FORCE) < 0) - pg_fatal("could not clone file between old and new data directories: %s", - strerror(errno)); -#elif defined(__linux__) && defined(FICLONE) - { - int src_fd; - int dest_fd; - - if ((src_fd = open(existing_file, O_RDONLY | PG_BINARY, 0)) < 0) - pg_fatal("could not open file \"%s\": %s", - existing_file, strerror(errno)); - - if ((dest_fd = open(new_link_file, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, - pg_file_create_mode)) < 0) - pg_fatal("could not create file \"%s\": %s", - new_link_file, strerror(errno)); - - if (ioctl(dest_fd, FICLONE, src_fd) < 0) - pg_fatal("could not clone file between old and new data directories: %s", - strerror(errno)); - - close(src_fd); - close(dest_fd); - } -#else - pg_fatal("file cloning not supported on this platform"); -#endif - - unlink(new_link_file); + /* will throw error in case it is not supported */ + pg_copyfile_offload_supported(existing_file, new_link_file, NULL, + PG_COPYFILE_IOCTL_FICLONE); } void @@ -364,13 +270,17 @@ check_hard_link(void) char existing_file[MAXPGPATH]; char new_link_file[MAXPGPATH]; - snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata); - snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.linktest", new_cluster.pgdata); + snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", + old_cluster.pgdata); + snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.linktest", + new_cluster.pgdata); unlink(new_link_file); /* might fail */ if (link(existing_file, new_link_file) < 0) - pg_fatal("could not create hard link between old and new data directories: %s\n" - "In link mode the old and new data directories must be on the same file system.", + pg_fatal( + "could not create hard link between old and new data directories: %s\n" + "In link mode the old and new data directories must be on the same " + "file system.", strerror(errno)); unlink(new_link_file); diff --git a/src/bin/pg_upgrade/relfilenumber.c b/src/bin/pg_upgrade/relfilenumber.c index 34bc9c1504..d61fb77bdf 100644 --- a/src/bin/pg_upgrade/relfilenumber.c +++ b/src/bin/pg_upgrade/relfilenumber.c @@ -15,15 +15,16 @@ #include "catalog/pg_class_d.h" #include "pg_upgrade.h" -static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace); -static void transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit); - +static void transfer_single_new_db(FileNameMap *maps, int size, + char *old_tablespace); +static void transfer_relfile(FileNameMap *map, const char *type_suffix, + bool vm_must_add_frozenbit); /* * transfer_all_new_tablespaces() * - * Responsible for upgrading all database. invokes routines to generate mappings and then - * physically link the databases. + * Responsible for upgrading all database. invokes routines to generate mappings + * and then physically link the databases. */ void transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, @@ -40,6 +41,9 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, case TRANSFER_MODE_LINK: prep_status_progress("Linking user relation files"); break; + case TRANSFER_MODE_COPY_FILE_RANGE: + prep_status_progress("Copying user relation files with copy_file_range"); + break; } /* @@ -61,9 +65,7 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, new_pgdata, old_pgdata); for (tblnum = 0; tblnum < os_info.num_old_tablespaces; tblnum++) - parallel_transfer_all_new_dbs(old_db_arr, - new_db_arr, - old_pgdata, + parallel_transfer_all_new_dbs(old_db_arr, new_db_arr, old_pgdata, new_pgdata, os_info.old_tablespaces[tblnum]); /* reap all children */ @@ -75,23 +77,22 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, check_ok(); } - /* * transfer_all_new_dbs() * - * Responsible for upgrading all database. invokes routines to generate mappings and then - * physically link the databases. + * Responsible for upgrading all database. invokes routines to generate mappings + * and then physically link the databases. */ void transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, - char *old_pgdata, char *new_pgdata, char *old_tablespace) + char *old_pgdata, char *new_pgdata, + char *old_tablespace) { int old_dbnum, new_dbnum; /* Scan the old cluster databases and transfer their files */ - for (old_dbnum = new_dbnum = 0; - old_dbnum < old_db_arr->ndbs; + for (old_dbnum = new_dbnum = 0; old_dbnum < old_db_arr->ndbs; old_dbnum++, new_dbnum++) { DbInfo *old_db = &old_db_arr->dbs[old_dbnum], @@ -115,8 +116,8 @@ transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, pg_fatal("old database \"%s\" not found in the new cluster", old_db->db_name); - mappings = gen_db_file_maps(old_db, new_db, &n_maps, old_pgdata, - new_pgdata); + mappings = + gen_db_file_maps(old_db, new_db, &n_maps, old_pgdata, new_pgdata); if (n_maps) { transfer_single_new_db(mappings, n_maps, old_tablespace); @@ -132,7 +133,8 @@ transfer_all_new_dbs(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, * create links for mappings stored in "maps" array. */ static void -transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) +transfer_single_new_db(FileNameMap *maps, int size, + char *old_tablespace) { int mapnum; bool vm_must_add_frozenbit = false; @@ -161,7 +163,6 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) } } - /* * transfer_relfile() * @@ -170,7 +171,8 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace) * mode. */ static void -transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit) +transfer_relfile(FileNameMap *map, const char *type_suffix, + bool vm_must_add_frozenbit) { char old_file[MAXPGPATH]; char new_file[MAXPGPATH]; @@ -190,20 +192,12 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro else snprintf(extent_suffix, sizeof(extent_suffix), ".%d", segno); - snprintf(old_file, sizeof(old_file), "%s%s/%u/%u%s%s", - map->old_tablespace, - map->old_tablespace_suffix, - map->db_oid, - map->relfilenumber, - type_suffix, - extent_suffix); - snprintf(new_file, sizeof(new_file), "%s%s/%u/%u%s%s", - map->new_tablespace, - map->new_tablespace_suffix, - map->db_oid, - map->relfilenumber, - type_suffix, - extent_suffix); + snprintf(old_file, sizeof(old_file), "%s%s/%u/%u%s%s", map->old_tablespace, + map->old_tablespace_suffix, map->db_oid, map->relfilenumber, + type_suffix, extent_suffix); + snprintf(new_file, sizeof(new_file), "%s%s/%u/%u%s%s", map->new_tablespace, + map->new_tablespace_suffix, map->db_oid, map->relfilenumber, + type_suffix, extent_suffix); /* Is it an extent, fsm, or vm file? */ if (type_suffix[0] != '\0' || segno != 0) @@ -215,7 +209,8 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro if (errno == ENOENT) return; else - pg_fatal("error while checking for file existence \"%s.%s\" (\"%s\" to \"%s\"): %s", + pg_fatal("error while checking for file existence \"%s.%s\" (\"%s\" " + "to \"%s\"): %s", map->nspname, map->relname, old_file, new_file, strerror(errno)); } @@ -233,27 +228,29 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro if (vm_must_add_frozenbit && strcmp(type_suffix, "_vm") == 0) { /* Need to rewrite visibility map format */ - pg_log(PG_VERBOSE, "rewriting \"%s\" to \"%s\"", - old_file, new_file); + pg_log(PG_VERBOSE, "rewriting \"%s\" to \"%s\"", old_file, new_file); rewriteVisibilityMap(old_file, new_file, map->nspname, map->relname); } else switch (user_opts.transfer_mode) { case TRANSFER_MODE_CLONE: - pg_log(PG_VERBOSE, "cloning \"%s\" to \"%s\"", - old_file, new_file); + pg_log(PG_VERBOSE, "cloning \"%s\" to \"%s\"", old_file, new_file); cloneFile(old_file, new_file, map->nspname, map->relname); break; case TRANSFER_MODE_COPY: - pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"", - old_file, new_file); + pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"", old_file, new_file); copyFile(old_file, new_file, map->nspname, map->relname); break; case TRANSFER_MODE_LINK: - pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"", - old_file, new_file); + pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"", old_file, new_file); linkFile(old_file, new_file, map->nspname, map->relname); + break; + case TRANSFER_MODE_COPY_FILE_RANGE: + pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\" with copy_file_range", + old_file, new_file); + copyFileByRange(old_file, new_file, map->nspname, map->relname); + break; } } } diff --git a/src/common/file_utils.c b/src/common/file_utils.c index 5380299f35..25f9a48b21 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -24,10 +24,19 @@ #include #include +#include "common/file_perm.h" #include "common/file_utils.h" #ifdef FRONTEND +#include "common/checksum_helper.h" #include "common/logging.h" +#ifdef HAVE_COPYFILE_H +#include #endif +#ifdef __linux__ +#include +#include +#endif +#endif /* FRONTEND */ #include "port/pg_iovec.h" #ifdef FRONTEND @@ -42,7 +51,7 @@ /* * pg_xlog has been renamed to pg_wal in version 10. */ -#define MINIMUM_VERSION_FOR_PG_WAL 100000 +#define MINIMUM_VERSION_FOR_PG_WAL 100000 #ifdef PG_FLUSH_DATA_WORKS static int pre_sync_fname(const char *fname, bool isdir); @@ -94,8 +103,7 @@ do_syncfs(const char *path) * serverVersion indicates the version of the server to be sync'd. */ void -sync_pgdata(const char *pg_data, - int serverVersion, +sync_pgdata(const char *pg_data, int serverVersion, DataDirSyncMethod sync_method) { bool xlog_is_symlink; @@ -127,8 +135,7 @@ sync_pgdata(const char *pg_data, case DATA_DIR_SYNC_METHOD_SYNCFS: { #ifndef HAVE_SYNCFS - pg_log_error("this build does not support sync method \"%s\"", - "syncfs"); + pg_log_error("this build does not support sync method \"%s\"", "syncfs"); exit(EXIT_FAILURE); #else DIR *dir; @@ -145,29 +152,27 @@ sync_pgdata(const char *pg_data, /* Sync the top level pgdata directory. */ do_syncfs(pg_data); - /* If any tablespaces are configured, sync each of those. */ + /* + * If any tablespaces are configured, sync each of those. + */ dir = opendir(pg_tblspc); if (dir == NULL) - pg_log_error("could not open directory \"%s\": %m", - pg_tblspc); + pg_log_error("could not open directory \"%s\": %m", pg_tblspc); else { while (errno = 0, (de = readdir(dir)) != NULL) { char subpath[MAXPGPATH * 2]; - if (strcmp(de->d_name, ".") == 0 || - strcmp(de->d_name, "..") == 0) + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) continue; - snprintf(subpath, sizeof(subpath), "%s/%s", - pg_tblspc, de->d_name); + snprintf(subpath, sizeof(subpath), "%s/%s", pg_tblspc, de->d_name); do_syncfs(subpath); } if (errno) - pg_log_error("could not read directory \"%s\": %m", - pg_tblspc); + pg_log_error("could not read directory \"%s\": %m", pg_tblspc); (void) closedir(dir); } @@ -176,8 +181,7 @@ sync_pgdata(const char *pg_data, if (xlog_is_symlink) do_syncfs(pg_wal); #endif /* HAVE_SYNCFS */ - } - break; + } break; case DATA_DIR_SYNC_METHOD_FSYNC: { @@ -206,8 +210,7 @@ sync_pgdata(const char *pg_data, if (xlog_is_symlink) walkdir(pg_wal, fsync_fname, false); walkdir(pg_tblspc, fsync_fname, true); - } - break; + } break; } } @@ -224,8 +227,7 @@ sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method) case DATA_DIR_SYNC_METHOD_SYNCFS: { #ifndef HAVE_SYNCFS - pg_log_error("this build does not support sync method \"%s\"", - "syncfs"); + pg_log_error("this build does not support sync method \"%s\"", "syncfs"); exit(EXIT_FAILURE); #else /* @@ -234,8 +236,7 @@ sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method) */ do_syncfs(dir); #endif /* HAVE_SYNCFS */ - } - break; + } break; case DATA_DIR_SYNC_METHOD_FSYNC: { @@ -248,20 +249,19 @@ sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method) #endif walkdir(dir, fsync_fname, false); - } - break; + } break; } } /* - * walkdir: recursively walk a directory, applying the action to each - * regular file and directory (including the named directory itself). + * walkdir: recursively walk a directory, applying the action to each regular + * file and directory (including the named directory itself). * - * If process_symlinks is true, the action and recursion are also applied - * to regular files and directories that are pointed to by symlinks in the - * given directory; otherwise symlinks are ignored. Symlinks are always - * ignored in subdirectories, ie we intentionally don't pass down the - * process_symlinks flag to recursive calls. + * If process_symlinks is true, the action and recursion are also applied to + * regular files and directories that are pointed to by symlinks in the given + * directory; otherwise symlinks are ignored. Symlinks are always ignored in + * subdirectories, ie we intentionally don't pass down the process_symlinks + * flag to recursive calls. * * Errors are reported but not considered fatal. * @@ -286,8 +286,7 @@ walkdir(const char *path, { char subpath[MAXPGPATH * 2]; - if (strcmp(de->d_name, ".") == 0 || - strcmp(de->d_name, "..") == 0) + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) continue; snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name); @@ -371,8 +370,8 @@ pre_sync_fname(const char *fname, bool isdir) * fsync_fname -- Try to fsync a file or directory * * Ignores errors trying to open unreadable files, or trying to fsync - * directories on systems where that isn't allowed/required. All other errors - * are fatal. + * directories on systems where that isn't allowed/required. All other + * errors are fatal. */ int fsync_fname(const char *fname, bool isdir) @@ -427,8 +426,8 @@ fsync_fname(const char *fname, bool isdir) /* * fsync_parent_path -- fsync the parent path of a file or directory * - * This is aimed at making file operations persistent on disk in case of - * an OS crash or power failure. + * This is aimed at making file operations persistent on disk in case of an + * OS crash or power failure. */ int fsync_parent_path(const char *fname) @@ -453,7 +452,8 @@ fsync_parent_path(const char *fname) } /* - * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability + * durable_rename -- rename(2) wrapper, issuing fsyncs required for + * durability * * Wrapper around rename, similar to the backend version. */ @@ -495,8 +495,8 @@ durable_rename(const char *oldfile, const char *newfile) /* Time to do the real deal... */ if (rename(oldfile, newfile) != 0) { - pg_log_error("could not rename file \"%s\" to \"%s\": %m", - oldfile, newfile); + pg_log_error("could not rename file \"%s\" to \"%s\": %m", oldfile, + newfile); return -1; } @@ -513,6 +513,186 @@ durable_rename(const char *oldfile, const char *newfile) return 0; } +/* Helper function to optionally prepend error string */ +static inline char * +opt_errinfo(const char *addon_errmsg) +{ + char buf[128]; + + if (addon_errmsg == NULL) + return ""; + + strcpy(buf, " "); + return strncat(buf, addon_errmsg, sizeof(buf) - 2); +} + +/* + * Copies a relation file from src to dest. addon_errmsg is an optional + * addon error message (can be NULL or include schema/relName) + */ +void +pg_copyfile(const char *src, const char *dest, const char *addon_errmsg, + pg_checksum_context *ctx) +{ +#ifndef WIN32 + int src_fd; + int dest_fd; + uint8 *buffer; + + /* copy in fairly large chunks for best efficiency */ + const int buffer_size = 50 * BLCKSZ; + + if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("error while copying%s: could not open file \"%s\": %s", + opt_errinfo(addon_errmsg), src, strerror(errno)); + + if ((dest_fd = open(dest, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("error while copying%s: could not create file \"%s\": %s", + opt_errinfo(addon_errmsg), dest, strerror(errno)); + + buffer = pg_malloc(buffer_size); + + /* perform data copying i.e read src source, write to destination */ + while (true) + { + ssize_t nbytes = read(src_fd, buffer, buffer_size); + + if (nbytes < 0) + pg_fatal("error while copying%s: could not read file " + "\"%s\": %s", + opt_errinfo(addon_errmsg), src, strerror(errno)); + + if (nbytes == 0) + break; + + errno = 0; + if (write(dest_fd, buffer, nbytes) != nbytes) + { + /* + * if write didn't set errno, assume problem is no disk space + */ + if (errno == 0) + errno = ENOSPC; + pg_fatal("error while copying%s: could not write file \"%s\": %s", + opt_errinfo(addon_errmsg), dest, strerror(errno)); + } + + if (pg_checksum_update(ctx, buffer, nbytes) < 0) + pg_fatal("could not calculate checksum of file \"%s\"", dest); + } + + pg_free(buffer); + close(src_fd); + close(dest_fd); + +#else /* WIN32 */ + if (CopyFile(src, dest, true) == 0) + { + _dosmaperr(GetLastError()); + pg_fatal("error while copying%s (\"%s\" to \"%s\"): %s", addon_errmsg, + opt_errinfo(addon_errmsg), src, dest, strerror(errno)); + } +#endif /* WIN32 */ +} + +/* + * pg_copyfile_offload() + * + * Clones/reflinks a relation file from src to dest using variety of methods + * + * addon_errmsg can be used to pass additional information in case of errors. + * flags, see PG_COPYFILE_* enum in file_utils.h + */ +void +pg_copyfile_offload(const char *src, const char *dest, + const char *addon_errmsg, CopyFileMethod flags) +{ + +#ifdef WIN32 + /* on WIN32 we ignore flags, we have no other choice */ + if (CopyFile(src, dest, true) == 0) + { + _dosmaperr(GetLastError()); + pg_fatal("error while copying%s (\"%s\" to \"%s\"): %s", addon_errmsg, + opt_errinfo(addon_errmsg), src, dest, strerror(errno)); + } +#elif defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE) + /* on MacOS we ignore flags, we have no other choice */ + if (copyfile(src, dest, NULL, COPYFILE_CLONE_FORCE) < 0) + pg_fatal("error while cloning%s: (\"%s\" to \"%s\"): %s", + opt_errinfo(addon_errmsg), src, dest, strerror(errno)); + +#elif defined(HAVE_COPY_FILE_RANGE) || defined(FICLONE) + int src_fd; + int dest_fd; + ssize_t nbytes; + + if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("error while copying%s: could not open file \"%s\": %s", + opt_errinfo(addon_errmsg), src, strerror(errno)); + + if ((dest_fd = open(dest, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("error while copying%s: could not create file \"%s\": %s", + opt_errinfo(addon_errmsg), dest, strerror(errno)); + + if (flags & PG_COPYFILE_COPY_FILE_RANGE) + { +#ifdef HAVE_COPY_FILE_RANGE + do + { + nbytes = copy_file_range(src_fd, NULL, dest_fd, NULL, SSIZE_MAX, 0); + if (nbytes < 0 && errno != EINTR) + pg_fatal("error while copying%s: could not copy_file_range()" + "from \"%s\" to \"%s\": %s", + opt_errinfo(addon_errmsg), src, dest, strerror(errno)); + } while (nbytes > 0); +#else + pg_fatal("copy file accelaration via copy_file_range() is not supported on " + "this platform"); +#endif + } + else if (flags & PG_COPYFILE_IOCTL_FICLONE) + { +#if defined(__linux__) && defined(FICLONE) + if (ioctl(dest_fd, FICLONE, src_fd) < 0) + { + int save_errno = errno; + + unlink(dest); + + pg_fatal("error while cloning%s: (\"%s\" to \"%s\"): %s", + opt_errinfo(addon_errmsg), src, dest, strerror(save_errno)); + } +#else + pg_fatal("clone file accelaration via ioctl(FICLONE) is not supported on " + "this platform"); +#endif + } + + close(src_fd); + close(dest_fd); + +#else + if (flags & PG_COPYFILE_FALLBACK) + pg_copyfile(src, dest, addon_errmsg); + else + pg_fatal("none of the copy file acceleration methods are supported on this " + "platform"); +#endif +} + +/* FIXME */ +bool +pg_copyfile_offload_supported(const char *src, const char *dst, + const char *addon_errmsg, + CopyFileMethod flags) +{ + pg_copyfile_offload(src, dst, addon_errmsg, flags); + return true; +} + #endif /* FRONTEND */ /* @@ -522,10 +702,8 @@ durable_rename(const char *oldfile, const char *newfile) * it should be a level from elog.h. */ PGFileType -get_dirent_type(const char *path, - const struct dirent *de, - bool look_through_symlinks, - int elevel) +get_dirent_type(const char *path, const struct dirent *de, + bool look_through_symlinks, int elevel) { PGFileType result; @@ -553,7 +731,6 @@ get_dirent_type(const char *path, struct stat fst; int sret; - if (look_through_symlinks) sret = stat(path, &fst); else @@ -563,11 +740,11 @@ get_dirent_type(const char *path, { result = PGFILETYPE_ERROR; #ifdef FRONTEND - pg_log_generic(elevel, PG_LOG_PRIMARY, "could not stat file \"%s\": %m", path); + pg_log_generic(elevel, PG_LOG_PRIMARY, "could not stat file \"%s\": %m", + path); #else - ereport(elevel, - (errcode_for_file_access(), - errmsg("could not stat file \"%s\": %m", path))); + ereport(elevel, (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", path))); #endif } else if (S_ISREG(fst.st_mode)) @@ -586,12 +763,12 @@ get_dirent_type(const char *path, * write. The part of 'source' beginning after 'transferred' bytes is copied * to 'destination', and its length is returned. 'source' and 'destination' * may point to the same array, for in-place adjustment. A return value of - * zero indicates completion (for callers without a cheaper way to know that). + * zero indicates completion (for callers without a cheaper way to know + * that). */ int compute_remaining_iovec(struct iovec *destination, - const struct iovec *source, - int iovcnt, + const struct iovec *source, int iovcnt, size_t transferred) { Assert(iovcnt > 0); @@ -634,7 +811,8 @@ compute_remaining_iovec(struct iovec *destination, * error is returned, it is unspecified how much has been written. */ ssize_t -pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) +pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, + off_t offset) { struct iovec iov_copy[PG_IOV_MAX]; ssize_t sum = 0; @@ -680,8 +858,8 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) * Writes zeros to file worth "size" bytes at "offset" (from the start of the * file), using vectored I/O. * - * Returns the total amount of data written. On failure, a negative value - * is returned with errno set. + * Returns the total amount of data written. On failure, a negative value is + * returned with errno set. */ ssize_t pg_pwrite_zeros(int fd, size_t size, off_t offset) diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index 02a940e310..0747109217 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -30,31 +30,48 @@ typedef enum DataDirSyncMethod DATA_DIR_SYNC_METHOD_SYNCFS, } DataDirSyncMethod; +typedef enum CopyFileMethod +{ + PG_COPYFILE_FALLBACK = 0x1, + PG_COPYFILE_IOCTL_FICLONE = 0x2, /* Linux */ + PG_COPYFILE_COPY_FILE_RANGE = 0x4, /* FreeBSD & Linux >= 4.5 */ + PG_COPYFILE_COPYFILE_CLONE_FORCE = 0x8 /* MacOS */ +} CopyFileMethod; +#define PG_COPYFILE_ANY_WITH_FALLBACK (2 << 4) - 1 + struct iovec; /* avoid including port/pg_iovec.h here */ #ifdef FRONTEND +#include "c.h" +#include "common/checksum_helper.h" extern int fsync_fname(const char *fname, bool isdir); extern void sync_pgdata(const char *pg_data, int serverVersion, DataDirSyncMethod sync_method); extern void sync_dir_recurse(const char *dir, DataDirSyncMethod sync_method); extern int durable_rename(const char *oldfile, const char *newfile); extern int fsync_parent_path(const char *fname); + +extern void pg_copyfile(const char *src, const char *dest, + const char *addon_errmsg, pg_checksum_context *ctx); + +extern void pg_copyfile_offload(const char *src, const char *dest, + const char *addon_errmsg, CopyFileMethod flags); + +extern bool pg_copyfile_offload_supported(const char *src, const char *dst, + const char *addon_errmsg, + CopyFileMethod flags); + #endif -extern PGFileType get_dirent_type(const char *path, - const struct dirent *de, - bool look_through_symlinks, - int elevel); +extern PGFileType get_dirent_type(const char *path, const struct dirent *de, + bool look_through_symlinks, int elevel); extern int compute_remaining_iovec(struct iovec *destination, - const struct iovec *source, - int iovcnt, + const struct iovec *source, int iovcnt, size_t transferred); -extern ssize_t pg_pwritev_with_retry(int fd, - const struct iovec *iov, - int iovcnt, - off_t offset); +extern ssize_t pg_pwritev_with_retry(int fd, const struct iovec *iov, + int iovcnt, off_t offset); extern ssize_t pg_pwrite_zeros(int fd, size_t size, off_t offset); -- 2.30.2