diff --git a/configure b/configure
index af4f9a3..3da0771 100755
--- a/configure
+++ b/configure
@@ -19263,7 +19263,8 @@ fi
 
 
 
-for ac_func in cbrt dlopen fcvt fdatasync getifaddrs getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs wcstombs_l
+
+for ac_func in cbrt dlopen fcvt fdatasync sync_file_range getifaddrs getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs wcstombs_l
 do
 as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
 { $as_echo "$as_me:$LINENO: checking for $ac_func" >&5
diff --git a/configure.in b/configure.in
index 9cad436..2d1608d 100644
--- a/configure.in
+++ b/configure.in
@@ -1216,7 +1216,7 @@ PGAC_VAR_INT_TIMEZONE
 AC_FUNC_ACCEPT_ARGTYPES
 PGAC_FUNC_GETTIMEOFDAY_1ARG
 
-AC_CHECK_FUNCS([cbrt dlopen fcvt fdatasync getifaddrs getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs wcstombs_l])
+AC_CHECK_FUNCS([cbrt dlopen fcvt fdatasync sync_file_range getifaddrs getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink sysconf towlower utime utimes waitpid wcstombs wcstombs_l])
 
 AC_REPLACE_FUNCS(fseeko)
 case $host_os in
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 8e65962..f6e20b8 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -70,6 +70,11 @@
 
 /* User-settable parameters */
 int			CheckPointSegments = 3;
+#ifdef USE_WRITEBACK
+bool		checkpoint_writeback = true;
+#else
+bool		checkpoint_writeback = false;
+#endif
 int			wal_keep_segments = 0;
 int			XLOGbuffers = -1;
 int			XLogArchiveTimeout = 0;
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 8f68bcc..0402f2c 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1766,6 +1766,7 @@ CheckPointBuffers(int flags)
 {
 	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
 	CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
+	smgrwriteback();
 	BufferSync(flags);
 	CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
 	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 43bc43a..e0344ce 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -347,6 +347,21 @@ pg_flush_data(int fd, off_t offset, off_t amount)
 #endif
 }
 
+/*
+ * pg_writeback --- ask the OS to start writing out the file's dirty data
+ *
+ * Treat as noop if no OS support is available.
+ */
+int
+pg_writeback(int fd)
+{
+#if defined(HAVE_SYNC_FILE_RANGE)
+	return sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
+#else
+	return 0;
+#endif
+}
+
 
 /*
  * InitFileAccess --- initialize this module during backend startup
@@ -1336,6 +1351,23 @@ retry:
 }
 
 int
+FileWriteback(File file)
+{
+	int			returnCode;
+
+	Assert(FileIsValid(file));
+
+	DO_DB(elog(LOG, "FileWriteback: %d (%s)",
+			   file, VfdCache[file].fileName));
+
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;
+
+	return pg_writeback(VfdCache[file].fd);
+}
+
+int
 FileSync(File file)
 {
 	int			returnCode;
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index bfc9f06..d85e9bf 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -931,6 +931,107 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
 }
 
 /*
+ *	mdwriteback() -- Initiate writeback of data to stable storage.
+ *
+ * For every file with a pending fsync request, ask the kernel to start
+ * writing out its dirty pages so that the later mdsync() has less to flush.
+ * This is purely advisory: failures are logged and otherwise ignored.
+ * Compiles to a no-op unless USE_WRITEBACK is defined.
+ */
+void
+mdwriteback(void)
+{
+#ifdef USE_WRITEBACK
+	HASH_SEQ_STATUS hstat;
+	PendingOperationEntry *entry;
+	int			absorb_counter;
+
+	/*
+	 * This is only called during checkpoints, and checkpoints should only
+	 * occur in processes that have created a pendingOpsTable.
+	 */
+	if (!pendingOpsTable)
+		elog(ERROR, "cannot initiate writeback without a pendingOpsTable");
+
+	/* Scan the hashtable for fsync requests. */
+	absorb_counter = FSYNCS_PER_ABSORB;
+	hash_seq_init(&hstat, pendingOpsTable);
+	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+	{
+		SMgrRelation reln;
+		MdfdVec    *seg;
+
+		/*
+		 * If writeback is off then we don't have to bother opening the file at
+		 * all.  (We delay checking until this point so that changing this on
+		 * the fly behaves sensibly.)  An abandoned hash_seq_search() scan
+		 * must be released with hash_seq_term() before leaving the loop.
+		 */
+		if (!checkpoint_writeback)
+		{
+			hash_seq_term(&hstat);
+			break;
+		}
+
+		/* Skip canceled entries. */
+		if (entry->canceled)
+			continue;
+
+		/* Absorb fsync requests so that the queue doesn't overflow. */
+		if (--absorb_counter <= 0)
+		{
+			AbsorbFsyncRequests();
+			absorb_counter = FSYNCS_PER_ABSORB;
+		}
+
+		/*
+		 * Find or create an smgr hash entry for this relation.  See
+		 * mdsync() for a full explanation of why we go back through the
+		 * smgr layer here.
+		 */
+		reln = smgropen(entry->tag.rnode.node, entry->tag.rnode.backend);
+
+		/*
+		 * It is possible that the relation has been dropped or truncated
+		 * since the fsync request was entered.  Since writeback is just a
+		 * performance optimization, there's no harm in just skipping the
+		 * segment if it turns out not to exist any more.
+		 */
+		seg = _mdfd_getseg(reln, entry->tag.forknum,
+						   entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
+						   false, EXTENSION_RETURN_NULL);
+		if (seg == NULL)
+			continue;
+
+		/*
+		 * Try to write it back.  FileWriteback() hands back a status code
+		 * (negative on failure, with the cause left in errno), not an errno
+		 * value, so test its result instead of assigning it to errno.
+		 *
+		 * Since this is just a hint to the OS to get the file on disk,
+		 * there's no great harm if it fails.  Of course, failure here may be
+		 * a sign that the eventual fsync will also fail, but that's mdsync's
+		 * problem, not ours.
+		 */
+		if (FileWriteback(seg->mdfd_vfd) < 0 &&
+			!FILE_POSSIBLY_DELETED(errno))
+		{
+			int			save_errno = errno;
+			char	   *path;
+
+			path = _mdfd_segpath(reln, entry->tag.forknum,
+								 entry->tag.segno);
+			errno = save_errno;	/* building the path may clobber errno */
+			ereport(LOG,
+					(errcode_for_file_access(),
+				  errmsg("could not write back file \"%s\": %m", path)));
+			pfree(path);
+		}
+	}
+#endif
+}
+
+/*
  *	mdsync() -- Sync previous writes to stable storage.
  */
 void
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 5f87543..ccd952a 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -58,6 +58,7 @@ typedef struct f_smgr
 											  BlockNumber nblocks);
 	void		(*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
 	void		(*smgr_pre_ckpt) (void);		/* may be NULL */
+	void		(*smgr_writeback) (void);		/* may be NULL */
 	void		(*smgr_sync) (void);	/* may be NULL */
 	void		(*smgr_post_ckpt) (void);		/* may be NULL */
 } f_smgr;
@@ -67,7 +68,7 @@ static const f_smgr smgrsw[] = {
 	/* magnetic disk */
 	{mdinit, NULL, mdclose, mdcreate, mdexists, mdunlink, mdextend, mdprefetch,
 		mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync,
-		mdpreckpt, mdsync, mdpostckpt
+		mdpreckpt, mdwriteback, mdsync, mdpostckpt
 	}
 };
 
@@ -533,6 +534,21 @@ smgrpreckpt(void)
 }
 
 /*
+ *	smgrwriteback() -- Initiate OS writeback of dirty files during checkpoint.
+ */
+void
+smgrwriteback(void)
+{
+	int			i;
+
+	for (i = 0; i < NSmgr; i++)
+	{
+		if (smgrsw[i].smgr_writeback)
+			(*(smgrsw[i].smgr_writeback)) ();
+	}
+}
+
+/*
  *	smgrsync() -- Sync files to disk during checkpoint.
  */
 void
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 5c910dd..c1eec9c 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -182,6 +182,7 @@ static bool check_phony_autocommit(bool *newval, void **extra, GucSource source)
 static bool check_debug_assertions(bool *newval, void **extra, GucSource source);
 static bool check_bonjour(bool *newval, void **extra, GucSource source);
 static bool check_ssl(bool *newval, void **extra, GucSource source);
+static bool check_checkpoint_writeback(bool *newval, void **extra, GucSource source);
 static bool check_stage_log_stats(bool *newval, void **extra, GucSource source);
 static bool check_log_stats(bool *newval, void **extra, GucSource source);
 static bool check_canonical_path(char **newval, void **extra, GucSource source);
@@ -816,6 +817,25 @@ static struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 	{
+		{"checkpoint_writeback",
+#ifdef USE_WRITEBACK
+			PGC_SIGHUP,
+#else
+			PGC_INTERNAL,
+#endif
+			WAL_CHECKPOINTS,
+			gettext_noop("Initiates OS writeback of dirty data at checkpoint start."),
+			NULL
+		},
+		&checkpoint_writeback,
+#ifdef USE_WRITEBACK
+		true,
+#else
+		false,
+#endif
+		check_checkpoint_writeback, NULL, NULL
+	},
+	{
 		{"zero_damaged_pages", PGC_SUSET, DEVELOPER_OPTIONS,
 			gettext_noop("Continues processing past damaged page headers."),
 			gettext_noop("Detection of a damaged page header normally causes PostgreSQL to "
@@ -8374,6 +8394,19 @@ check_bonjour(bool *newval, void **extra, GucSource source)
 }
 
 static bool
+check_checkpoint_writeback(bool *newval, void **extra, GucSource source)
+{
+#ifndef USE_WRITEBACK
+	if (*newval)
+	{
+		GUC_check_errmsg("writeback is not supported by this build");
+		return false;
+	}
+#endif
+	return true;
+}
+
+static bool
 check_ssl(bool *newval, void **extra, GucSource source)
 {
 #ifndef USE_SSL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 315db46..bfaf0bc 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -180,6 +180,7 @@
 #checkpoint_timeout = 5min		# range 30s-1h
 #checkpoint_completion_target = 0.5	# checkpoint target duration, 0.0 - 1.0
 #checkpoint_warning = 30s		# 0 disables
+#checkpoint_writeback = true		# false if async writeback not supported
 
 # - Archiving -
 
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 93622c4..a3fa124 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -186,6 +186,7 @@ extern bool reachedConsistency;
 
 /* these variables are GUC parameters related to XLOG */
 extern int	CheckPointSegments;
+extern bool checkpoint_writeback;
 extern int	wal_keep_segments;
 extern int	XLOGbuffers;
 extern int	XLogArchiveTimeout;
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index db84f49..adee3e4 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -544,6 +544,9 @@
 /* Define to 1 if you have the `symlink' function. */
 #undef HAVE_SYMLINK
 
+/* Define to 1 if you have the `sync_file_range' function. */
+#undef HAVE_SYNC_FILE_RANGE
+
 /* Define to 1 if you have the `sysconf' function. */
 #undef HAVE_SYSCONF
 
diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h
index ac45ee6..05cf2f2 100644
--- a/src/include/pg_config_manual.h
+++ b/src/include/pg_config_manual.h
@@ -129,14 +129,23 @@
 
 /*
  * USE_PREFETCH code should be compiled only if we have a way to implement
  * prefetching.  (This is decoupled from USE_POSIX_FADVISE because there
  * might in future be support for alternative low-level prefetch APIs.)
  */
 #ifdef USE_POSIX_FADVISE
 #define USE_PREFETCH
 #endif
 
 /*
+ * USE_WRITEBACK code should be compiled only if we have a way to implement
+ * writeback.  (This is decoupled from HAVE_SYNC_FILE_RANGE because there
+ * might in future be support for alternative low-level writeback APIs.)
+ */
+#ifdef HAVE_SYNC_FILE_RANGE
+#define USE_WRITEBACK
+#endif
+
+/*
  * This is the default directory in which AF_UNIX socket files are
  * placed.  Caution: changing this risks breaking your existing client
  * applications, which are likely to continue to look in the old
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 22e7fe8..dbf74c0 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -66,6 +66,7 @@ extern void FileClose(File file);
 extern int	FilePrefetch(File file, off_t offset, int amount);
 extern int	FileRead(File file, char *buffer, int amount);
 extern int	FileWrite(File file, char *buffer, int amount);
+extern int	FileWriteback(File file);
 extern int	FileSync(File file);
 extern off_t FileSeek(File file, off_t offset, int whence);
 extern int	FileTruncate(File file, off_t offset);
@@ -100,6 +101,7 @@ extern int	pg_fsync_no_writethrough(int fd);
 extern int	pg_fsync_writethrough(int fd);
 extern int	pg_fdatasync(int fd);
 extern int	pg_flush_data(int fd, off_t offset, off_t amount);
+extern int	pg_writeback(int fd);
 
 /* Filename components for OpenTemporaryFile */
 #define PG_TEMP_FILES_DIR "pgsql_tmp"
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 46c8402..37c76fb 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -95,6 +95,7 @@ extern void smgrtruncate(SMgrRelation reln, ForkNumber forknum,
 			 BlockNumber nblocks);
 extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
 extern void smgrpreckpt(void);
+extern void smgrwriteback(void);
 extern void smgrsync(void);
 extern void smgrpostckpt(void);
 
@@ -120,6 +121,7 @@ extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
 		   BlockNumber nblocks);
 extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
 extern void mdpreckpt(void);
+extern void mdwriteback(void);
 extern void mdsync(void);
 extern void mdpostckpt(void);
 