From b6a0267321314992cd0a355002610c0514ec0cc8 Mon Sep 17 00:00:00 2001 From: Thomas Munro Date: Sat, 29 May 2021 23:38:10 +1200 Subject: [PATCH] Support direct I/O on macOS. Macs don't understand O_DIRECT, but they do know how to disable kernel caching if you make a separate fcntl() call. Extend the file opening wrappers in fd.c to handle this. For now, this affects only WAL data and even then only if you set: wal_level=minimal max_wal_senders=0 Later proposed patches will make greater use of direct I/O, and it'll be useful for testing if people developing on Macs can test that. --- src/backend/access/transam/xlog.c | 74 ++++++++++++++++++--------- src/backend/storage/file/fd.c | 54 +++++++++++++++++++ src/bin/pg_test_fsync/pg_test_fsync.c | 35 +++++++++++-- src/include/access/xlogdefs.h | 15 ------ src/include/storage/fd.h | 2 + 5 files changed, 138 insertions(+), 42 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 441a9124cd..1732ca944b 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -961,6 +961,7 @@ static bool read_tablespace_map(List **tablespaces); static void rm_redo_error_callback(void *arg); static int get_sync_bit(int method); +static bool get_direct_flag(int method); static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, @@ -3296,7 +3297,8 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) */ if (*use_existent) { - fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method)); + fd = BasicOpenFileDirect(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method), + get_direct_flag(sync_method)); if (fd < 0) { if (errno != ENOENT) @@ -3454,7 +3456,8 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) *use_existent = false; /* Now open original target segment (might not be file I just made) */ - fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method)); + fd = BasicOpenFileDirect(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method), + get_direct_flag(sync_method)); if (fd < 0) ereport(ERROR, (errcode_for_file_access(), @@ -3693,7 +3696,8 @@ XLogFileOpen(XLogSegNo segno) XLogFilePath(path, ThisTimeLineID, segno, wal_segment_size); - fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method)); + fd = BasicOpenFileDirect(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method), + get_direct_flag(sync_method)); if (fd < 0) ereport(PANIC, (errcode_for_file_access(), @@ -5455,8 +5459,10 @@ readRecoverySignalFile(void) { int fd; - fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method), - S_IRUSR | S_IWUSR); + fd = BasicOpenFilePermDirect(STANDBY_SIGNAL_FILE, + O_RDWR | PG_BINARY | get_sync_bit(sync_method), + S_IRUSR | S_IWUSR, + get_direct_flag(sync_method)); if (fd >= 0) { (void) pg_fsync(fd); @@ -5468,8 +5474,10 @@ readRecoverySignalFile(void) { int fd; - fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method), - S_IRUSR | S_IWUSR); + fd = BasicOpenFilePermDirect(RECOVERY_SIGNAL_FILE, + O_RDWR | PG_BINARY | get_sync_bit(sync_method), + S_IRUSR | S_IWUSR, + get_direct_flag(sync_method)); if (fd >= 0) { (void) pg_fsync(fd); @@ -10505,12 +10513,40 @@ xlog_outdesc(StringInfo buf, XLogReaderState *record) static int get_sync_bit(int method) { - int o_direct_flag = 0; - /* If fsync is disabled, never open in sync mode */ if (!enableFsync) return 0; + switch (method) + { + /* + * enum values for all sync options are defined even if they are + * not supported on the current platform. But if not, they are + * not included in the enum option array, and therefore will never + * be seen here. + */ + case SYNC_METHOD_FSYNC: + case SYNC_METHOD_FSYNC_WRITETHROUGH: + case SYNC_METHOD_FDATASYNC: + return 0; +#ifdef OPEN_SYNC_FLAG + case SYNC_METHOD_OPEN: + return OPEN_SYNC_FLAG; +#endif +#ifdef OPEN_DATASYNC_FLAG + case SYNC_METHOD_OPEN_DSYNC: + return OPEN_DATASYNC_FLAG; +#endif + default: + /* can't happen (unless we are out of sync with option array) */ + elog(ERROR, "unrecognized wal_sync_method: %d", method); + return 0; /* silence warning */ + } +} + +static bool +get_direct_flag(int method) +{ /* * Optimize writes by bypassing kernel cache with O_DIRECT when using * O_SYNC/O_FSYNC and O_DSYNC. But only if archiving and streaming are @@ -10525,33 +10561,25 @@ get_sync_bit(int method) * after it's written. Also, walreceiver performs unaligned writes, which * don't work with O_DIRECT, so it is required for correctness too. */ - if (!XLogIsNeeded() && !AmWalReceiverProcess()) - o_direct_flag = PG_O_DIRECT; + if (XLogIsNeeded() || AmWalReceiverProcess()) + return false; switch (method) { - /* - * enum values for all sync options are defined even if they are - * not supported on the current platform. But if not, they are - * not included in the enum option array, and therefore will never - * be seen here. - */ case SYNC_METHOD_FSYNC: case SYNC_METHOD_FSYNC_WRITETHROUGH: case SYNC_METHOD_FDATASYNC: - return 0; + return false; #ifdef OPEN_SYNC_FLAG case SYNC_METHOD_OPEN: - return OPEN_SYNC_FLAG | o_direct_flag; + return true; #endif #ifdef OPEN_DATASYNC_FLAG case SYNC_METHOD_OPEN_DSYNC: - return OPEN_DATASYNC_FLAG | o_direct_flag; + return true; #endif default: - /* can't happen (unless we are out of sync with option array) */ - elog(ERROR, "unrecognized wal_sync_method: %d", method); - return 0; /* silence warning */ + return false; } } diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index e8cd7ef088..0fff5e3605 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -1035,6 +1035,15 @@ BasicOpenFile(const char *fileName, int fileFlags) return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode); } +/* + * As BasicOpenFile(), but optionally attempt to disable kernel buffering. + */ +int +BasicOpenFileDirect(const char *fileName, int fileFlags, bool direct) +{ + return BasicOpenFilePermDirect(fileName, fileFlags, pg_file_create_mode, direct); +} + /* * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed * @@ -1078,6 +1087,51 @@ tryAgain: return -1; /* failure */ } +/* + * BasicOpenDirectFilePerm --- open a file, optionally without kernel buffering + * + * On systems that have a way to request it, open the file in direct mode if + * requested. + */ +int +BasicOpenFilePermDirect(const char *fileName, int fileFlags, mode_t fileMode, + bool direct) +{ + int fd; + +#ifdef O_DIRECT + if (direct) + { + /* + * Almost every Unix system copied IRIX's O_DIRECT flag, and our wrapper + * in src/port/open.c simulates that for Windows. + */ + fileFlags |= O_DIRECT; + } +#endif + + fd = BasicOpenFilePerm(fileName, fileFlags, fileMode); + if (fd < 0) + return fd; + +#if !defined(O_DIRECT) && defined(F_NOCACHE) + /* macOS requires an extra step. */ + if (direct && fcntl(fd, F_NOCACHE, 1) < 0) + { + int save_errno = errno; + + close(fd); + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not disable kernel file caching for file \"%s\": %m", + fileName))); + } +#endif + + return fd; +} + /* * AcquireExternalFD - attempt to reserve an external file descriptor * diff --git a/src/bin/pg_test_fsync/pg_test_fsync.c b/src/bin/pg_test_fsync/pg_test_fsync.c index 78dab5096c..fef31844fa 100644 --- a/src/bin/pg_test_fsync/pg_test_fsync.c +++ b/src/bin/pg_test_fsync/pg_test_fsync.c @@ -217,8 +217,10 @@ handle_args(int argc, char *argv[]) "%u seconds per test\n", secs_per_test), secs_per_test); -#if PG_O_DIRECT != 0 +#if defined(O_DIRECT) printf(_("O_DIRECT supported on this platform for open_datasync and open_sync.\n")); +#elif defined(F_NOCACHE) + printf(_("F_NOCACHE supported on this platform for open_datasync and open_sync.\n")); #else printf(_("Direct I/O is not supported on this platform.\n")); #endif @@ -258,6 +260,31 @@ test_open(void) close(tmpfile); } +static int +open_direct(const char *path, int flags, mode_t mode) +{ + int fd; + +#ifdef O_DIRECT + flags |= O_DIRECT; +#endif + + fd = open(path, flags, mode); + +#if !defined(O_DIRECT) && defined(F_NOCACHE) + if (fd >= 0 && fcntl(fd, F_NOCACHE, 1) < 0) + { + int save_errno = errno; + + close(fd); + errno = save_errno; + return -1; + } +#endif + + return fd; +} + static void test_sync(int writes_per_op) { @@ -279,7 +306,7 @@ test_sync(int writes_per_op) fflush(stdout); #ifdef OPEN_DATASYNC_FLAG - if ((tmpfile = open(filename, O_RDWR | O_DSYNC | PG_O_DIRECT | PG_BINARY, 0)) == -1) + if ((tmpfile = open_direct(filename, O_RDWR | O_DSYNC | PG_BINARY, 0)) == -1) { printf(NA_FORMAT, _("n/a*")); fs_warning = true; @@ -386,7 +413,7 @@ test_sync(int writes_per_op) fflush(stdout); #ifdef OPEN_SYNC_FLAG - if ((tmpfile = open(filename, O_RDWR | OPEN_SYNC_FLAG | PG_O_DIRECT | PG_BINARY, 0)) == -1) + if ((tmpfile = open_direct(filename, O_RDWR | OPEN_SYNC_FLAG | PG_BINARY, 0)) == -1) { printf(NA_FORMAT, _("n/a*")); fs_warning = true; @@ -454,7 +481,7 @@ test_open_sync(const char *msg, int writes_size) fflush(stdout); #ifdef OPEN_SYNC_FLAG - if ((tmpfile = open(filename, O_RDWR | OPEN_SYNC_FLAG | PG_O_DIRECT | PG_BINARY, 0)) == -1) + if ((tmpfile = open_direct(filename, O_RDWR | OPEN_SYNC_FLAG | PG_BINARY, 0)) == -1) printf(NA_FORMAT, _("n/a*")); else { diff --git a/src/include/access/xlogdefs.h b/src/include/access/xlogdefs.h index 0940b64ca6..60348d1850 100644 --- a/src/include/access/xlogdefs.h +++ b/src/include/access/xlogdefs.h @@ -64,21 +64,6 @@ typedef uint32 TimeLineID; */ typedef uint16 RepOriginId; -/* - * Because O_DIRECT bypasses the kernel buffers, and because we never - * read those buffers except during crash recovery or if wal_level != minimal, - * it is a win to use it in all cases where we sync on each write(). We could - * allow O_DIRECT with fsync(), but it is unclear if fsync() could process - * writes not buffered in the kernel. Also, O_DIRECT is never enough to force - * data to the drives, it merely tries to bypass the kernel cache, so we still - * need O_SYNC/O_DSYNC. - */ -#ifdef O_DIRECT -#define PG_O_DIRECT O_DIRECT -#else -#define PG_O_DIRECT 0 -#endif - /* * This chunk of hackery attempts to determine which file sync methods * are available on the current platform, and to choose an appropriate diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 5b3c280dd7..65dfe99ba0 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -130,7 +130,9 @@ extern int CloseTransientFile(int fd); /* If you've really really gotta have a plain kernel FD, use this */ extern int BasicOpenFile(const char *fileName, int fileFlags); +extern int BasicOpenFileDirect(const char *fileName, int fileFlags, bool direct); extern int BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode); +extern int BasicOpenFilePermDirect(const char *fileName, int fileFlags, mode_t fileMode, bool direct); /* Use these for other cases, and also for long-lived BasicOpenFile FDs */ extern bool AcquireExternalFD(void); -- 2.24.3 (Apple Git-128)