From 819a406f029b04ab6a500f63fe9c154332b65d8e Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Mon, 3 Oct 2022 21:58:22 -0700
Subject: [PATCH 3/3] Add direct I/O settings (developer-only).

Provide a way to ask the kernel to use O_DIRECT (or local equivalent)
for data and WAL files.  This hurts performance currently and is not
intended for end-users yet.  Later proposed work will introduce our own
I/O clustering, read-ahead, etc to replace the kernel features that are
disabled with this option.

This replaces the previous logic that would use O_DIRECT for the WAL in
limited and obscure cases, now that there is an explicit setting.

Discussion: https://postgr.es/m/
Author: Andres Freund <andres@anarazel.de>
Author: Thomas Munro <thomas.munro@gmail.com>
---
 doc/src/sgml/config.sgml                    | 51 ++++++++++++++++++++
 src/backend/access/transam/xlog.c           | 53 +++++++++++++--------
 src/backend/access/transam/xlogprefetcher.c |  2 +-
 src/backend/storage/buffer/bufmgr.c         | 13 +++--
 src/backend/storage/buffer/localbuf.c       |  4 +-
 src/backend/storage/file/fd.c               |  5 ++
 src/backend/storage/smgr/md.c               | 29 +++++++++--
 src/backend/storage/smgr/smgr.c             | 20 ++++++++
 src/backend/utils/misc/guc_tables.c         | 33 +++++++++++++
 src/include/access/xlog.h                   |  2 +
 src/include/storage/fd.h                    |  6 ++-
 src/include/storage/smgr.h                  |  5 ++
 src/include/utils/guc_hooks.h               |  2 +
 13 files changed, 190 insertions(+), 35 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 559eb898a9..2d860dd900 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -11011,6 +11011,57 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-io-data-direct" xreflabel="io_data_direct">
+      <term><varname>io_data_direct</varname> (<type>boolean</type>)
+      <indexterm>
+        <primary><varname>io_data_direct</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Ask the kernel to minimize caching effects for relation data files
+        using <literal>O_DIRECT</literal> (most Unix-like systems),
+        <literal>F_NOCACHE</literal> (macOS) or
+        <literal>FILE_FLAG_NO_BUFFERING</literal> (Windows).  Currently this
+        hurts performance, and is intended for developer testing only.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry id="guc-io-wal-direct" xreflabel="io_wal_direct">
+      <term><varname>io_wal_direct</varname> (<type>boolean</type>)
+      <indexterm>
+        <primary><varname>io_wal_direct</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Ask the kernel to minimize caching effects while writing WAL files
+        using <literal>O_DIRECT</literal> (most Unix-like systems),
+        <literal>F_NOCACHE</literal> (macOS) or
+        <literal>FILE_FLAG_NO_BUFFERING</literal> (Windows).  Currently this
+        hurts performance, and is intended for developer testing only.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry id="guc-io-wal-init-direct" xreflabel="io_wal_init_direct">
+      <term><varname>io_wal_init_direct</varname> (<type>boolean</type>)
+      <indexterm>
+        <primary><varname>io_wal_init_direct</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Ask the kernel to minimize caching effects while initializing WAL files
+        using <literal>O_DIRECT</literal> (most Unix-like systems),
+        <literal>F_NOCACHE</literal> (macOS) or
+        <literal>FILE_FLAG_NO_BUFFERING</literal> (Windows).  Currently this
+        hurts performance, and is intended for developer testing only.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-post-auth-delay" xreflabel="post_auth_delay">
       <term><varname>post_auth_delay</varname> (<type>integer</type>)
       <indexterm>
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 8f10effe3a..5663bdf856 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -138,6 +138,8 @@ int			wal_retrieve_retry_interval = 5000;
 int			max_slot_wal_keep_size_mb = -1;
 int			wal_decode_buffer_size = 512 * 1024;
 bool		track_wal_io_timing = false;
+bool		io_wal_direct = false;
+bool		io_wal_init_direct = false;
 
 #ifdef WAL_DEBUG
 bool		XLOG_DEBUG = false;
@@ -2926,6 +2928,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 	XLogSegNo	max_segno;
 	int			fd;
 	int			save_errno;
+	int			open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY;
 
 	Assert(logtli != 0);
 
@@ -2958,8 +2961,11 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 
 	unlink(tmppath);
 
+	if (io_wal_init_direct)
+		open_flags |= PG_O_DIRECT;
+
 	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
-	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+	fd = BasicOpenFile(tmppath, open_flags);
 	if (fd < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
@@ -3373,7 +3379,7 @@ XLogFileClose(void)
 	 * use the cache to read the WAL segment.
 	 */
 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
-	if (!XLogIsNeeded())
+	if (!XLogIsNeeded() && !io_wal_direct)
 		(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
 #endif
 
@@ -4473,6 +4479,21 @@ show_in_hot_standby(void)
 	return RecoveryInProgress() ? "on" : "off";
 }
 
+/*
+ * GUC check for direct I/O support.
+ */
+bool
+check_io_wal_direct(bool *newval, void **extra, GucSource source)
+{
+#if PG_O_DIRECT == 0
+	if (*newval)
+	{
+		GUC_check_errdetail("io_wal_direct and io_wal_init_direct are not supported on this platform.");
+		return false;
+	}
+#endif
+	return true;
+}
 
 /*
  * Read the control file, set respective GUCs.
@@ -8056,35 +8077,27 @@ xlog_redo(XLogReaderState *record)
 }
 
 /*
- * Return the (possible) sync flag used for opening a file, depending on the
- * value of the GUC wal_sync_method.
+ * Return the extra open flags used for opening a file, depending on the
+ * value of the GUCs wal_sync_method, fsync and io_wal_direct.
  */
 static int
 get_sync_bit(int method)
 {
 	int			o_direct_flag = 0;
 
-	/* If fsync is disabled, never open in sync mode */
-	if (!enableFsync)
-		return 0;
-
 	/*
-	 * Optimize writes by bypassing kernel cache with O_DIRECT when using
-	 * O_SYNC and O_DSYNC.  But only if archiving and streaming are disabled,
-	 * otherwise the archive command or walsender process will read the WAL
-	 * soon after writing it, which is guaranteed to cause a physical read if
-	 * we bypassed the kernel cache. We also skip the
-	 * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
-	 * reason.
-	 *
-	 * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
+	 * Use O_DIRECT if requested, except in walreceiver process.  The WAL
 	 * written by walreceiver is normally read by the startup process soon
-	 * after it's written. Also, walreceiver performs unaligned writes, which
+	 * after it's written.  Also, walreceiver performs unaligned writes, which
 	 * don't work with O_DIRECT, so it is required for correctness too.
 	 */
-	if (!XLogIsNeeded() && !AmWalReceiverProcess())
+	if (io_wal_direct && !AmWalReceiverProcess())
 		o_direct_flag = PG_O_DIRECT;
 
+	/* If fsync is disabled, never open in sync mode */
+	if (!enableFsync)
+		return o_direct_flag;
+
 	switch (method)
 	{
 			/*
@@ -8096,7 +8109,7 @@ get_sync_bit(int method)
 		case SYNC_METHOD_FSYNC:
 		case SYNC_METHOD_FSYNC_WRITETHROUGH:
 		case SYNC_METHOD_FDATASYNC:
-			return 0;
+			return o_direct_flag;
 #ifdef O_SYNC
 		case SYNC_METHOD_OPEN:
 			return O_SYNC | o_direct_flag;
diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c
index 0cf03945ee..d840078afc 100644
--- a/src/backend/access/transam/xlogprefetcher.c
+++ b/src/backend/access/transam/xlogprefetcher.c
@@ -785,7 +785,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
 				block->prefetch_buffer = InvalidBuffer;
 				return LRQ_NEXT_IO;
 			}
-			else
+			else if (!io_data_direct)
 			{
 				/*
 				 * This shouldn't be possible, because we already determined
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 6b95381481..9918855f37 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -535,7 +535,7 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln,
 		 * Try to initiate an asynchronous read.  This returns false in
 		 * recovery if the relation file doesn't exist.
 		 */
-		if (smgrprefetch(smgr_reln, forkNum, blockNum))
+		if (!io_data_direct && smgrprefetch(smgr_reln, forkNum, blockNum))
 			result.initiated_io = true;
 #endif							/* USE_PREFETCH */
 	}
@@ -582,11 +582,11 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln,
  * the kernel and therefore didn't really initiate I/O, and no way to know when
  * the I/O completes other than using synchronous ReadBuffer().
  *
- * 3.  Otherwise, the buffer wasn't already cached by PostgreSQL, and either
+ * 3.  Otherwise, the buffer wasn't already cached by PostgreSQL, and
  * USE_PREFETCH is not defined (this build doesn't support prefetching due to
- * lack of a kernel facility), or the underlying relation file wasn't found and
- * we are in recovery.  (If the relation file wasn't found and we are not in
- * recovery, an error is raised).
+ * lack of a kernel facility), io_data_direct is enabled, or the underlying
+ * relation file wasn't found and we are in recovery.  (If the relation file
+ * wasn't found and we are not in recovery, an error is raised).
  */
 PrefetchBufferResult
 PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
@@ -4908,6 +4908,9 @@ ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
 {
 	PendingWriteback *pending;
 
+	if (io_data_direct)
+		return;
+
 	/*
 	 * Add buffer to the pending writeback array, unless writeback control is
 	 * disabled.
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index f51d3527f6..f9c82a789e 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -87,8 +87,8 @@ PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum,
 	{
 #ifdef USE_PREFETCH
 		/* Not in buffers, so initiate prefetch */
-		smgrprefetch(smgr, forkNum, blockNum);
-		result.initiated_io = true;
+		if (!io_data_direct && smgrprefetch(smgr, forkNum, blockNum))
+			result.initiated_io = true;
 #endif							/* USE_PREFETCH */
 	}
 
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 4151cafec5..aa720952f8 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -2021,6 +2021,11 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
 	if (nbytes <= 0)
 		return;
 
+#ifdef PG_O_DIRECT
+	if (VfdCache[file].fileFlags & PG_O_DIRECT)
+		return;
+#endif
+
 	returnCode = FileAccess(file);
 	if (returnCode < 0)
 		return;
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 719721a894..20ec37c310 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -142,6 +142,21 @@ static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum,
 static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
 							  MdfdVec *seg);
 
+static inline int
+_mdfd_open_flags(ForkNumber forkNum)
+{
+	int		flags = O_RDWR | PG_BINARY;
+
+	/*
+	 * XXX: not clear if direct IO ever is interesting for other forks?  The
+	 * FSM fork currently often ends up very fragmented when using direct IO,
+	 * for example.
+	 */
+	if (io_data_direct /* && forkNum == MAIN_FORKNUM */)
+		flags |= PG_O_DIRECT;
+
+	return flags;
+}
 
 /*
  *	mdinit() -- Initialize private state for magnetic disk storage manager.
@@ -205,14 +220,14 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
 
 	path = relpath(reln->smgr_rlocator, forknum);
 
-	fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+	fd = PathNameOpenFile(path, _mdfd_open_flags(forknum) | O_CREAT | O_EXCL);
 
 	if (fd < 0)
 	{
 		int			save_errno = errno;
 
 		if (isRedo)
-			fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+			fd = PathNameOpenFile(path, _mdfd_open_flags(forknum));
 		if (fd < 0)
 		{
 			/* be sure to report the error reported by create, not open */
@@ -513,7 +528,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
 
 	path = relpath(reln->smgr_rlocator, forknum);
 
-	fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+	fd = PathNameOpenFile(path, _mdfd_open_flags(forknum));
 
 	if (fd < 0)
 	{
@@ -584,6 +599,8 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 	off_t		seekpos;
 	MdfdVec    *v;
 
+	Assert(!io_data_direct);
+
 	v = _mdfd_getseg(reln, forknum, blocknum, false,
 					 InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL);
 	if (v == NULL)
@@ -609,6 +626,8 @@ void
 mdwriteback(SMgrRelation reln, ForkNumber forknum,
 			BlockNumber blocknum, BlockNumber nblocks)
 {
+	Assert(!io_data_direct);
+
 	/*
 	 * Issue flush requests in as few requests as possible; have to split at
 	 * segment boundaries though, since those are actually separate files.
@@ -1186,7 +1205,7 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
 	fullpath = _mdfd_segpath(reln, forknum, segno);
 
 	/* open the file */
-	fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags);
+	fd = PathNameOpenFile(fullpath, _mdfd_open_flags(forknum) | oflags);
 
 	pfree(fullpath);
 
@@ -1395,7 +1414,7 @@ mdsyncfiletag(const FileTag *ftag, char *path)
 		strlcpy(path, p, MAXPGPATH);
 		pfree(p);
 
-		file = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+		file = PathNameOpenFile(path, _mdfd_open_flags(ftag->forknum));
 		if (file < 0)
 			return -1;
 		need_to_close = true;
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index c1a5febcbf..706a52b9f1 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -20,6 +20,7 @@
 #include "access/xlogutils.h"
 #include "lib/ilist.h"
 #include "storage/bufmgr.h"
+#include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/md.h"
 #include "storage/smgr.h"
@@ -27,6 +28,9 @@
 #include "utils/inval.h"
 
 
+/* GUCs */
+bool		io_data_direct = false;
+
 /*
  * This struct of function pointers defines the API between smgr.c and
  * any individual storage manager module.  Note that smgr subfunctions are
@@ -735,3 +739,19 @@ ProcessBarrierSmgrRelease(void)
 	smgrreleaseall();
 	return true;
 }
+
+/*
+ * Check if this build allows smgr implementations to enable direct I/O.
+ */
+bool
+check_io_data_direct(bool *newval, void **extra, GucSource source)
+{
+#if PG_O_DIRECT == 0
+	if (*newval)
+	{
+		GUC_check_errdetail("io_data_direct is not supported on this platform.");
+		return false;
+	}
+#endif
+	return true;
+}
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 05ab087934..e324378ad4 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -1925,6 +1925,39 @@ struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"io_data_direct", PGC_SUSET, DEVELOPER_OPTIONS,
+			gettext_noop("Access data files with direct I/O."),
+			NULL,
+			GUC_NOT_IN_SAMPLE
+		},
+		&io_data_direct,
+		false,
+		check_io_data_direct, NULL, NULL
+	},
+
+	{
+		{"io_wal_direct", PGC_SUSET, DEVELOPER_OPTIONS,
+			gettext_noop("Write WAL files with direct I/O."),
+			NULL,
+			GUC_NOT_IN_SAMPLE
+		},
+		&io_wal_direct,
+		false,
+		check_io_wal_direct, NULL, NULL
+	},
+
+	{
+		{"io_wal_init_direct", PGC_SUSET, DEVELOPER_OPTIONS,
+			gettext_noop("Initialize WAL files with direct I/O."),
+			NULL,
+			GUC_NOT_IN_SAMPLE
+		},
+		&io_wal_init_direct,
+		false,
+		check_io_wal_direct, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 1fbd48fbda..6220370036 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -51,6 +51,8 @@ extern PGDLLIMPORT char *wal_consistency_checking_string;
 extern PGDLLIMPORT bool log_checkpoints;
 extern PGDLLIMPORT bool track_wal_io_timing;
 extern PGDLLIMPORT int wal_decode_buffer_size;
+extern PGDLLIMPORT bool io_wal_direct;
+extern PGDLLIMPORT bool io_wal_init_direct;
 
 extern PGDLLIMPORT int CheckPointSegments;
 
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index c0a212487d..283ff21e31 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -44,6 +44,7 @@
 #define FD_H
 
 #include <dirent.h>
+#include <fcntl.h>
 
 typedef enum RecoveryInitSyncMethod
 {
@@ -82,9 +83,10 @@ extern PGDLLIMPORT int max_safe_fds;
  * to the appropriate Windows flag in src/port/open.c.  We simulate it with
  * fcntl(F_NOCACHE) on macOS inside fd.c's open() wrapper.  We use the name
  * PG_O_DIRECT rather than defining O_DIRECT in that case (probably not a good
- * idea on a Unix).
+ * idea on a Unix).  We can only use it if the compiler will correctly align
+ * PGAlignedBlock for us, though.
  */
-#if defined(O_DIRECT)
+#if defined(O_DIRECT) && defined(pg_attribute_aligned)
 #define		PG_O_DIRECT O_DIRECT
 #elif defined(F_NOCACHE)
 #define		PG_O_DIRECT 0x80000000
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index a07715356b..ef75934a16 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -17,6 +17,10 @@
 #include "lib/ilist.h"
 #include "storage/block.h"
 #include "storage/relfilelocator.h"
+#include "utils/guc.h"
+
+/* GUCs */
+extern PGDLLIMPORT bool io_data_direct;
 
 /*
  * smgr.c maintains a table of SMgrRelation objects, which are essentially
@@ -107,5 +111,6 @@ extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum,
 extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
 extern void AtEOXact_SMgr(void);
 extern bool ProcessBarrierSmgrRelease(void);
+extern bool check_io_data_direct(bool *newval, void **extra, GucSource source);
 
 #endif							/* SMGR_H */
diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h
index f1a9a183b4..a9748f6b34 100644
--- a/src/include/utils/guc_hooks.h
+++ b/src/include/utils/guc_hooks.h
@@ -59,6 +59,8 @@ extern bool check_effective_io_concurrency(int *newval, void **extra,
 										   GucSource source);
 extern bool check_huge_page_size(int *newval, void **extra, GucSource source);
 extern const char *show_in_hot_standby(void);
+extern bool check_io_data_direct(bool *newval, void **extra, GucSource source);
+extern bool check_io_wal_direct(bool *newval, void **extra, GucSource source);
 extern bool check_locale_messages(char **newval, void **extra, GucSource source);
 extern void assign_locale_messages(const char *newval, void *extra);
 extern bool check_locale_monetary(char **newval, void **extra, GucSource source);
-- 
2.35.1

