From 3db83b7289b85d1c84c5490e1d43e378b5ed3053 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 13 Dec 2022 16:54:18 +1300
Subject: [PATCH v4 2/3] Add io_direct setting (developer-only).

Provide a way to ask the kernel to use O_DIRECT (or local equivalent)
for data and WAL files.  This hurts performance currently and is not
intended for end-users yet.  Later proposed work would introduce our own
I/O clustering, read-ahead, etc. to replace the kernel features that are
disabled with this option.

The only user-visible change, if the developer-only GUC is not used, is
that this commit also removes the obscure logic that would activate
O_DIRECT for the WAL when wal_sync_method=open_[data]sync and
wal_level=minimal (which also requires max_wal_senders=0).  Those are
non-default and unlikely settings, and this behavior wasn't (correctly)
documented.  In the unlikely event that a user wants that functionality
back, io_direct=wal is a more direct way to say so.

Author: Thomas Munro <thomas.munro@gmail.com>
Author: Andres Freund <andres@anarazel.de>
Author: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Reviewed-by: Justin Pryzby <pryzby@telsasoft.com>
Reviewed-by: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGK1X532hYqJ_MzFWt0n1zt8trz980D79WbjwnT-yYLZpg%40mail.gmail.com
---
 doc/src/sgml/config.sgml                      | 34 ++++++++-
 src/backend/access/transam/xlog.c             | 37 ++++-----
 src/backend/access/transam/xlogprefetcher.c   |  2 +-
 src/backend/storage/buffer/bufmgr.c           | 16 ++--
 src/backend/storage/buffer/localbuf.c         |  7 +-
 src/backend/storage/file/fd.c                 | 76 +++++++++++++++++++
 src/backend/storage/smgr/md.c                 | 24 ++++--
 src/backend/storage/smgr/smgr.c               |  1 +
 src/backend/utils/misc/guc_tables.c           | 12 +++
 src/include/storage/fd.h                      |  7 ++
 src/include/storage/smgr.h                    |  1 +
 src/include/utils/guc_hooks.h                 |  2 +
 src/test/modules/test_misc/meson.build        |  1 +
 src/test/modules/test_misc/t/004_io_direct.pl | 48 ++++++++++++
 14 files changed, 233 insertions(+), 35 deletions(-)
 create mode 100644 src/test/modules/test_misc/t/004_io_direct.pl

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 25111d5caf..fc885c43a8 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -3155,7 +3155,6 @@ include_dir 'conf.d'
         </listitem>
        </itemizedlist>
        <para>
-        The <literal>open_</literal>* options also use <literal>O_DIRECT</literal> if available.
         Not all of these choices are available on all platforms.
         The default is the first method in the above list that is supported
         by the platform, except that <literal>fdatasync</literal> is the default on
@@ -11241,6 +11240,39 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-io-direct" xreflabel="io_direct">
+      <term><varname>io_direct</varname> (<type>string</type>)
+      <indexterm>
+        <primary><varname>io_direct</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Ask the kernel to minimize caching effects for relation data and WAL
+        files using <literal>O_DIRECT</literal> (most Unix-like systems),
+        <literal>F_NOCACHE</literal> (macOS) or
+        <literal>FILE_FLAG_NO_BUFFERING</literal> (Windows).
+       </para>
+       <para>
+        May be set to an empty string (the default) to disable use of direct
+        I/O, or a comma-separated list of types of files for which direct I/O
+        is enabled.  The valid types of file are <literal>data</literal> for
+        main data files, <literal>wal</literal> for WAL files, and
+        <literal>wal_init</literal> for WAL files when being initially
+        allocated.
+       </para>
+       <para>
+        Some operating systems and file systems do not support direct I/O, so
+        non-default settings may be rejected at startup, or produce I/O errors
+        at runtime.
+       </para>
+       <para>
+        Currently this feature reduces performance, and is intended for
+        developer testing only.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-post-auth-delay" xreflabel="post_auth_delay">
       <term><varname>post_auth_delay</varname> (<type>integer</type>)
       <indexterm>
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 3fea8c4082..7a555d8701 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2926,6 +2926,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 	XLogSegNo	max_segno;
 	int			fd;
 	int			save_errno;
+	int			open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY;
 
 	Assert(logtli != 0);
 
@@ -2959,8 +2960,11 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 
 	unlink(tmppath);
 
+	if (io_direct_flags & IO_DIRECT_WAL_INIT)
+		open_flags |= PG_O_DIRECT;
+
 	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
-	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+	fd = BasicOpenFile(tmppath, open_flags);
 	if (fd < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
@@ -3354,7 +3358,7 @@ XLogFileClose(void)
 	 * use the cache to read the WAL segment.
 	 */
 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
-	if (!XLogIsNeeded())
+	if (!XLogIsNeeded() && (io_direct_flags & IO_DIRECT_WAL) == 0)
 		(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
 #endif
 
@@ -4445,7 +4449,6 @@ show_in_hot_standby(void)
 	return RecoveryInProgress() ? "on" : "off";
 }
 
-
 /*
  * Read the control file, set respective GUCs.
  *
@@ -8030,35 +8033,27 @@ xlog_redo(XLogReaderState *record)
 }
 
 /*
- * Return the (possible) sync flag used for opening a file, depending on the
- * value of the GUC wal_sync_method.
+ * Return the extra open flags used for opening a file, depending on the
+ * value of the GUCs wal_sync_method, fsync and io_direct.
  */
 static int
 get_sync_bit(int method)
 {
 	int			o_direct_flag = 0;
 
-	/* If fsync is disabled, never open in sync mode */
-	if (!enableFsync)
-		return 0;
-
 	/*
-	 * Optimize writes by bypassing kernel cache with O_DIRECT when using
-	 * O_SYNC and O_DSYNC.  But only if archiving and streaming are disabled,
-	 * otherwise the archive command or walsender process will read the WAL
-	 * soon after writing it, which is guaranteed to cause a physical read if
-	 * we bypassed the kernel cache. We also skip the
-	 * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
-	 * reason.
-	 *
-	 * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
+	 * Use O_DIRECT if requested, except in walreceiver process.  The WAL
 	 * written by walreceiver is normally read by the startup process soon
-	 * after it's written. Also, walreceiver performs unaligned writes, which
+	 * after it's written.  Also, walreceiver performs unaligned writes, which
 	 * don't work with O_DIRECT, so it is required for correctness too.
 	 */
-	if (!XLogIsNeeded() && !AmWalReceiverProcess())
+	if ((io_direct_flags & IO_DIRECT_WAL) && !AmWalReceiverProcess())
 		o_direct_flag = PG_O_DIRECT;
 
+	/* If fsync is disabled, never open in sync mode */
+	if (!enableFsync)
+		return o_direct_flag;
+
 	switch (method)
 	{
 			/*
@@ -8070,7 +8065,7 @@ get_sync_bit(int method)
 		case SYNC_METHOD_FSYNC:
 		case SYNC_METHOD_FSYNC_WRITETHROUGH:
 		case SYNC_METHOD_FDATASYNC:
-			return 0;
+			return o_direct_flag;
 #ifdef O_SYNC
 		case SYNC_METHOD_OPEN:
 			return O_SYNC | o_direct_flag;
diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c
index 046e40d143..7ba18f2a76 100644
--- a/src/backend/access/transam/xlogprefetcher.c
+++ b/src/backend/access/transam/xlogprefetcher.c
@@ -785,7 +785,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
 				block->prefetch_buffer = InvalidBuffer;
 				return LRQ_NEXT_IO;
 			}
-			else
+			else if ((io_direct_flags & IO_DIRECT_DATA) == 0)
 			{
 				/*
 				 * This shouldn't be possible, because we already determined
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 033f230b1d..5cb026d1ca 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -541,8 +541,11 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln,
 		 * Try to initiate an asynchronous read.  This returns false in
 		 * recovery if the relation file doesn't exist.
 		 */
-		if (smgrprefetch(smgr_reln, forkNum, blockNum))
+		if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
+			smgrprefetch(smgr_reln, forkNum, blockNum))
+		{
 			result.initiated_io = true;
+		}
 #endif							/* USE_PREFETCH */
 	}
 	else
@@ -588,11 +591,11 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln,
  * the kernel and therefore didn't really initiate I/O, and no way to know when
  * the I/O completes other than using synchronous ReadBuffer().
  *
- * 3.  Otherwise, the buffer wasn't already cached by PostgreSQL, and either
+ * 3.  Otherwise, the buffer wasn't already cached by PostgreSQL, and
  * USE_PREFETCH is not defined (this build doesn't support prefetching due to
- * lack of a kernel facility), or the underlying relation file wasn't found and
- * we are in recovery.  (If the relation file wasn't found and we are not in
- * recovery, an error is raised).
+ * lack of a kernel facility), direct I/O is enabled, or the underlying
+ * relation file wasn't found and we are in recovery.  (If the relation file
+ * wasn't found and we are not in recovery, an error is raised).
  */
 PrefetchBufferResult
 PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
@@ -5451,6 +5454,9 @@ ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
 {
 	PendingWriteback *pending;
 
+	if (io_direct_flags & IO_DIRECT_DATA)
+		return;
+
 	/*
 	 * Add buffer to the pending writeback array, unless writeback control is
 	 * disabled.
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index aae02949ce..c6384c9fde 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -92,8 +92,11 @@ PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum,
 	{
 #ifdef USE_PREFETCH
 		/* Not in buffers, so initiate prefetch */
-		smgrprefetch(smgr, forkNum, blockNum);
-		result.initiated_io = true;
+		if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
+			smgrprefetch(smgr, forkNum, blockNum))
+		{
+			result.initiated_io = true;
+		}
 #endif							/* USE_PREFETCH */
 	}
 
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index a280a1e7be..ccc789dc03 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -98,7 +98,9 @@
 #include "storage/fd.h"
 #include "storage/ipc.h"
 #include "utils/guc.h"
+#include "utils/guc_hooks.h"
 #include "utils/resowner_private.h"
+#include "utils/varlena.h"
 
 /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
 #if defined(HAVE_SYNC_FILE_RANGE)
@@ -162,6 +164,9 @@ bool		data_sync_retry = false;
 /* How SyncDataDirectory() should do its job. */
 int			recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC;
 
+/* Which kinds of files should be opened with PG_O_DIRECT. */
+int			io_direct_flags;
+
 /* Debugging.... */
 
 #ifdef FDDEBUG
@@ -2022,6 +2027,9 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
 	if (nbytes <= 0)
 		return;
 
+	if (VfdCache[file].fileFlags & PG_O_DIRECT)
+		return;
+
 	returnCode = FileAccess(file);
 	if (returnCode < 0)
 		return;
@@ -3826,3 +3834,71 @@ data_sync_elevel(int elevel)
 {
 	return data_sync_retry ? elevel : PANIC;
 }
+
+/*
+ * GUC check hook for io_direct: convert the list in *newval to IO_DIRECT_*
+ * bits and save them in *extra for assign_io_direct().  Returns false, with
+ * an error detail set, if the value is not acceptable.
+ */
+bool
+check_io_direct(char **newval, void **extra, GucSource source)
+{
+	int			flags = 0;
+
+#if PG_O_DIRECT == 0
+	if (strcmp(*newval, "") != 0)
+	{
+		GUC_check_errdetail("io_direct is not supported on this platform.");
+		return false;
+	}
+#else
+	List	   *elemlist;
+	ListCell   *l;
+	char	   *rawstring = pstrdup(*newval);	/* need a modifiable copy */
+
+	if (!SplitGUCList(rawstring, ',', &elemlist))
+	{
+		GUC_check_errdetail("Invalid list syntax in parameter \"%s\".",
+							"io_direct");
+		pfree(rawstring);
+		list_free(elemlist);
+		return false;
+	}
+
+	foreach(l, elemlist)
+	{
+		char	   *item = (char *) lfirst(l);
+
+		if (pg_strcasecmp(item, "data") == 0)
+			flags |= IO_DIRECT_DATA;
+		else if (pg_strcasecmp(item, "wal") == 0)
+			flags |= IO_DIRECT_WAL;
+		else if (pg_strcasecmp(item, "wal_init") == 0)
+			flags |= IO_DIRECT_WAL_INIT;
+		else
+		{
+			GUC_check_errdetail("Invalid option \"%s\".", item);
+			pfree(rawstring);
+			list_free(elemlist);
+			return false;
+		}
+	}
+
+	pfree(rawstring);
+	list_free(elemlist);
+#endif
+
+	/* Save the flags in *extra, for use by assign_io_direct */
+	*extra = guc_malloc(ERROR, sizeof(int));
+	*((int *) *extra) = flags;
+
+	return true;
+}
+
+/* GUC assign hook for io_direct. */
+void
+assign_io_direct(const char *newval, void *extra)
+{
+	/* check_io_direct() stored the parsed flag bits in *extra */
+	io_direct_flags = *((int *) extra);
+}
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index efa9773a4d..5647abeffd 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -142,6 +142,16 @@ static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum,
 static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
 							  MdfdVec *seg);
 
+/* Open flags for relation data files; adds PG_O_DIRECT for io_direct=data. */
+static inline int
+_mdfd_open_flags(ForkNumber forkNum)
+{
+	int			direct;
+
+	/* forkNum is currently unused: every fork gets the same flags. */
+	direct = (io_direct_flags & IO_DIRECT_DATA) ? PG_O_DIRECT : 0;
+	return O_RDWR | PG_BINARY | direct;
+}
 
 /*
  *	mdinit() -- Initialize private state for magnetic disk storage manager.
@@ -205,14 +215,14 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
 
 	path = relpath(reln->smgr_rlocator, forknum);
 
-	fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+	fd = PathNameOpenFile(path, _mdfd_open_flags(forknum) | O_CREAT | O_EXCL);
 
 	if (fd < 0)
 	{
 		int			save_errno = errno;
 
 		if (isRedo)
-			fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+			fd = PathNameOpenFile(path, _mdfd_open_flags(forknum));
 		if (fd < 0)
 		{
 			/* be sure to report the error reported by create, not open */
@@ -635,7 +645,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
 
 	path = relpath(reln->smgr_rlocator, forknum);
 
-	fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+	fd = PathNameOpenFile(path, _mdfd_open_flags(forknum));
 
 	if (fd < 0)
 	{
@@ -706,6 +716,8 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 	off_t		seekpos;
 	MdfdVec    *v;
 
+	Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
+
 	v = _mdfd_getseg(reln, forknum, blocknum, false,
 					 InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL);
 	if (v == NULL)
@@ -731,6 +743,8 @@ void
 mdwriteback(SMgrRelation reln, ForkNumber forknum,
 			BlockNumber blocknum, BlockNumber nblocks)
 {
+	Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
+
 	/*
 	 * Issue flush requests in as few requests as possible; have to split at
 	 * segment boundaries though, since those are actually separate files.
@@ -1330,7 +1344,7 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
 	fullpath = _mdfd_segpath(reln, forknum, segno);
 
 	/* open the file */
-	fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags);
+	fd = PathNameOpenFile(fullpath, _mdfd_open_flags(forknum) | oflags);
 
 	pfree(fullpath);
 
@@ -1540,7 +1554,7 @@ mdsyncfiletag(const FileTag *ftag, char *path)
 		strlcpy(path, p, MAXPGPATH);
 		pfree(p);
 
-		file = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+		file = PathNameOpenFile(path, _mdfd_open_flags(ftag->forknum));
 		if (file < 0)
 			return -1;
 		need_to_close = true;
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index c37c246b77..70d0d570b1 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -20,6 +20,7 @@
 #include "access/xlogutils.h"
 #include "lib/ilist.h"
 #include "storage/bufmgr.h"
+#include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/md.h"
 #include "storage/smgr.h"
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index e8e8245e91..d3ed527e3b 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -568,6 +568,7 @@ static char *locale_ctype;
 static char *server_encoding_string;
 static char *server_version_string;
 static int	server_version_num;
+static char *io_direct_string;
 
 #ifdef HAVE_SYSLOG
 #define	DEFAULT_SYSLOG_FACILITY LOG_LOCAL0
@@ -4565,6 +4566,17 @@ struct config_string ConfigureNamesString[] =
 		check_backtrace_functions, assign_backtrace_functions, NULL
 	},
 
+	{
+		{"io_direct", PGC_POSTMASTER, DEVELOPER_OPTIONS,
+			gettext_noop("Use direct I/O for file access."),
+			NULL,
+			GUC_LIST_INPUT | GUC_NOT_IN_SAMPLE
+		},
+		&io_direct_string,
+		"",
+		check_io_direct, assign_io_direct, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, NULL, NULL, NULL, NULL
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index faac4914fe..6791a406fc 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -44,6 +44,7 @@
 #define FD_H
 
 #include <dirent.h>
+#include <fcntl.h>
 
 typedef enum RecoveryInitSyncMethod
 {
@@ -54,10 +55,16 @@ typedef enum RecoveryInitSyncMethod
 typedef int File;
 
 
+#define IO_DIRECT_DATA			0x01
+#define IO_DIRECT_WAL			0x02
+#define IO_DIRECT_WAL_INIT		0x04
+
+
 /* GUC parameter */
 extern PGDLLIMPORT int max_files_per_process;
 extern PGDLLIMPORT bool data_sync_retry;
 extern PGDLLIMPORT int recovery_init_sync_method;
+extern PGDLLIMPORT int io_direct_flags;
 
 /*
  * This is private to fd.c, but exported for save/restore_backend_variables()
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index a9a179aaba..17fba6f91a 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -17,6 +17,7 @@
 #include "lib/ilist.h"
 #include "storage/block.h"
 #include "storage/relfilelocator.h"
+#include "utils/guc.h"
 
 /*
  * smgr.c maintains a table of SMgrRelation objects, which are essentially
diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h
index f722fb250a..a82a85c940 100644
--- a/src/include/utils/guc_hooks.h
+++ b/src/include/utils/guc_hooks.h
@@ -156,5 +156,7 @@ extern bool check_wal_consistency_checking(char **newval, void **extra,
 										   GucSource source);
 extern void assign_wal_consistency_checking(const char *newval, void *extra);
 extern void assign_xlog_sync_method(int new_sync_method, void *extra);
+extern bool check_io_direct(char **newval, void **extra, GucSource source);
+extern void assign_io_direct(const char *newval, void *extra);
 
 #endif							/* GUC_HOOKS_H */
diff --git a/src/test/modules/test_misc/meson.build b/src/test/modules/test_misc/meson.build
index 21bde427b4..911084ac0f 100644
--- a/src/test/modules/test_misc/meson.build
+++ b/src/test/modules/test_misc/meson.build
@@ -9,6 +9,7 @@ tests += {
       't/001_constraint_validation.pl',
       't/002_tablespace.pl',
       't/003_check_guc.pl',
+      't/004_io_direct.pl',
     ],
   },
 }
diff --git a/src/test/modules/test_misc/t/004_io_direct.pl b/src/test/modules/test_misc/t/004_io_direct.pl
new file mode 100644
index 0000000000..78646e945e
--- /dev/null
+++ b/src/test/modules/test_misc/t/004_io_direct.pl
@@ -0,0 +1,48 @@
+# Very simple exercise of direct I/O GUC.
+
+use strict;
+use warnings;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Systems that we know to have direct I/O support, and whose typical local
+# filesystems support it or at least won't fail with an error.  (illumos should
+# probably be in this list, but perl reports it as solaris.  Solaris should not
+# be in the list because we don't support its way of turning on direct I/O, and
+# even if we did, its version of ZFS rejects it, and OpenBSD just doesn't have
+# it.)
+if (!grep { $^O eq $_ } qw(aix darwin dragonfly freebsd linux MSWin32 netbsd))
+{
+	plan skip_all => "no direct I/O support";
+}
+
+my $node = PostgreSQL::Test::Cluster->new('main');
+$node->init;
+$node->append_conf('postgresql.conf', qq{
+io_direct = 'data,wal,wal_init'
+shared_buffers = '256kB' # tiny to force I/O
+});
+$node->start;
+
+# Do some work that is bound to generate shared and local writes and reads as a
+# simple exercise.
+$node->safe_psql('postgres', 'create table t1 as select 1 as i from generate_series(1, 10000)');
+$node->safe_psql('postgres', 'create table t2count (i int)');
+$node->safe_psql('postgres', qq{
+begin;
+create temporary table t2 as select 1 as i from generate_series(1, 10000);
+update t2 set i = i;
+insert into t2count select count(*) from t2;
+commit;
+});
+$node->safe_psql('postgres', 'update t1 set i = i');
+is($node->safe_psql('postgres', 'select count(*) from t1'), '10000', "read back from shared");
+is($node->safe_psql('postgres', 'select * from t2count'), '10000', "read back from local");
+$node->stop('immediate');
+
+$node->start;
+is($node->safe_psql('postgres', 'select count(*) from t1'), '10000', "read back from shared after crash recovery");
+$node->stop;
+
+done_testing();
-- 
2.39.2

