From ca0fd0769f08e8cdf65fc04baf12dcbf095b79a0 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Sun, 27 Sep 2020 17:22:10 +1300
Subject: [PATCH] Use syncfs() for SyncDataDirectory() on Linux.

Opening every file can be slow, as the first step before crash recovery
can begin.  For the same effect, call syncfs() on every possibly
different filesystem.  This avoids faulting in inodes for potentially
very many inodes.
---
 configure                     |  2 +-
 configure.ac                  |  1 +
 src/backend/storage/file/fd.c | 59 +++++++++++++++++++++++++++++++++++
 src/include/pg_config.h.in    |  3 ++
 4 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/configure b/configure
index 19a3cd09a0..a3e5d6459d 100755
--- a/configure
+++ b/configure
@@ -15155,7 +15155,7 @@ fi
 LIBS_including_readline="$LIBS"
 LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 
-for ac_func in backtrace_symbols clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memset_s poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink setproctitle setproctitle_fast setsid shm_open strchrnul strsignal symlink sync_file_range uselocale wcstombs_l
+for ac_func in backtrace_symbols clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit kqueue mbstowcs_l memset_s poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink setproctitle setproctitle_fast setsid shm_open strchrnul strsignal symlink syncfs sync_file_range uselocale wcstombs_l
 do :
   as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
 ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
diff --git a/configure.ac b/configure.ac
index 6b9d0487a8..bad4159a94 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1670,6 +1670,7 @@ AC_CHECK_FUNCS(m4_normalize([
 	strchrnul
 	strsignal
 	symlink
+	syncfs
 	sync_file_range
 	uselocale
 	wcstombs_l
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index bd72a87ee3..6da498b08a 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -72,9 +72,11 @@
 
 #include "postgres.h"
 
+#include <dirent.h>
 #include <sys/file.h>
 #include <sys/param.h>
 #include <sys/stat.h>
+#include <sys/types.h>
 #ifndef WIN32
 #include <sys/mman.h>
 #endif
@@ -330,10 +332,12 @@ static void walkdir(const char *path,
 					void (*action) (const char *fname, bool isdir, int elevel),
 					bool process_symlinks,
 					int elevel);
+#ifndef HAVE_SYNCFS
 #ifdef PG_FLUSH_DATA_WORKS
 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
 #endif
 static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
+#endif
 static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
 
 static int	fsync_parent_path(const char *fname, int elevel);
@@ -3231,6 +3235,23 @@ looks_like_temp_rel_name(const char *name)
 	return true;
 }
 
+#ifdef HAVE_SYNCFS
+static void
+do_syncfs(const char *path)
+{
+	int		fd;
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+	{
+		elog(LOG, "could not open %s: %m", path);
+		return;
+	}
+	if (syncfs(fd) < 0)
+		elog(LOG, "syncfs failed for %s: %m", path);
+	close(fd);
+}
+#endif
 
 /*
  * Issue fsync recursively on PGDATA and all its contents.
@@ -3257,6 +3278,10 @@ void
 SyncDataDirectory(void)
 {
 	bool		xlog_is_symlink;
+#ifdef HAVE_SYNCFS
+	DIR		   *dir;
+	struct dirent *de;
+#endif
 
 	/* We can skip this whole thing if fsync is disabled. */
 	if (!enableFsync)
@@ -3285,6 +3310,36 @@ SyncDataDirectory(void)
 		xlog_is_symlink = true;
 #endif
 
+#ifdef HAVE_SYNCFS
+
+	/*
+	 * On Linux, we don't have to open every single file one by one.  We can
+	 * use syncfs() to sync whole filesystems.  We only expect filesystem
+	 * boundaries to exist where we tolerate symlinks, namely pg_wal and the
+	 * tablespaces, so we call syncfs() for each of those directories.
+	 */
+
+	/* Sync the top level pgdata directory. */
+	do_syncfs(".");
+	/* If any tablespaces are configured, sync each of those. */
+	dir = AllocateDir("pg_tblspc");
+	while ((de = ReadDir(dir, "pg_tblspc")))
+	{
+		char		path[MAXPGPATH];
+
+		if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+			continue;
+
+		snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
+		do_syncfs(path);
+	}
+	FreeDir(dir);
+	/* If pg_wal is a symlink, process that too. */
+	if (xlog_is_symlink)
+		do_syncfs("pg_wal");
+
+#else
+
 	/*
 	 * If possible, hint to the kernel that we're soon going to fsync the data
 	 * directory and its contents.  Errors in this step are even less
@@ -3310,6 +3365,8 @@ SyncDataDirectory(void)
 	if (xlog_is_symlink)
 		walkdir("pg_wal", datadir_fsync_fname, false, LOG);
 	walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
+
+#endif		/* !HAVE_SYNCFS */
 }
 
 /*
@@ -3382,6 +3439,7 @@ walkdir(const char *path,
 }
 
 
+#ifndef HAVE_SYNCFS
 /*
  * Hint to the OS that it should get ready to fsync() this file.
  *
@@ -3434,6 +3492,7 @@ datadir_fsync_fname(const char *fname, bool isdir, int elevel)
 	 */
 	fsync_fname_ext(fname, isdir, true, elevel);
 }
+#endif
 
 static void
 unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index fb270df678..68a5c4bb1b 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -575,6 +575,9 @@
 /* Define to 1 if you have the `symlink' function. */
 #undef HAVE_SYMLINK
 
+/* Define to 1 if you have the `syncfs' function. */
+#undef HAVE_SYNCFS
+
 /* Define to 1 if you have the `sync_file_range' function. */
 #undef HAVE_SYNC_FILE_RANGE
 
-- 
2.20.1

