From 4eb14fd2b6eaf7ea61932e2d9684a8b4dfc1e097 Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Tue, 13 Sep 2016 13:00:51 +0900
Subject: [PATCH 2/4] Issue fsync more carefully in pg_receivexlog and
 pg_basebackup [-X] stream.

Several places weren't careful about fsyncing in the way. See 1d4a0ab1
and 606e0f98 for details about required fsyns.

This adds a couple of routines in fe_utils which have an equivalent in the
backend:
- durable_rename
- fsync_parent_path
---
 src/bin/pg_basebackup/pg_basebackup.c |  27 +++++++++
 src/bin/pg_basebackup/receivelog.c    |  56 ++++++++++-------
 src/common/file_utils.c               | 110 ++++++++++++++++++++++++++++++++--
 src/include/common/file_utils.h       |   7 ++-
 4 files changed, 171 insertions(+), 29 deletions(-)

diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index 42f3b27..2266e34 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -25,6 +25,7 @@
 #include <zlib.h>
 #endif
 
+#include "common/file_utils.h"
 #include "common/string.h"
 #include "fe_utils/string_utils.h"
 #include "getopt_long.h"
@@ -1194,6 +1195,10 @@ ReceiveTarFile(PGconn *conn, PGresult *res, int rownum)
 
 	if (copybuf != NULL)
 		PQfreemem(copybuf);
+
+	/* sync the resulting tar file, errors are not considered fatal */
+	if (strcmp(basedir, "-") != 0)
+		(void) fsync_fname(filename, false, progname);
 }
 
 
@@ -1470,6 +1475,11 @@ ReceiveAndUnpackTarFile(PGconn *conn, PGresult *res, int rownum)
 
 	if (basetablespace && writerecoveryconf)
 		WriteRecoveryConf();
+
+	/*
+	 * No data is synced here, everything is done for all tablespaces at the
+	 * end.
+	 */
 }
 
 /*
@@ -1948,6 +1958,23 @@ BaseBackup(void)
 	PQclear(res);
 	PQfinish(conn);
 
+	/*
+	 * Make data persistent on disk once backup is completed. For tar
+	 * format once syncing the parent directory is fine, each tar file
+	 * created per tablespace has been already synced. In plain format,
+	 * all the data of the base directory is synced, taking into account
+	 * all the tablespaces. Errors are not considered fatal.
+	 */
+	if (format == 't')
+	{
+		if (strcmp(basedir, "-") != 0)
+			(void) fsync_fname(basedir, true, progname);
+	}
+	else
+	{
+		(void) fsync_pgdata(basedir, progname);
+	}
+
 	if (verbose)
 		fprintf(stderr, "%s: base backup completed\n", progname);
 }
diff --git a/src/bin/pg_basebackup/receivelog.c b/src/bin/pg_basebackup/receivelog.c
index 062730b..0a740ce 100644
--- a/src/bin/pg_basebackup/receivelog.c
+++ b/src/bin/pg_basebackup/receivelog.c
@@ -23,6 +23,7 @@
 
 #include "libpq-fe.h"
 #include "access/xlog_internal.h"
+#include "common/file_utils.h"
 
 
 /* fd and filename for currently open WAL file */
@@ -68,17 +69,13 @@ mark_file_as_archived(const char *basedir, const char *fname)
 		return false;
 	}
 
-	if (fsync(fd) != 0)
-	{
-		fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
-				progname, tmppath, strerror(errno));
-
-		close(fd);
+	close(fd);
 
+	if (fsync_fname(tmppath, false, progname) != 0)
 		return false;
-	}
 
-	close(fd);
+	if (fsync_parent_path(tmppath, progname) != 0)
+		return false;
 
 	return true;
 }
@@ -116,6 +113,10 @@ open_walfile(StreamCtl *stream, XLogRecPtr startpoint)
 	/*
 	 * Verify that the file is either empty (just created), or a complete
 	 * XLogSegSize segment. Anything in between indicates a corrupt file.
+	 *
+	 * XXX: This means that we might not restart if a crash occurs before the
+	 * fsync below. We probably should create the file in a temporary path
+	 * like the backend does...
 	 */
 	if (fstat(f, &statbuf) != 0)
 	{
@@ -129,6 +130,16 @@ open_walfile(StreamCtl *stream, XLogRecPtr startpoint)
 	{
 		/* File is open and ready to use */
 		walfile = f;
+
+		/*
+		 * fsync, in case of a previous crash between padding and fsyncing the
+		 * file.
+		 */
+		if (fsync_fname(fn, false, progname) != 0)
+			return false;
+		if (fsync_parent_path(fn, progname) != 0)
+			return false;
+
 		return true;
 	}
 	if (statbuf.st_size != 0)
@@ -157,6 +168,17 @@ open_walfile(StreamCtl *stream, XLogRecPtr startpoint)
 	}
 	free(zerobuf);
 
+	/*
+	 * fsync WAL file and containing directory, to ensure the file is
+	 * persistently created and zeroed. That's particularly important when
+	 * using synchronous mode, where the file is modified and fsynced
+	 * in-place, without a directory fsync.
+	 */
+	if (fsync_fname(fn, false, progname) != 0)
+		return false;
+	if (fsync_parent_path(fn, progname) != 0)
+		return false;
+
 	if (lseek(f, SEEK_SET, 0) != 0)
 	{
 		fprintf(stderr,
@@ -217,10 +239,9 @@ close_walfile(StreamCtl *stream, XLogRecPtr pos)
 
 		snprintf(oldfn, sizeof(oldfn), "%s/%s%s", stream->basedir, current_walfile_name, stream->partial_suffix);
 		snprintf(newfn, sizeof(newfn), "%s/%s", stream->basedir, current_walfile_name);
-		if (rename(oldfn, newfn) != 0)
+		if (durable_rename(oldfn, newfn, progname) != 0)
 		{
-			fprintf(stderr, _("%s: could not rename file \"%s\": %s\n"),
-					progname, current_walfile_name, strerror(errno));
+			/* durable_rename produced a log entry */
 			return false;
 		}
 	}
@@ -338,14 +359,6 @@ writeTimeLineHistoryFile(StreamCtl *stream, char *filename, char *content)
 		return false;
 	}
 
-	if (fsync(fd) != 0)
-	{
-		close(fd);
-		fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
-				progname, tmppath, strerror(errno));
-		return false;
-	}
-
 	if (close(fd) != 0)
 	{
 		fprintf(stderr, _("%s: could not close file \"%s\": %s\n"),
@@ -356,10 +369,9 @@ writeTimeLineHistoryFile(StreamCtl *stream, char *filename, char *content)
 	/*
 	 * Now move the completed history file into place with its final name.
 	 */
-	if (rename(tmppath, path) < 0)
+	if (durable_rename(tmppath, path, progname) < 0)
 	{
-		fprintf(stderr, _("%s: could not rename file \"%s\" to \"%s\": %s\n"),
-				progname, tmppath, path, strerror(errno));
+		/* durable_rename produced a log entry */
 		return false;
 	}
 
diff --git a/src/common/file_utils.c b/src/common/file_utils.c
index b6f62f7..4e622cb 100644
--- a/src/common/file_utils.c
+++ b/src/common/file_utils.c
@@ -34,7 +34,7 @@ static void pre_sync_fname(const char *fname, bool isdir,
 						   const char *progname);
 #endif
 static void walkdir(const char *path,
-	void (*action) (const char *fname, bool isdir, const char *progname),
+	int (*action) (const char *fname, bool isdir, const char *progname),
 	bool process_symlinks, const char *progname);
 
 /*
@@ -120,7 +120,7 @@ fsync_pgdata(const char *pg_data, const char *progname)
  */
 static void
 walkdir(const char *path,
-		void (*action) (const char *fname, bool isdir, const char *progname),
+		int (*action) (const char *fname, bool isdir, const char *progname),
 		bool process_symlinks, const char *progname)
 {
 	DIR		   *dir;
@@ -228,7 +228,7 @@ pre_sync_fname(const char *fname, bool isdir, const char *progname)
  * directories on systems where that isn't allowed/required.  Reports
  * other errors non-fatally.
  */
-void
+int
 fsync_fname(const char *fname, bool isdir, const char *progname)
 {
 	int			fd;
@@ -256,10 +256,10 @@ fsync_fname(const char *fname, bool isdir, const char *progname)
 	if (fd < 0)
 	{
 		if (errno == EACCES || (isdir && errno == EISDIR))
-			return;
+			return 0;
 		fprintf(stderr, _("%s: could not open file \"%s\": %s\n"),
 				progname, fname, strerror(errno));
-		return;
+		return -1;
 	}
 
 	returncode = fsync(fd);
@@ -269,8 +269,108 @@ fsync_fname(const char *fname, bool isdir, const char *progname)
 	 * those errors. Anything else needs to be reported.
 	 */
 	if (returncode != 0 && !(isdir && errno == EBADF))
+	{
 		fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
 				progname, fname, strerror(errno));
+		return -1;
+	}
 
 	(void) close(fd);
+	return 0;
+}
+
+/*
+ * fsync_parent_path -- fsync the parent path of a file or directory
+ *
+ * This is aimed at making file operations persistent on disk in case of
+ * an OS crash or power failure.
+ */
+int
+fsync_parent_path(const char *fname, const char *progname)
+{
+	char		parentpath[MAXPGPATH];
+
+	strlcpy(parentpath, fname, MAXPGPATH);
+	get_parent_directory(parentpath);
+
+	/*
+	 * get_parent_directory() returns an empty string if the input argument is
+	 * just a file name (see comments in path.c), so handle that as being the
+	 * current directory.
+	 */
+	if (strlen(parentpath) == 0)
+		strlcpy(parentpath, ".", MAXPGPATH);
+
+	if (fsync_fname(parentpath, true, progname) != 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
+ *
+ * Wrapper around rename, similar to the backend version.
+ */
+int
+durable_rename(const char *oldfile, const char *newfile, const char *progname)
+{
+	int		fd;
+
+	/*
+	 * First fsync the old and target path (if it exists), to ensure that they
+	 * are properly persistent on disk. Syncing the target file is not
+	 * strictly necessary, but it makes it easier to reason about crashes;
+	 * because it's then guaranteed that either source or target file exists
+	 * after a crash.
+	 */
+	if (fsync_fname(oldfile, false, progname) != 0)
+		return -1;
+
+	fd = open(newfile, PG_BINARY | O_RDWR, 0);
+	if (fd < 0)
+	{
+		if (errno != ENOENT)
+		{
+			fprintf(stderr, _("%s: could not open file \"%s\": %s\n"),
+					progname, newfile, strerror(errno));
+			return -1;
+		}
+	}
+	else
+	{
+		if (fsync(fd) != 0)
+		{
+			int save_errno;
+
+			/* close file upon error */
+			save_errno = errno;
+			close(fd);
+			errno = save_errno;
+
+			fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
+					progname, newfile, strerror(errno));
+			return -1;
+		}
+	}
+
+	/* Time to do the real deal... */
+	if (rename(oldfile, newfile) != 0)
+	{
+		fprintf(stderr, _("%s: could not rename file \"%s\" to \"%s\": %s\n"),
+				progname, oldfile, newfile, strerror(errno));
+		return -1;
+	}
+
+	/*
+	 * To guarantee renaming the file is persistent, fsync the file with its
+	 * new name, and its containing directory.
+	 */
+	if (fsync_fname(newfile, false, progname) != 0)
+		return -1;
+
+	if (fsync_parent_path(newfile, progname) != 0)
+		return -1;
+
+	return 0;
 }
diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h
index d3794df..1cb263d 100644
--- a/src/include/common/file_utils.h
+++ b/src/include/common/file_utils.h
@@ -15,8 +15,11 @@
 #ifndef FILE_UTILS_H
 #define FILE_UTILS_H
 
-extern void fsync_fname(const char *fname, bool isdir,
-						const char *progname);
+extern int fsync_fname(const char *fname, bool isdir,
+					   const char *progname);
 extern void fsync_pgdata(const char *pg_data, const char *progname);
+extern int durable_rename(const char *oldfile, const char *newfile,
+						  const char *progname);
+extern int fsync_parent_path(const char *fname, const char *progname);
 
 #endif   /* FILE_UTILS_H */
-- 
2.10.0

