From 62c107aa2e3c52192baf6105d6f13cd8601b7913 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 31 Jan 2023 20:01:49 +1300
Subject: [PATCH v3 1/3] Lock pg_control while reading or writing.

Front-end programs that read pg_control and user-facing functions run
in the backend read pg_control without acquiring ControlFileLock.  If
you're unlucky enough to read() while the backend is in write(), on at
least ext4 or NTFS, you might see partial data.  Use a POSIX advisory
file lock or a Windows mandatory file lock, to avoid this problem.
Since Windows mandatory locks might cause existing external backup tools
to fail in ReadFile(), lock one byte past the end so that only tools
that opt into this scheme are affected by it.

Reviewed-by: Anton A. Melnikov <aamelnikov@inbox.ru>
Discussion: https://postgr.es/m/20221123014224.xisi44byq3cf5psi%40awork3.anarazel.de
---
 src/backend/backup/basebackup.c |   5 ++
 src/common/controldata_utils.c  | 137 ++++++++++++++++++++++++++++++++
 2 files changed, 142 insertions(+)

diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c
index 3fb9451643..7e567300bc 100644
--- a/src/backend/backup/basebackup.c
+++ b/src/backend/backup/basebackup.c
@@ -39,6 +39,7 @@
 #include "storage/dsm_impl.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
+#include "storage/lwlock.h"
 #include "storage/reinit.h"
 #include "utils/builtins.h"
 #include "utils/guc.h"
@@ -345,8 +346,12 @@ perform_base_backup(basebackup_options *opt, bbsink *sink)
 							(errcode_for_file_access(),
 							 errmsg("could not stat file \"%s\": %m",
 									XLOG_CONTROL_FILE)));
+
+				/* Lock to avoid torn reads, if there is a concurrent write. */
+				LWLockAcquire(ControlFileLock, LW_SHARED);
 				sendFile(sink, XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf,
 						 false, InvalidOid, &manifest, NULL);
+				LWLockRelease(ControlFileLock);
 			}
 			else
 			{
diff --git a/src/common/controldata_utils.c b/src/common/controldata_utils.c
index 9723587466..e1b7b936ba 100644
--- a/src/common/controldata_utils.c
+++ b/src/common/controldata_utils.c
@@ -39,6 +39,102 @@
 #include "storage/fd.h"
 #endif
 
+/*
+ * Lock the control file until closed.  This is used for the benefit of
+ * frontend/external programs that want to take an atomic copy of the control
+ * file, when though it might be written to concurrently by the server.
+ * On some systems including Windows NTFS and Linux ext4, that might otherwise
+ * cause a torn read.
+ *
+ * (The server also reads and writes the control file directly sometimes, not
+ * through these routines; that's OK, because it uses ControlFileLock, or no
+ * locking when it expects no concurrent writes.)
+ */
+static int
+lock_controlfile(int fd, bool exclusive)
+{
+#ifdef WIN32
+	OVERLAPPED	overlapped = {.Offset = PG_CONTROL_FILE_SIZE};
+	HANDLE		handle;
+
+	handle = (HANDLE) _get_osfhandle(fd);
+	if (handle == INVALID_HANDLE_VALUE)
+	{
+		errno = EBADF;
+		return -1;
+	}
+
+	/*
+	 * On Windows, we lock the first byte past the end of the control file (see
+	 * overlapped.Offset).  This means that we shouldn't cause errors in
+	 * external tools that just want to read the file, but we can block other
+	 * processes using this routine or that know about this protocol.  This
+	 * provides an approximation of Unix's "advisory" locking.
+	 */
+	if (!LockFileEx(handle,
+					exclusive ? LOCKFILE_EXCLUSIVE_LOCK : 0,
+					0,
+					1,
+					0,
+					&overlapped))
+	{
+		_dosmaperr(GetLastError());
+		return -1;
+	}
+
+	return 0;
+#else
+	struct flock lock;
+	int			rc;
+
+	memset(&lock, 0, sizeof(lock));
+	lock.l_type = exclusive ? F_WRLCK : F_RDLCK;
+	lock.l_start = 0;
+	lock.l_whence = SEEK_SET;
+	lock.l_len = 0;
+	lock.l_pid = -1;
+
+	do
+	{
+		rc = fcntl(fd, F_SETLKW, &lock);
+	}
+	while (rc < 0 && errno == EINTR);
+
+	return rc;
+#endif
+}
+
+#ifdef WIN32
+/*
+ * Release lock acquire with lock_controlfile().  On POSIX systems, we don't
+ * bother making an extra system call to release the lock, since it'll be
+ * released on close anyway.  On Windows, explicit release is recommended by
+ * the documentation to make sure it is done synchronously.
+ */
+static int
+unlock_controlfile(int fd)
+{
+	OVERLAPPED	overlapped = {.Offset = PG_CONTROL_FILE_SIZE};
+	HANDLE		handle;
+
+	handle = (HANDLE) _get_osfhandle(fd);
+	if (handle == INVALID_HANDLE_VALUE)
+	{
+		errno = EBADF;
+		return -1;
+	}
+
+	/* Unlock first byte. */
+	if (!UnlockFileEx(handle, 0, 1, 0, &overlapped))
+	{
+		_dosmaperr(GetLastError());
+		return -1;
+	}
+
+	return 0;
+}
+#endif
+
 /*
  * get_controlfile()
  *
@@ -74,6 +170,20 @@ get_controlfile(const char *DataDir, bool *crc_ok_p)
 				 ControlFilePath);
 #endif
 
+	/* Make sure we can read the file atomically. */
+	if (lock_controlfile(fd, false) < 0)
+	{
+#ifndef FRONTEND
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not lock file \"%s\" for reading: %m",
+						ControlFilePath)));
+#else
+		pg_fatal("could not lock file \"%s\" for reading: %m",
+				 ControlFilePath);
+#endif
+	}
+
 	r = read(fd, ControlFile, sizeof(ControlFileData));
 	if (r != sizeof(ControlFileData))
 	{
@@ -97,6 +207,10 @@ get_controlfile(const char *DataDir, bool *crc_ok_p)
 #endif
 	}
 
+#ifdef WIN32
+	unlock_controlfile(fd);
+#endif
+
 #ifndef FRONTEND
 	if (CloseTransientFile(fd) != 0)
 		ereport(ERROR,
@@ -186,6 +300,25 @@ update_controlfile(const char *DataDir,
 		pg_fatal("could not open file \"%s\": %m", ControlFilePath);
 #endif
 
+	/*
+	 * Make sure that any concurrent reader (including frontend programs) can
+	 * read the file atomically.  Note that this refers to atomicity of
+	 * concurrent reads and writes.  For our assumption of atomicity under
+	 * power failure, see PG_CONTROL_MAX_SAFE_SIZE.
+	 */
+	if (lock_controlfile(fd, true) < 0)
+	{
+#ifndef FRONTEND
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not lock file \"%s\" for writing: %m",
+						ControlFilePath)));
+#else
+		pg_fatal("could not lock file \"%s\" for writing: %m",
+				 ControlFilePath);
+#endif
+	}
+
 	errno = 0;
 #ifndef FRONTEND
 	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE);
@@ -225,6 +358,10 @@ update_controlfile(const char *DataDir,
 #endif
 	}
 
+#ifdef WIN32
+	unlock_controlfile(fd);
+#endif
+
 	if (close(fd) != 0)
 	{
 #ifndef FRONTEND
-- 
2.39.2

