From 816760e52a1233b29674869205ce9283aab28a73 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 31 Jan 2023 20:01:49 +1300
Subject: [PATCH 1/3] Lock pg_control while reading or writing.

Front-end programs that read pg_control and user-facing functions run
in the backend read pg_control without acquiring ControlFileLock.  If
you're unlucky enough to read() while the backend is in write(), on at
least ext4 or NTFS, you might see partial data.  Use a POSIX advisory
file lock or a Windows mandatory file lock, to avoid this problem.

Since the semantics of POSIX and Windows file locks are different
and error-prone, don't try to provide a general purpose reusable
function for those operations; keep them close to the control file code.

Reviewed-by: Anton A. Melnikov <aamelnikov@inbox.ru>
Discussion: https://postgr.es/m/20221123014224.xisi44byq3cf5psi%40awork3.anarazel.de
---
 src/backend/access/transam/xlog.c      |  21 +++++
 src/common/controldata_utils.c         | 126 +++++++++++++++++++++++++
 src/include/common/controldata_utils.h |   4 +
 3 files changed, 151 insertions(+)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index f9f0f6db8d..9b9c3294e8 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -3980,6 +3980,19 @@ ReadControlFile(void)
 				 errmsg("could not open file \"%s\": %m",
 						XLOG_CONTROL_FILE)));
 
+	/*
+	 * Since file locks are released asynchronously after a program crashes on
+	 * Windows, there is a small chance that the write lock of a previously
+	 * running postmaster hasn't been released yet.  We need to wait for that
+	 * here; if we didn't, our read might fail, because Windows file locks are
+	 * mandatory (that is, not advisory).
+	 */
+	if (lock_controlfile(fd, false) < 0)
+		ereport(PANIC,
+				(errcode_for_file_access(),
+				 errmsg("could not lock file \"%s\": %m",
+						XLOG_CONTROL_FILE)));
+
 	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
 	r = read(fd, ControlFile, sizeof(ControlFileData));
 	if (r != sizeof(ControlFileData))
@@ -3997,6 +4010,14 @@ ReadControlFile(void)
 	}
 	pgstat_report_wait_end();
 
+#ifdef WIN32
+	if (unlock_controlfile(fd) < 0)
+		ereport(PANIC,
+				(errcode_for_file_access(),
+				 errmsg("could not unlock file \"%s\": %m",
+						XLOG_CONTROL_FILE)));
+#endif
+
 	close(fd);
 
 	/*
diff --git a/src/common/controldata_utils.c b/src/common/controldata_utils.c
index 9723587466..45b43ae546 100644
--- a/src/common/controldata_utils.c
+++ b/src/common/controldata_utils.c
@@ -39,6 +39,91 @@
 #include "storage/fd.h"
 #endif
 
+/*
+ * Lock the control file until closed.  This should only be used on file where
+ * only one descriptor is open in a given process (due to weird semantics of
+ * POSIX locks).  On Windows, see unlock_controlfile(), but otherwise the lock
+ * is held until the file is closed.
+ */
+int
+lock_controlfile(int fd, bool exclusive)
+{
+#ifdef WIN32
+	OVERLAPPED	overlapped = {0};
+	HANDLE		handle;
+
+	handle = (HANDLE) _get_osfhandle(fd);
+	if (handle == INVALID_HANDLE_VALUE)
+	{
+		errno = EBADF;
+		return -1;
+	}
+
+	/* Lock first byte. */
+	if (!LockFileEx(handle,
+					exclusive ? LOCKFILE_EXCLUSIVE_LOCK : 0,
+					0,
+					1,
+					0,
+					&overlapped))
+	{
+		_dosmaperr(GetLastError());
+		return -1;
+	}
+
+	return 0;
+#else
+	struct flock lock;
+	int			rc;
+
+	memset(&lock, 0, sizeof(lock));
+	lock.l_type = exclusive ? F_WRLCK : F_RDLCK;
+	lock.l_start = 0;
+	lock.l_whence = SEEK_SET;
+	lock.l_len = 0;
+	lock.l_pid = -1;
+
+	do
+	{
+		rc = fcntl(fd, F_SETLKW, &lock);
+	}
+	while (rc < 0 && errno == EINTR);
+
+	return rc;
+#endif
+}
+
+#ifdef WIN32
+/*
+ * Release lock acquire with lock_controlfile().  On POSIX systems, we don't
+ * bother making an extra system call to release the lock, since it'll be
+ * released on close anyway.  On Windows, explicit release is recommended by
+ * the documentation to make sure it is done synchronously.
+ */
+int
+unlock_controlfile(int fd)
+{
+	OVERLAPPED	overlapped = {0};
+	HANDLE		handle;
+
+	handle = (HANDLE) _get_osfhandle(fd);
+	if (handle == INVALID_HANDLE_VALUE)
+	{
+		errno = EBADF;
+		return -1;
+	}
+
+	/* Unlock first byte. */
+	if (!UnlockFileEx(handle, 0, 1, 0, &overlapped))
+	{
+		_dosmaperr(GetLastError());
+		return -1;
+	}
+
+	return 0;
+}
+#endif
+
 /*
  * get_controlfile()
  *
@@ -74,6 +159,20 @@ get_controlfile(const char *DataDir, bool *crc_ok_p)
 				 ControlFilePath);
 #endif
 
+	/* Make sure we can read the file atomically. */
+	if (lock_controlfile(fd, false) < 0)
+	{
+#ifndef FRONTEND
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not lock file \"%s\" for reading: %m",
+						ControlFilePath)));
+#else
+		pg_fatal("could not lock file \"%s\" for reading: %m",
+				 ControlFilePath);
+#endif
+	}
+
 	r = read(fd, ControlFile, sizeof(ControlFileData));
 	if (r != sizeof(ControlFileData))
 	{
@@ -97,6 +196,10 @@ get_controlfile(const char *DataDir, bool *crc_ok_p)
 #endif
 	}
 
+#ifdef WIN32
+	unlock_controlfile(fd);
+#endif
+
 #ifndef FRONTEND
 	if (CloseTransientFile(fd) != 0)
 		ereport(ERROR,
@@ -186,6 +289,25 @@ update_controlfile(const char *DataDir,
 		pg_fatal("could not open file \"%s\": %m", ControlFilePath);
 #endif
 
+	/*
+	 * Make sure that any concurrent reader (including frontend programs) can
+	 * read the file atomically.  Note that this refers to atomicity of
+	 * concurrent reads and writes.  For our assumption of atomicity under
+	 * power failure, see PG_CONTROL_MAX_SAFE_SIZE.
+	 */
+	if (lock_controlfile(fd, true) < 0)
+	{
+#ifndef FRONTEND
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not lock file \"%s\" for writing: %m",
+						ControlFilePath)));
+#else
+		pg_fatal("could not lock file \"%s\" for writing: %m",
+				 ControlFilePath);
+#endif
+	}
+
 	errno = 0;
 #ifndef FRONTEND
 	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE);
@@ -225,6 +347,10 @@ update_controlfile(const char *DataDir,
 #endif
 	}
 
+#ifdef WIN32
+	unlock_controlfile(fd);
+#endif
+
 	if (close(fd) != 0)
 	{
 #ifndef FRONTEND
diff --git a/src/include/common/controldata_utils.h b/src/include/common/controldata_utils.h
index 49e7c52d31..06825cad85 100644
--- a/src/include/common/controldata_utils.h
+++ b/src/include/common/controldata_utils.h
@@ -15,5 +15,9 @@
 extern ControlFileData *get_controlfile(const char *DataDir, bool *crc_ok_p);
 extern void update_controlfile(const char *DataDir,
 							   ControlFileData *ControlFile, bool do_sync);
+extern int lock_controlfile(int fd, bool exclusive);
+#ifdef WIN32
+extern int unlock_controlfile(int fd);
+#endif
 
 #endif							/* COMMON_CONTROLDATA_UTILS_H */
-- 
2.35.1

