From 4240fd71ced2fca770b802cf08b03cab7a2ee89f Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 28 Jun 2021 12:03:02 +1200
Subject: [PATCH] Remove control file dependency on 512 byte sectors.

Instead of making requirements about sector-level atomicity on power
loss, use a double-write strategy.  We write out two copies of the
control file data with a synchronization barrier in between.  Only one
of them can be torn by power loss, on an overwrite file system.

XXX  WIP
XXX  pg_filenode.map has the same problem

Discussion: https://postgr.es/m/17064-bb0d7904ef72add3%40postgresql.org
---
 src/backend/access/transam/xlog.c             | 118 +++++++++++++++---
 src/common/controldata_utils.c                |  81 ++++++------
 src/include/catalog/pg_control.h              |   8 --
 .../recovery/t/026_corrupted_control_file.pl  |  94 ++++++++++++++
 4 files changed, 236 insertions(+), 65 deletions(-)
 create mode 100644 src/test/recovery/t/026_corrupted_control_file.pl

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 1b3a3d9bea..3442d5d95a 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -4660,10 +4660,8 @@ WriteControlFile(void)
 	 * Ensure that the size of the pg_control data structure is sane.  See the
 	 * comments for these symbols in pg_control.h.
 	 */
-	StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE,
-					 "pg_control is too large for atomic disk writes");
-	StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE,
-					 "sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE");
+	StaticAssertStmt(sizeof(ControlFileData) * 2 <= PG_CONTROL_FILE_SIZE,
+					 "sizeof(ControlFileData) * 2 exceeds PG_CONTROL_FILE_SIZE");
 
 	/*
 	 * Initialize version and compatibility-check fields
@@ -4704,6 +4702,14 @@ WriteControlFile(void)
 	memset(buffer, 0, PG_CONTROL_FILE_SIZE);
 	memcpy(buffer, ControlFile, sizeof(ControlFileData));
 
+	/*
+	 * Make second copy of the control file data.  No need to synchronize the
+	 * file in between copies (as UpdateControlFile() does), because this path
+	 * is used during bootstrapping only.
+	 */
+	memcpy(buffer + sizeof(ControlFileData),
+		   ControlFile, sizeof(ControlFileData));
+
 	fd = BasicOpenFile(XLOG_CONTROL_FILE,
 					   O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
 	if (fd < 0)
@@ -4744,6 +4750,8 @@ WriteControlFile(void)
 static void
 ReadControlFile(void)
 {
+	ControlFileData	control_file_array[2];
+	bool		crc_good[2];
 	pg_crc32c	crc;
 	int			fd;
 	static char wal_segsz_str[20];
@@ -4761,8 +4769,8 @@ ReadControlFile(void)
 						XLOG_CONTROL_FILE)));
 
 	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
-	r = read(fd, ControlFile, sizeof(ControlFileData));
-	if (r != sizeof(ControlFileData))
+	r = read(fd, control_file_array, sizeof(control_file_array));
+	if (r != sizeof(control_file_array))
 	{
 		if (r < 0)
 			ereport(PANIC,
@@ -4773,12 +4781,98 @@ ReadControlFile(void)
 			ereport(PANIC,
 					(errcode(ERRCODE_DATA_CORRUPTED),
 					 errmsg("could not read file \"%s\": read %d of %zu",
-							XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
+							XLOG_CONTROL_FILE, r, sizeof(control_file_array))));
 	}
 	pgstat_report_wait_end();
 
 	close(fd);
 
+	/*
+	 * We read two copies.  Usually they are identical, but they might not be
+	 * if there was a crash while UpdateControlFile() was writing them out.
+	 * They were written with a synchronization barrier between them, so we can
+	 * assume that at least one of them was written out fully, without having
+	 * to make any assumptions about things like sectors and atomicity on
+	 * overwrite filesystems.
+	 */
+	for (int i = 0; i < 2; ++i)
+	{
+		INIT_CRC32C(crc);
+		COMP_CRC32C(crc, &control_file_array[i],
+					offsetof(ControlFileData, crc));
+		FIN_CRC32C(crc);
+		crc_good[i] = EQ_CRC32C(crc, control_file_array[i].crc);
+	}
+
+	if (crc_good[0])
+	{
+		/* The first copy is good, so that's the one we'll use to start up. */
+		*ControlFile = control_file_array[0];
+
+		/*
+		 * There is a hazard if the second copy is corrupted, though:
+		 * UpdateControlFile() writes first then second.  If we leave the
+		 * second copy corrupted now, another power loss event could corrupt
+		 * the first copy, leaving no good copies on the disk.  So, if the
+		 * second copy is corrupted, fix it now.
+		 */
+		if (!crc_good[1])
+		{
+			elog(WARNING, "incorrect checksum for second copy of control data, correcting");
+			fd = BasicOpenFile(XLOG_CONTROL_FILE, O_RDWR | PG_BINARY);
+			if (fd < 0)
+				ereport(PANIC,
+					(errcode_for_file_access(),
+					 errmsg("could not open file \"%s\": %m",
+							XLOG_CONTROL_FILE)));
+			r = pg_pwrite(fd, ControlFile, sizeof(*ControlFile),
+						  sizeof(*ControlFile));
+			if (r != sizeof(*ControlFile))
+			{
+				if (r < 0)
+					ereport(PANIC,
+							(errcode_for_file_access(),
+							 errmsg("could not write file \"%s\": %m",
+									XLOG_CONTROL_FILE)));
+				else
+					ereport(PANIC,
+							(errcode(ERRCODE_DATA_CORRUPTED),
+							 errmsg("could not write file \"%s\": wrote %d of %zu",
+									XLOG_CONTROL_FILE, r, sizeof(*ControlFile))));
+			}
+			if (pg_fsync(openLogFile) != 0)
+				ereport(PANIC,
+						(errcode_for_file_access(),
+						 errmsg("could not fsync file \"%s\": %m",
+								XLOG_CONTROL_FILE)));
+			close(fd);
+		}
+	}
+	else if (crc_good[1])
+	{
+		/* The second copy is good, so that's the one we'll use to start up. */
+		*ControlFile = control_file_array[1];
+		elog(WARNING, "incorrect checksum for first copy of control data, using second copy");
+
+		/*
+		 * We don't have to worry about correcting the corrupted first copy.
+		 * The next call to UpdateControlFile() will overwrite the first copy
+		 * first, so there won't be a risk of both copies being corrupted on
+		 * overwrite filesystems.
+		 */
+	}
+	else
+	{
+		/*
+		 * Both copies failed CRC checks, so we can't start up.  We'll use the
+		 * first one for the checks on file format version that we perform
+		 * next, because only its version is expected to be at the same offset
+		 * in all versions.
+		 */
+		*ControlFile = control_file_array[0];
+		elog(WARNING, "incorrect checksum for both copies of control data");
+	}
+
 	/*
 	 * Check for expected pg_control format version.  If this is wrong, the
 	 * CRC check will likely fail because we'll be checking the wrong number
@@ -4803,14 +4897,8 @@ ReadControlFile(void)
 						   ControlFile->pg_control_version, PG_CONTROL_VERSION),
 				 errhint("It looks like you need to initdb.")));
 
-	/* Now check the CRC. */
-	INIT_CRC32C(crc);
-	COMP_CRC32C(crc,
-				(char *) ControlFile,
-				offsetof(ControlFileData, crc));
-	FIN_CRC32C(crc);
-
-	if (!EQ_CRC32C(crc, ControlFile->crc))
+	/* Now report CRC failure. */
+	if (!crc_good[0] && !crc_good[1])
 		ereport(FATAL,
 				(errmsg("incorrect checksum in control file")));
 
diff --git a/src/common/controldata_utils.c b/src/common/controldata_utils.c
index 7d4af7881e..844f1639fa 100644
--- a/src/common/controldata_utils.c
+++ b/src/common/controldata_utils.c
@@ -44,6 +44,10 @@
  * Get controlfile values.  The result is returned as a palloc'd copy of the
  * control file data.
  *
+ * XXX Unlike the backend's ReadControlFile(), this function does not currently
+ * consider the second copy of ControlFileData and will simply report CRC check
+ * failure if the first copy is corrupted.
+ *
  * crc_ok_p can be used by the caller to see whether the CRC of the control
  * file data is correct.
  */
@@ -157,16 +161,13 @@ update_controlfile(const char *DataDir,
 				   ControlFileData *ControlFile, bool do_sync)
 {
 	int			fd;
-	char		buffer[PG_CONTROL_FILE_SIZE];
 	char		ControlFilePath[MAXPGPATH];
 
 	/*
 	 * Apply the same static assertions as in backend's WriteControlFile().
 	 */
-	StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE,
-					 "pg_control is too large for atomic disk writes");
-	StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE,
-					 "sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE");
+	StaticAssertStmt(sizeof(ControlFileData) * 2 <= PG_CONTROL_FILE_SIZE,
+					 "sizeof(ControlFileData) * 2 exceeds PG_CONTROL_FILE_SIZE");
 
 	/* Recalculate CRC of control file */
 	INIT_CRC32C(ControlFile->crc);
@@ -175,14 +176,6 @@ update_controlfile(const char *DataDir,
 				offsetof(ControlFileData, crc));
 	FIN_CRC32C(ControlFile->crc);
 
-	/*
-	 * Write out PG_CONTROL_FILE_SIZE bytes into pg_control by zero-padding
-	 * the excess over sizeof(ControlFileData), to avoid premature EOF related
-	 * errors when reading it.
-	 */
-	memset(buffer, 0, PG_CONTROL_FILE_SIZE);
-	memcpy(buffer, ControlFile, sizeof(ControlFileData));
-
 	snprintf(ControlFilePath, sizeof(ControlFilePath), "%s/%s", DataDir, XLOG_CONTROL_FILE);
 
 #ifndef FRONTEND
@@ -205,47 +198,51 @@ update_controlfile(const char *DataDir,
 	}
 #endif
 
-	errno = 0;
-#ifndef FRONTEND
-	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE);
-#endif
-	if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
+	/* Write out two copies with a sychronization barrier in between. */
+	for (int i = 0; i < 2; ++i)
 	{
-		/* if write didn't set errno, assume problem is no disk space */
-		if (errno == 0)
-			errno = ENOSPC;
-
-#ifndef FRONTEND
-		ereport(PANIC,
-				(errcode_for_file_access(),
-				 errmsg("could not write file \"%s\": %m",
-						ControlFilePath)));
-#else
-		pg_log_fatal("could not write file \"%s\": %m", ControlFilePath);
-		exit(EXIT_FAILURE);
-#endif
-	}
+		errno = 0;
 #ifndef FRONTEND
-	pgstat_report_wait_end();
+		pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE);
 #endif
+		if (write(fd, ControlFile, sizeof(*ControlFile)) != sizeof(*ControlFile))
+		{
+			/* if write didn't set errno, assume problem is no disk space */
+			if (errno == 0)
+				errno = ENOSPC;
 
-	if (do_sync)
-	{
 #ifndef FRONTEND
-		pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE);
-		if (pg_fsync(fd) != 0)
 			ereport(PANIC,
 					(errcode_for_file_access(),
-					 errmsg("could not fsync file \"%s\": %m",
+					 errmsg("could not write file \"%s\": %m",
 							ControlFilePath)));
-		pgstat_report_wait_end();
 #else
-		if (fsync(fd) != 0)
-		{
-			pg_log_fatal("could not fsync file \"%s\": %m", ControlFilePath);
+			pg_log_fatal("could not write file \"%s\": %m", ControlFilePath);
 			exit(EXIT_FAILURE);
+#endif
 		}
+#ifndef FRONTEND
+		pgstat_report_wait_end();
 #endif
+
+		if (do_sync)
+		{
+#ifndef FRONTEND
+			pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE);
+			if (pg_fsync(fd) != 0)
+				ereport(PANIC,
+						(errcode_for_file_access(),
+						 errmsg("could not fsync file \"%s\": %m",
+								ControlFilePath)));
+			pgstat_report_wait_end();
+#else
+			if (fsync(fd) != 0)
+			{
+				pg_log_fatal("could not fsync file \"%s\": %m", ControlFilePath);
+				exit(EXIT_FAILURE);
+			}
+#endif
+		}
 	}
 
 	if (close(fd) != 0)
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index e3f48158ce..24d98441d1 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -230,14 +230,6 @@ typedef struct ControlFileData
 	pg_crc32c	crc;
 } ControlFileData;
 
-/*
- * Maximum safe value of sizeof(ControlFileData).  For reliability's sake,
- * it's critical that pg_control updates be atomic writes.  That generally
- * means the active data can't be more than one disk sector, which is 512
- * bytes on common hardware.  Be very careful about raising this limit.
- */
-#define PG_CONTROL_MAX_SAFE_SIZE	512
-
 /*
  * Physical size of the pg_control file.  Note that this is considerably
  * bigger than the actually used size (ie, sizeof(ControlFileData)).
diff --git a/src/test/recovery/t/026_corrupted_control_file.pl b/src/test/recovery/t/026_corrupted_control_file.pl
new file mode 100644
index 0000000000..03a42c79d0
--- /dev/null
+++ b/src/test/recovery/t/026_corrupted_control_file.pl
@@ -0,0 +1,94 @@
+
+# Copyright (c) 2021, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+use PostgresNode;
+use TestLib;
+use Test::More;
+use Config;
+
+plan tests => 8;
+
+# find $pat in logfile of $node after $off-th byte
+# XXX: function copied from t/019_repslot_limit.pl
+sub find_in_log
+{
+    my ($node, $pat, $off) = @_;
+
+    $off = 0 unless defined $off;
+    my $log = TestLib::slurp_file($node->logfile);
+    return 0 if (length($log) <= $off);
+
+    $log = substr($log, $off);
+
+    return $log =~ m/$pat/;
+}
+
+my $psql_timeout = IPC::Run::timer(60);
+
+my $node = get_new_node('test');
+my $control_file = $node->data_dir . "/global/pg_control";
+$node->init();
+
+# We start up without warnings if there is no corruption.
+my $logstart = -s $node->logfile;
+$node->start();
+$node->stop();
+ok(!find_in_log($node, "incorrect checksum"),
+   "both copies ok #1");
+
+# Corrupt the first copy of the control data... we can still start up, using
+# the second copy.
+open(my $file, '+<', $control_file);
+print $file "................";
+close($file);
+$logstart = -s $node->logfile;
+$node->start();
+$node->stop();
+ok(find_in_log($node, "incorrect checksum for first copy of control data", $logstart),
+   "first copy bad");
+
+# There should be no corruption now.
+$logstart = -s $node->logfile;
+$node->start();
+$node->stop();
+ok(!find_in_log($node, "incorrect checksum", $logstart),
+   "both copies ok #2");
+
+# Corrupt the second copy of the control data.  We don't want to hard-code its
+# location in this test, so we'll just grab a chunk of bytes from the front of
+# the control file, and then search for the second instance of that string and
+# corrupt it.
+my $control_file_contents = slurp_file($control_file);
+my $header = substr $control_file_contents, 0, 64;
+my $index = rindex $control_file_contents, $header;
+ok($index > 0, "second copy exists");
+open($file, '+<', $control_file);
+seek($file, $index, 0);
+print $file "................";
+close($file);
+$logstart = -s $node->logfile;
+$node->start();
+$node->stop();
+ok(find_in_log($node, "incorrect checksum for second copy of control data, correcting", $logstart),
+   "second copy bad");
+
+# There should be no corruption now.
+$logstart = -s $node->logfile;
+$node->start();
+$node->stop();
+ok(!find_in_log($node, "incorrect checksum", $logstart),
+   "both copies ok #3");
+
+# Corrupt both copies.  Now we can't start up!
+my $control_file_size = -s $control_file;
+open($file, '+<', $control_file);
+print $file ("." x $control_file_size);
+close($file);
+$logstart = -s $node->logfile;
+$node->start(fail_ok => 1);
+ok(find_in_log($node, "incorrect checksum for both copies of control data", $logstart),
+   "both copies bad 1/2");
+ok(find_in_log($node, "FATAL:  database files are incompatible with server", $logstart),
+   "both copies bad 2/2");
-- 
2.30.2