Add checks in pg_rewind to abort if backup_label file is present

Started by Krishnakumar Rabout 2 years ago2 messages
#1Krishnakumar R
kksrcv001@gmail.com
1 attachment(s)

Hi,

Please find the attached patch for $subject and associated test. Please review.

--
Thanks and Regards,
Krishnakumar (KK).
[Microsoft]

Attachments:

v1-0001-Add-checks-in-pg_rewind-to-abort-if-backup_label-.patchtext/x-patch; charset=US-ASCII; name=v1-0001-Add-checks-in-pg_rewind-to-abort-if-backup_label-.patchDownload
From 80ad7293b57a2b346b0775b8f6e0e06198617154 Mon Sep 17 00:00:00 2001
From: "Krishnakumar R (KK)" <kksrcv001@gmail.com>
Date: Tue, 5 Dec 2023 02:36:32 -0800
Subject: [PATCH v1] Add checks in pg_rewind to abort if backup_label file is
 found in either source or target cluster. Append rewind tap tests to verify
 the check.

---
 src/bin/pg_rewind/file_ops.c             | 17 +++++++++
 src/bin/pg_rewind/file_ops.h             |  2 +-
 src/bin/pg_rewind/libpq_source.c         | 22 ++++++++++++
 src/bin/pg_rewind/local_source.c         | 14 ++++++++
 src/bin/pg_rewind/pg_rewind.c            |  6 +++-
 src/bin/pg_rewind/rewind_source.h        |  5 +++
 src/bin/pg_rewind/t/001_basic.pl         | 46 ++++++++++++++++++++++--
 src/test/perl/PostgreSQL/Test/Cluster.pm | 31 ++++++++++++++++
 8 files changed, 138 insertions(+), 5 deletions(-)

diff --git a/src/bin/pg_rewind/file_ops.c b/src/bin/pg_rewind/file_ops.c
index dd22685932..d4568e7d09 100644
--- a/src/bin/pg_rewind/file_ops.c
+++ b/src/bin/pg_rewind/file_ops.c
@@ -299,6 +299,23 @@ sync_target_dir(void)
 	sync_pgdata(datadir_target, PG_VERSION_NUM, sync_method);
 }
 
+/*
+ * Check if a file is present using the stat function. Return 'true'
+ * if present and 'false' if not.
+ */
+bool
+is_file_present(const char *datadir, const char *path)
+{
+	struct stat s;
+	char		fullpath[MAXPGPATH];
+
+	snprintf(fullpath, sizeof(fullpath), "%s/%s", datadir, path);
+
+	if (stat(fullpath, &s) < 0)
+		return false;
+
+	return true;
+}
 
 /*
  * Read a file into memory. The file to be read is <datadir>/<path>.
diff --git a/src/bin/pg_rewind/file_ops.h b/src/bin/pg_rewind/file_ops.h
index 427cf8e0b5..c063175fac 100644
--- a/src/bin/pg_rewind/file_ops.h
+++ b/src/bin/pg_rewind/file_ops.h
@@ -20,7 +20,7 @@ extern void truncate_target_file(const char *path, off_t newsize);
 extern void create_target(file_entry_t *entry);
 extern void remove_target(file_entry_t *entry);
 extern void sync_target_dir(void);
-
+extern bool is_file_present(const char *datadir, const char *path);
 extern char *slurpFile(const char *datadir, const char *path, size_t *filesize);
 
 typedef void (*process_file_callback_t) (const char *path, file_type_t type, size_t size, const char *link_target);
diff --git a/src/bin/pg_rewind/libpq_source.c b/src/bin/pg_rewind/libpq_source.c
index 417c74cfef..66ed6e156d 100644
--- a/src/bin/pg_rewind/libpq_source.c
+++ b/src/bin/pg_rewind/libpq_source.c
@@ -61,6 +61,7 @@ static void appendArrayEscapedString(StringInfo buf, const char *str);
 static void process_queued_fetch_requests(libpq_source *src);
 
 /* public interface functions */
+static bool libpq_is_file_present(rewind_source *source, const char *path);
 static void libpq_traverse_files(rewind_source *source,
 								 process_file_callback_t callback);
 static void libpq_queue_fetch_file(rewind_source *source, const char *path, size_t len);
@@ -87,6 +88,7 @@ init_libpq_source(PGconn *conn)
 
 	src = pg_malloc0(sizeof(libpq_source));
 
+	src->common.is_file_present = libpq_is_file_present;
 	src->common.traverse_files = libpq_traverse_files;
 	src->common.fetch_file = libpq_fetch_file;
 	src->common.queue_fetch_file = libpq_queue_fetch_file;
@@ -627,6 +629,26 @@ appendArrayEscapedString(StringInfo buf, const char *str)
 	appendStringInfoCharMacro(buf, '\"');
 }
 
+/*
+ * Check if a file is present using the connection to the
+ * database.
+ */
+static bool
+libpq_is_file_present(rewind_source *source, const char *path)
+{
+	PGconn	   *conn = ((libpq_source *) source)->conn;
+	PGresult   *res;
+	const char *paramValues[1];
+
+	paramValues[0] = path;
+	res = PQexecParams(conn, "SELECT pg_stat_file($1)",
+					   1, NULL, paramValues, NULL, NULL, 1);
+	if (PQresultStatus(res) != PGRES_TUPLES_OK)
+		return false;
+
+	return true;
+}
+
 /*
  * Fetch a single file as a malloc'd buffer.
  */
diff --git a/src/bin/pg_rewind/local_source.c b/src/bin/pg_rewind/local_source.c
index 9bd43cba74..0d6bf403b4 100644
--- a/src/bin/pg_rewind/local_source.c
+++ b/src/bin/pg_rewind/local_source.c
@@ -25,6 +25,7 @@ typedef struct
 	const char *datadir;		/* path to the source data directory */
 } local_source;
 
+static bool local_is_file_present(rewind_source *source, const char *path);
 static void local_traverse_files(rewind_source *source,
 								 process_file_callback_t callback);
 static char *local_fetch_file(rewind_source *source, const char *path,
@@ -43,6 +44,7 @@ init_local_source(const char *datadir)
 
 	src = pg_malloc0(sizeof(local_source));
 
+	src->common.is_file_present = local_is_file_present;
 	src->common.traverse_files = local_traverse_files;
 	src->common.fetch_file = local_fetch_file;
 	src->common.queue_fetch_file = local_queue_fetch_file;
@@ -62,6 +64,18 @@ local_traverse_files(rewind_source *source, process_file_callback_t callback)
 	traverse_datadir(((local_source *) source)->datadir, callback);
 }
 
+/*
+ * Check if a file is present. It calls the file_ops is_file_present
+ * to check.
+ */
+static bool
+local_is_file_present(rewind_source *source, const char *path)
+{
+	const char *datadir = ((local_source *) source)->datadir;
+
+	return is_file_present(datadir, path);
+}
+
 static char *
 local_fetch_file(rewind_source *source, const char *path, size_t *filesize)
 {
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index f55ef4f2f8..8dbc0eeafb 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -729,7 +729,11 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
 static void
 sanityChecks(void)
 {
-	/* TODO Check that there's no backup_label in either cluster */
+	if (source->is_file_present(source, "backup_label"))
+		pg_fatal("The backup_label file is present in source cluster");
+
+	if (is_file_present(datadir_target, "backup_label"))
+		pg_fatal("The backup_label file is present in target  cluster");
 
 	/* Check system_identifier match */
 	if (ControlFile_target.system_identifier != ControlFile_source.system_identifier)
diff --git a/src/bin/pg_rewind/rewind_source.h b/src/bin/pg_rewind/rewind_source.h
index 98af3b58ee..39f9fe33cc 100644
--- a/src/bin/pg_rewind/rewind_source.h
+++ b/src/bin/pg_rewind/rewind_source.h
@@ -22,6 +22,11 @@
 
 typedef struct rewind_source
 {
+	/*
+	 * Check if a file is present. Return true if present, false otherwise.
+	 */
+	bool		(*is_file_present) (struct rewind_source *, const char *path);
+
 	/*
 	 * Traverse all files in the source data directory, and call 'callback' on
 	 * each file.
diff --git a/src/bin/pg_rewind/t/001_basic.pl b/src/bin/pg_rewind/t/001_basic.pl
index c7b48255a7..a79a5154d0 100644
--- a/src/bin/pg_rewind/t/001_basic.pl
+++ b/src/bin/pg_rewind/t/001_basic.pl
@@ -91,14 +91,54 @@ sub run_test
 	standby_psql(
 		"INSERT INTO space_tbl VALUES ('in standby, after promotion')");
 
+	my $primary_pgdata = $node_primary->data_dir;
+	my $standby_pgdata = $node_standby->data_dir;
+
+	# Create mock backup_label in both standby and primary
+	# and ensure pg rewind errors
+	$node_primary->create_mock_backup_label;
+	$node_standby->create_mock_backup_label;	
+	command_fails(
+		[
+			'pg_rewind', '--debug',
+			'--source-pgdata', $standby_pgdata,
+			'--target-pgdata', $primary_pgdata,
+			'--no-sync', '--dry-run'
+		],
+		'pg_rewind with unexpected backup_label on primary & standby');
+	
+	# Remove the backup file in primary and check if pg_rewind 
+	# errors because of the backup label in standby.
+	$node_primary->rm_mock_backup_label; 
+	command_fails(
+		[
+			'pg_rewind', '--debug',
+			'--source-pgdata', $standby_pgdata,
+			'--target-pgdata', $primary_pgdata,
+			'--no-sync', '--dry-run'
+		],
+		'pg_rewind with unexpected backup_label on standby');
+	$node_standby->rm_mock_backup_label; 
+
+	# Add back the primary backup_file and confirm the pg_rewind
+	# does not start.
+	$node_primary->create_mock_backup_label;
+	command_fails(
+		[
+			'pg_rewind', '--debug',
+			'--source-pgdata', $standby_pgdata,
+			'--target-pgdata', $primary_pgdata,
+			'--no-sync', '--dry-run'
+		],
+		'pg_rewind with unexpected backup_label on primary');
+	$node_primary->rm_mock_backup_label; 
+	
 	# Before running pg_rewind, do a couple of extra tests with several
 	# option combinations.  As the code paths taken by those tests
 	# do not change for the "local" and "remote" modes, just run them
 	# in "local" mode for simplicity's sake.
 	if ($test_mode eq 'local')
 	{
-		my $primary_pgdata = $node_primary->data_dir;
-		my $standby_pgdata = $node_standby->data_dir;
 
 		# First check that pg_rewind fails if the target cluster is
 		# not stopped as it fails to start up for the forced recovery
@@ -141,7 +181,7 @@ sub run_test
 		# with --dry-run mode.  If anything gets generated in the data
 		# folder, the follow-up run of pg_rewind will most likely fail,
 		# so keep this test as the last one of this subset.
-		$node_standby->stop;
+		$node_standby->stop;		
 		command_ok(
 			[
 				'pg_rewind', '--debug',
diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm
index a020377761..72bcfa9895 100644
--- a/src/test/perl/PostgreSQL/Test/Cluster.pm
+++ b/src/test/perl/PostgreSQL/Test/Cluster.pm
@@ -3090,6 +3090,37 @@ sub pg_recvlogical_upto
 
 =pod
 
+=item $node->create_mock_backup_label(self)
+
+Create a mock backup_label file to simulate a real life scenario
+where pg_rewind is attempted on a cluster with backup_label.
+
+=cut
+
+sub create_mock_backup_label
+{
+	my ($self) = @_;
+	my $label_file = $self->data_dir . "/backup_label";
+	system("touch $label_file")
+}
+
+=pod
+
+=item $node->create_mock_backup_label(self)
+
+Remove the mock backup_label file from the cluster.
+
+=cut
+
+sub rm_mock_backup_label
+{
+	my ($self) = @_;
+	my $label_file = $self->data_dir . "/backup_label";
+	unlink($label_file)
+}
+
+=pod
+
 =item $node->corrupt_page_checksum(self, file, page_offset)
 
 Intentionally corrupt the checksum field of one page in a file.
-- 
2.40.1

#2Heikki Linnakangas
hlinnaka@iki.fi
In reply to: Krishnakumar R (#1)
Re: Add checks in pg_rewind to abort if backup_label file is present

On 05/12/2023 19:36, Krishnakumar R wrote:

Hi,

Please find the attached patch for $subject and associated test. Please review.

Thanks for picking up this long-standing TODO!

+/*
+ * Check if a file is present using the connection to the
+ * database.
+ */
+static bool
+libpq_is_file_present(rewind_source *source, const char *path)
+{
+	PGconn	   *conn = ((libpq_source *) source)->conn;
+	PGresult   *res;
+	const char *paramValues[1];
+
+	paramValues[0] = path;
+	res = PQexecParams(conn, "SELECT pg_stat_file($1)",
+					   1, NULL, paramValues, NULL, NULL, 1);
+	if (PQresultStatus(res) != PGRES_TUPLES_OK)
+		return false;
+
+	return true;
+}

The backup_label file cannot be present when the server is running. No
need to check for that when connected to a live server.

--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -729,7 +729,11 @@ perform_rewind(filemap_t *filemap, rewind_source *source,
static void
sanityChecks(void)
{
-	/* TODO Check that there's no backup_label in either cluster */
+	if (source->is_file_present(source, "backup_label"))
+		pg_fatal("The backup_label file is present in source cluster");
+
+	if (is_file_present(datadir_target, "backup_label"))
+		pg_fatal("The backup_label file is present in target  cluster");

/* Check system_identifier match */
if (ControlFile_target.system_identifier != ControlFile_source.system_identifier)

The error message isn't very user friendly. It's pretty dangerous
actually: I think a lot of users would just delete the backup_label file
when they see that message. Because then the file is no longer present
and problem solved, right?

The point of checking for backup_label is that if it's present, the
cluster wasn't really shut down cleanly. The correct fix is to start it,
let WAL recovery finish, and shut it down again. The error message
should make that clear. Perhaps make it similar to the existing "target
server must be shut down cleanly" message.

I think today if you try to run pg_rewind on a cluster that was restored
from a backup, so that backup_label is present, you get the "target
server must be shut down cleanly" message. But we don't have any tests
for it. We do have a test for when the server is still running, but not
for the restored-from-backup case. Would be nice to add one.

Perhaps move the backup_label check later in sanityChecks(), after
checking the state in the control file. That way, you still normally hit
the "target server must be shut down cleanly" case, and the backup_label
check would be just an additional "can't happen" sanity check.

In createBackupLabel() we have this:

/* TODO: move old file out of the way, if any. */
open_target_file("backup_label", true); /* BACKUP_LABEL_FILE */
write_target_range(buf, 0, len);
close_target_file();

That TODO comment needs to go away now. And we probably should complain
if the file already exists. With your patch, we already checked earlier
that it doesn't exists, so if it exists when we reach that code,
something's gone wrong.

--
Heikki Linnakangas
Neon (https://neon.tech)