From aa57f9f6d21176aab6b3e520b4eb0473b8b4a609 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Fri, 9 Aug 2024 16:50:58 +0300
Subject: [PATCH 3/4] Don't read past current TLI during archive recovery

If you set the recovery target timeline, the recovery would not
correctly follow the timeline history to reach that timeline, if a
local WAL segment had more WAL records on one of the older
timelines. Fix that by not reading WAL past the switchpoint in the
timeline history.

This case came up with pg_rewind: if you call pg_rewind on a server
that is a direct ancestor of the source, it exits with "no rewind
required" and does nothing. However, if the target had already
streamed WAL past the point of divergence, and just hadn't applied it
yet, when you start it up it will follow the original timeline, not
the one you tried to rewind to. This commit alone doesn't fix that,
because the server still won't know that it's supposed to follow the
other timeline. But this at least fixes the similar case when you have
set the recovery target timeline explicitly.

Discussion: https://www.postgresql.org/message-id/18575-1b9b3b60e2a480de%40postgresql.org
---
 src/backend/access/transam/xlogrecovery.c     | 81 +++++++++++++--
 .../recovery/t/044_change_recovery_target.pl  | 98 +++++++++++++++++++
 2 files changed, 171 insertions(+), 8 deletions(-)
 create mode 100644 src/test/recovery/t/044_change_recovery_target.pl

diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index ad817fbca6..d3d1774d26 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -117,12 +117,20 @@ bool		wal_receiver_create_temp_slot = false;
  * scanning data that was copied from an ancestor timeline when the current
  * file was created.)  During a sequential scan we do not allow this value
  * to decrease.
+ *
+ * curFileUpto: the end of the curFileTLI timeline in the history of the
+ * timeline we're recovering to (ie. recoveryTargetTLI).  We must not read the
+ * currently open file beyond this position, because it contains WAL that is
+ * not part of the history of the timeline we're recovering to.  Upon reaching
+ * that point, we need to open the file from the next timeline in the history
+ * instead.
  */
 RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
 TimeLineID	recoveryTargetTLIRequested = 0;
 TimeLineID	recoveryTargetTLI = 0;
 static List *expectedTLEs;
 static TimeLineID curFileTLI;
+static XLogRecPtr curFileUpto;
 
 /*
  * When ArchiveRecoveryRequested is set, archive recovery was requested,
@@ -3315,6 +3323,7 @@ XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
 	int			emode = private->emode;
 	uint32		targetPageOff;
 	XLogSegNo	targetSegNo PG_USED_FOR_ASSERTS_ONLY;
+	XLogRecPtr	readUpto;
 	int			r;
 
 	XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
@@ -3346,9 +3355,36 @@ XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
 		readSource = XLOG_FROM_ANY;
 	}
 
+retry:
+	/*
+	 * If we have reached the end of the current timeline in the recovery
+	 * target timeline's history, need to switch to the file from the next
+	 * timeline.
+	 *
+	 * Note: The logic and purpose of this is similar to
+	 * XLogReadDetermineTimeline(), but we don't use the end-of-segment to
+	 * determine the next TLI like XLogReadDetermineTimeline() does.  We don't
+	 * need it because XLogFileReadAnyTLI() will try to read from the WAL
+	 * segment with the latest possible TLI.  This is more flexible than
+	 * XLogReadDetermineTimeline(), which makes it more likely that we'll make
+	 * some progress towards the recovery target, even if all the WAL is not
+	 * (yet) available.
+	 */
+	if (curFileUpto != InvalidXLogRecPtr &&
+		curFileUpto <= targetPagePtr + reqLen)
+	{
+		if (readFile >= 0)
+		{
+			close(readFile);
+			readFile = -1;
+		}
+		curFileTLI = tliOfPointInHistory(targetPagePtr + reqLen, expectedTLEs);
+		curFileUpto = tliSwitchPoint(curFileTLI, expectedTLEs, NULL);
+		readSource = XLOG_FROM_ANY;
+	}
+
 	XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
 
-retry:
 	/* See if we need to retrieve more data */
 	if (readFile < 0 ||
 		(readSource == XLOG_FROM_STREAM &&
@@ -3382,28 +3418,45 @@ retry:
 		}
 	}
 
+	/*
+	 * If the recovery target TLI changed during WaitForWALToBecomeAvailable,
+	 * the record we're looking for might not be on this file anymore.
+	 */
+	 if (curFileUpto != InvalidXLogRecPtr &&
+		 curFileUpto <= targetPagePtr + reqLen)
+		 goto retry;
+
 	/*
 	 * At this point, we have the right segment open and if we're streaming we
 	 * know the requested record is in it.
 	 */
 	Assert(readFile != -1);
 
+	/*
+	 * If we're recovering through old timelines, make sure we don't read
+	 * beyond the point where we're supposed to switch to next timeline in the
+	 * recovery target's history.
+	 */
+	readUpto = curFileUpto;
+
 	/*
 	 * If the current segment is being streamed from the primary, calculate
 	 * how much of the current page we have received already. We know the
 	 * requested record has been received, but this is for the benefit of
 	 * future calls, to allow quick exit at the top of this function.
 	 */
-	if (readSource == XLOG_FROM_STREAM)
+	if (readSource == XLOG_FROM_STREAM &&
+		(readUpto == InvalidXLogRecPtr || flushedUpto < readUpto))
 	{
-		if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
-			readLen = XLOG_BLCKSZ;
-		else
-			readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
-				targetPageOff;
+		readUpto = flushedUpto;
 	}
-	else
+
+	if (readUpto == InvalidXLogRecPtr ||
+		((targetPagePtr) / XLOG_BLCKSZ) != (readUpto / XLOG_BLCKSZ))
 		readLen = XLOG_BLCKSZ;
+	else
+		readLen = XLogSegmentOffset(readUpto, wal_segment_size) -
+			targetPageOff;
 
 	/* Read the requested page */
 	readOff = targetPageOff;
@@ -3773,7 +3826,10 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 				}
 				/* Reset curFileTLI if random fetch. */
 				if (randAccess)
+				{
 					curFileTLI = 0;
+					curFileUpto = InvalidXLogRecPtr;
+				}
 
 				/*
 				 * Try to restore the file from archive, or read an existing
@@ -3837,6 +3893,9 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 						XLogRecPtr	ptr;
 						TimeLineID	tli;
 
+						if (!expectedTLEs)
+							expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
+
 						if (fetching_ckpt)
 						{
 							ptr = RedoStartLSN;
@@ -3858,6 +3917,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 									 tli, curFileTLI);
 						}
 						curFileTLI = tli;
+						curFileUpto = tliSwitchPoint(tli, expectedTLEs, NULL);
 						SetInstallXLogFileSegmentActive();
 						RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
 											 PrimarySlotName,
@@ -4184,6 +4244,8 @@ rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN)
 	 * between the old target and new target in pg_wal.
 	 */
 	restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
+	if (curFileTLI != 0)
+		curFileUpto = tliSwitchPoint(curFileTLI, expectedTLEs, NULL);
 
 	ereport(LOG,
 			(errmsg("new target timeline is %u",
@@ -4254,6 +4316,9 @@ XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
 	{
 		/* Success! */
 		curFileTLI = tli;
+		if (!expectedTLEs)
+			expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
+		curFileUpto = tliSwitchPoint(curFileTLI, expectedTLEs, NULL);
 
 		/* Report recovery progress in PS display */
 		snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
diff --git a/src/test/recovery/t/044_change_recovery_target.pl b/src/test/recovery/t/044_change_recovery_target.pl
new file mode 100644
index 0000000000..99aa126d6b
--- /dev/null
+++ b/src/test/recovery/t/044_change_recovery_target.pl
@@ -0,0 +1,98 @@
+
+# Copyright (c) 2021-2024, PostgreSQL Global Development Group
+
+# Test that recovery server follows the right path through the WAL
+# files towards the recovery target timeline. Consider the following
+# timeline history:
+#
+#    ---S---------------------->P   TLI 1
+#                     \
+#                      -------->    TLI 2
+#
+# Standby has replayed the WALL on TLI up to point S, and is currently
+# stopped. However, it already has all the WAL from TLI 1 locally in
+# pg_wal, because it was restored from th archive or streamed from the
+# primary on TLI 1 earlier. If you now set recovery target timeline to
+# TLI 2, the standby must not replay the WAL it already has on TLI 1,
+# beyond the point where they diverged.
+#
+# This test sets up that scenario, by pausing the replay on the
+# standby.
+
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+use File::Copy;
+
+my $primary1 = PostgreSQL::Test::Cluster->new('primary1');
+$primary1->init(allows_streaming => 1);
+$primary1->start;
+$primary1->backup("bkp");
+
+my $standby = PostgreSQL::Test::Cluster->new('standby');
+$standby->init_from_backup($primary1, "bkp", has_streaming => 1);
+$standby->start;
+
+# Pause the standby
+$standby->safe_psql('postgres', "SELECT pg_wal_replay_pause()");
+
+# Make some changes in the primary. They will be streamed to the
+# standby, but not yet applied.
+$primary1->safe_psql('postgres', 'CREATE TABLE test_tab1 (t TEXT)');
+$primary1->safe_psql('postgres', 'CREATE TABLE test_tab2 (t TEXT)');
+$primary1->safe_psql('postgres',
+	"INSERT INTO test_tab1 VALUES ('both timelines')");
+$primary1->safe_psql('postgres',
+	"INSERT INTO test_tab2 VALUES ('both timelines')");
+
+# Create timeline 2 at this point
+my $primary2 = PostgreSQL::Test::Cluster->new('primary2');
+$primary2->init_from_backup($primary1, "bkp", has_streaming => 1);
+$primary2->start;
+$primary1->wait_for_replay_catchup($primary2);
+$primary2->promote;
+
+# Make some changes on both timelines
+$primary1->safe_psql('postgres',
+	"INSERT INTO test_tab1 VALUES ('TLI 1 only')");
+
+$primary2->safe_psql('postgres',
+	"INSERT INTO test_tab2 VALUES ('TLI 2 only')");
+
+# Stop the standby, and re-point it to TLI 2 (primary2).
+$standby->stop;
+my $primary2_connstr = $primary2->connstr;
+$standby->append_conf('postgresql.conf',
+	"primary_conninfo = '$primary2_connstr'");
+$standby->append_conf('postgresql.conf',
+	"recovery_target_timeline = 2");
+
+# Copy the history file. Otherwise, the standby will just error out
+# with "FATAL: recovery target timeline 2 does not exist"
+#
+# XXX: Perhaps it should try to connect to the primary and fetch the
+# history file from there instead of erroring out. But it doesn't do
+# that today.
+copy($primary2->data_dir . '/pg_wal/00000002.history',
+	$standby->data_dir . '/pg_wal/00000002.history')
+  or BAIL_OUT("could not copy 00000002.history");
+
+# Check that it recovers correctly to TLI 2 when started up.
+$standby->start;
+$primary2->wait_for_replay_catchup($standby);
+
+my $result =
+  $standby->safe_psql('postgres', "SELECT * FROM test_tab1");
+is( $result, qq(both timelines),
+		"changes on TLI 1 are *not* on standby");
+
+$result =
+  $standby->safe_psql('postgres', "SELECT * FROM test_tab2");
+is( $result, qq(both timelines
+TLI 2 only),
+		"all changes on TLI 2 are on the standby");
+
+done_testing();
-- 
2.39.2