From 52eba6b6ef8d8fda078d3acd27b6ce7406078df8 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Date: Mon, 10 May 2021 16:57:24 +0900
Subject: [PATCH] Choose correct timeline when received historic timelines

When a standby starts streaming before determining expectedTLEs, it
determines the expected timeline list based on the timeline of the WAL
segment just streamed from the primary. If we fetched the last
checkpoint in the older timeline, this behavior prevents recovery from
proceeding. When primary streamed over a WAL file in a historical
timeline, use recoveryTargetTLI, which must be a history of the
primary.  There's one scenario where this makes a difference: suppose
two standbys are connected to a primary both by archive and
streaming. In the case where one of the standby promotes, then another
reconnects to the promoted standby before archiving the first segment
of the new timeline, and the content of pg_wal of the new standby is
removed before reconnection, the standby fetches the history file for
the new timeline but the first segment for the timeline is available
only via streaming.  In this case, the new standby thought that the
primary always sends newer timeline than the current recovery target
but that is not the case of this scenario.
---
 src/backend/access/transam/xlog.c          | 45 ++++++++++---
 src/test/recovery/t/004_timeline_switch.pl | 74 +++++++++++++++++++++-
 2 files changed, 108 insertions(+), 11 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index c1d4415a43..1ca55b7ec0 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -12656,22 +12656,47 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
 						 * info is set correctly and XLogReceiptTime isn't
 						 * changed.
 						 */
-						if (readFile < 0)
-						{
-							if (!expectedTLEs)
-								expectedTLEs = readTimeLineHistory(receiveTLI);
-							readFile = XLogFileRead(readSegNo, PANIC,
-													receiveTLI,
-													XLOG_FROM_STREAM, false);
-							Assert(readFile >= 0);
-						}
-						else
+
+						if (readFile >= 0)
 						{
 							/* just make sure source info is correct... */
 							readSource = XLOG_FROM_STREAM;
 							XLogReceiptSource = XLOG_FROM_STREAM;
 							return true;
 						}
+
+						readFile = XLogFileRead(readSegNo, PANIC,
+												receiveTLI,
+												XLOG_FROM_STREAM, false);
+						Assert(readFile >= 0);
+
+						if (expectedTLEs)
+							break;
+
+						/* initialize expectedTLEs */
+						expectedTLEs = readTimeLineHistory(receiveTLI);
+
+						if (!tliInHistory(recoveryTargetTLI, expectedTLEs))
+						{
+							/*
+							 * Received timeline is not the future of the
+							 * recovery target timeline. The upstream must have
+							 * instead sent a historic timeline of ours. This
+							 * can happen when this standby's pg_wal has been
+							 * cleaned up before restart but archive offered
+							 * the history file recovery target. Pick up the
+							 * history of the recovery target timeline.
+							 */
+							expectedTLEs =
+								readTimeLineHistory(recoveryTargetTLI);
+							if (!tliInHistory(receiveTLI, expectedTLEs))
+								ereport(FATAL,
+										(errcode(ERRCODE_DATA_CORRUPTED),
+										 errmsg ("the primary server has sent incompatible timeline %d with recovery target timeline %d",
+												 receiveTLI,
+												 recoveryTargetTLI)));
+						}
+
 						break;
 					}
 
diff --git a/src/test/recovery/t/004_timeline_switch.pl b/src/test/recovery/t/004_timeline_switch.pl
index c101980e9e..bca7ad165e 100644
--- a/src/test/recovery/t/004_timeline_switch.pl
+++ b/src/test/recovery/t/004_timeline_switch.pl
@@ -7,7 +7,7 @@ use warnings;
 use File::Path qw(rmtree);
 use PostgresNode;
 use TestLib;
-use Test::More tests => 3;
+use Test::More tests => 4;
 
 $ENV{PGDATABASE} = 'postgres';
 
@@ -109,3 +109,75 @@ $node_primary_2->wait_for_catchup($node_standby_3, 'replay',
 my $result_2 =
   $node_standby_3->safe_psql('postgres', "SELECT count(*) FROM tab_int");
 is($result_2, qq(1), 'check content of standby 3');
+
+#####
+# Check if standby can follow primary when the segment for the new
+# time line is not avaiable for the standby in archive.
+
+# setup two standby nodes, one of them archives WAL files
+$node_primary_2->psql('postgres', 'CREATE TABLE t (a int)');
+my $node_standby_4 = get_new_node('standby_3_1');
+$node_standby_4->init_from_backup($node_primary_2, $backup_name,
+	has_streaming => 1,	allows_streaming => 1, has_archiving => 1);
+my $archdir = $node_standby_4->archive_dir;
+
+# keep 3 segments
+my $keepsize = $node_primary_2->safe_psql('postgres',
+										  q[
+select setting from pg_settings where name = 'wal_segment_size';
+										  ]) * 3;
+$node_standby_4->append_conf('postgresql.conf', qq[
+wal_keep_size = $keepsize
+archive_mode = always
+archive_command = 'cp %p $archdir/%f'
+]);
+
+$node_standby_4->start;
+
+my $node_standby_5 = get_new_node('node_standby_5');
+$node_standby_5->init_from_backup($node_primary_2, $backup_name,
+	has_streaming => 1);
+$node_standby_5->start;
+
+$node_primary_2->psql('postgres', qq[
+					  SELECT pg_switch_wal();
+					  INSERT INTO t VALUES (0);
+					  CHECKPOINT;
+]);
+$node_primary_2->wait_for_catchup($node_standby_4, 'write',
+								$node_primary_2->lsn('insert'));
+$node_primary_2->wait_for_catchup($node_standby_5, 'write',
+								$node_primary_2->lsn('insert'));
+
+# disconnect the second standby then connect to the promoted first standby
+$node_standby_5->stop;
+$node_standby_4->psql('postgres', 'SELECT pg_promote()');
+$node_standby_5->enable_streaming($node_standby_4);
+$node_standby_5->append_conf('postgresql.conf',
+							 "restore_command = 'cp $archdir/%f %p'");
+
+# flush pg_wal on the second standby (new standby)
+my $pgwaldir = $node_standby_5->data_dir. "/pg_wal";
+opendir my $dh, $pgwaldir or die "failed to open $pgwaldir of node_standby_5";
+while (my $f = readdir($dh))
+{
+	unlink("$pgwaldir/$f") if (-f "$pgwaldir/$f");
+}
+closedir($dh);
+
+# restart the second standby, not just reloading to trigger archive recovery
+$node_standby_5->start;
+$node_standby_4->psql('postgres', qq[
+						INSERT INTO t VALUES(0);
+						SELECT pg_switch_wal();
+						INSERT INTO t VALUES(0); -- avoid segment border
+]);
+
+$node_standby_4->wait_for_catchup($node_standby_5, 'write',
+									$node_standby_4->lsn('insert'));
+
+# check if the change is correctly replicated
+my $result_3 =
+  $node_standby_4->safe_psql('postgres',
+							   'SELECT pg_current_wal_insert_lsn() = write_lsn FROM pg_stat_replication');
+is($result_3, 't', 'check receiving historic timeline from primary');
-- 
2.27.0

