From 6cca54db773c07098bce4259c0eac95c67b5ab9e Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Tue, 23 Nov 2021 11:08:24 -0500 Subject: [PATCH v2 2/2] Fix corner-case failure to detect improper timeline switch. rescanLatestTimeLine() contains a guard against switching to a timeline that forked off from the current one prior to the current recovery point, but that guard does not work if the timeline switch occurs before the first WAL record (which must be the checkpoint record) is read. Without this patch, an improper timeline switch is therefore possible in such cases. This happens because rescanLatestTimeLine() relies on the global variable EndRecPtr to understand the current position of WAL replay. However, EndRecPtr at this point in the code contains the endpoint of the last-replayed record, not the startpoint or endpoint of the record being replayed now. Thus, before any records have been replayed, it's zero, which causes the sanity check to always pass. To fix, pass down the correct replay LSN explicitly. The EndRecPtr value we want is the one from the xlogreader, which will be the starting position of the record we're about to try to read, rather than the global variable, which is the ending position of the last record we successfully read. They're usually the same, but not in the corner case described here. Patch by me, reviewed by Amul Sul. 
Discussion: http://postgr.es/m/CA+Tgmoao96EuNeSPd+hspRKcsCddu=b1h-QNRuKfY8VmfNQdfg@mail.gmail.com --- src/backend/access/transam/xlog.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 1616448368..c4b822d98e 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -924,7 +924,8 @@ static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *readBuf); static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, bool fetching_ckpt, XLogRecPtr tliRecPtr, - TimeLineID replayTLI); + TimeLineID replayTLI, + XLogRecPtr replayLSN); static void XLogShutdownWalRcv(void); static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr); static void XLogFileClose(void); @@ -946,7 +947,8 @@ static bool PerformRecoveryXLogAction(void); static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int whichChkpt, bool report, TimeLineID replayTLI); -static bool rescanLatestTimeLine(TimeLineID replayTLI); +static bool rescanLatestTimeLine(TimeLineID replayTLI, + XLogRecPtr replayLSN); static void InitControlFile(uint64 sysidentifier); static void WriteControlFile(void); static void ReadControlFile(void); @@ -4620,7 +4622,7 @@ ReadRecord(XLogReaderState *xlogreader, int emode, * one and returns 'true'. */ static bool -rescanLatestTimeLine(TimeLineID replayTLI) +rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN) { List *newExpectedTLEs; bool found; @@ -4671,13 +4673,13 @@ rescanLatestTimeLine(TimeLineID replayTLI) * next timeline was forked off from it *after* the current recovery * location. 
*/ - if (currentTle->end < EndRecPtr) + if (currentTle->end < replayLSN) { ereport(LOG, (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X", newtarget, replayTLI, - LSN_FORMAT_ARGS(EndRecPtr)))); + LSN_FORMAT_ARGS(replayLSN)))); return false; } @@ -12471,7 +12473,8 @@ retry: private->randAccess, private->fetching_ckpt, targetRecPtr, - private->replayTLI)) + private->replayTLI, + xlogreader->EndRecPtr)) { if (readFile >= 0) close(readFile); @@ -12624,6 +12627,10 @@ next_record_is_invalid: * 'tliRecPtr' is the position of the WAL record we're interested in. It is * used to decide which timeline to stream the requested WAL from. * + * 'replayLSN' is the current replay LSN, so that if we scan for new + * timelines, we can reject a switch to a timeline that branched off before + * this point. + * * If the record is not immediately available, the function returns false * if we're not in standby mode. In standby mode, waits for it to become * available. @@ -12636,7 +12643,7 @@ next_record_is_invalid: static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, bool fetching_ckpt, XLogRecPtr tliRecPtr, - TimeLineID replayTLI) + TimeLineID replayTLI, XLogRecPtr replayLSN) { static TimestampTz last_fail_time = 0; TimestampTz now; @@ -12759,7 +12766,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, */ if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST) { - if (rescanLatestTimeLine(replayTLI)) + if (rescanLatestTimeLine(replayTLI, replayLSN)) { currentSource = XLOG_FROM_ARCHIVE; break; @@ -12886,7 +12893,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, */ if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST) - rescanLatestTimeLine(replayTLI); + rescanLatestTimeLine(replayTLI, replayLSN); startWalReceiver = true; } -- 2.24.3 (Apple Git-128)