From d78ae8fcac7195d2cc967bcde754d1ce1fa39321 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Wed, 27 Apr 2022 11:39:10 +1200
Subject: [PATCH] Fix pg_walinspect race against flush LSN.

Instability in the TAP test for pg_walinspect revealed that
pg_get_wal_records_info_till_end_of_wal(x) would try to decode all the
records beginning in the range [x, flush LSN), even though that might
include a partial record at the end of the range.  In that case, an
error would be reported by read_local_xlog_page_no_wait when it tried to
read past the flush LSN.  That caused a test failure only on a BF
animal that had been restarted recently, but could be expected to happen
in the wild quite easily depending on the alignment of various
parameters.

Adjust pg_walinspect the tolerate errors without messages, as a way to
discover the end of the WAL region to decode.

Discussion: https://postgr.es/m/Ymd/e5eeZMNAkrXo%40paquier.xyz
Discussion: https://postgr.es/m/111657.1650910309@sss.pgh.pa.us
---
 contrib/pg_walinspect/pg_walinspect.c | 51 +++++++++++----------------
 1 file changed, 21 insertions(+), 30 deletions(-)

diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index bf38863ff1..10507c0c0c 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -133,6 +133,7 @@ InitXLogReaderState(XLogRecPtr lsn, XLogRecPtr *first_record)
  * We guard against ordinary errors trying to read WAL that hasn't been
  * written yet by limiting end_lsn to the flushed WAL, but that can also
  * encounter errors if the flush pointer falls in the middle of a record.
+ * In that case we'll return NULL.
  */
 static XLogRecord *
 ReadNextXLogRecord(XLogReaderState *xlogreader, XLogRecPtr first_record)
@@ -149,11 +150,11 @@ ReadNextXLogRecord(XLogReaderState *xlogreader, XLogRecPtr first_record)
 					(errcode_for_file_access(),
 					 errmsg("could not read WAL at %X/%X: %s",
 							LSN_FORMAT_ARGS(first_record), errormsg)));
-		else
-			ereport(ERROR,
-					(errcode_for_file_access(),
-					 errmsg("could not read WAL at %X/%X",
-							LSN_FORMAT_ARGS(first_record))));
+
+		/*
+		 * With no error message, assume that read_local_xlog_page_no_wait
+		 * tried to read past the flush LSN and reported an error.
+		 */
 	}
 
 	return record;
@@ -246,7 +247,12 @@ pg_get_wal_record_info(PG_FUNCTION_ARGS)
 
 	xlogreader = InitXLogReaderState(lsn, &first_record);
 
-	(void) ReadNextXLogRecord(xlogreader, first_record);
+	if (!ReadNextXLogRecord(xlogreader, first_record))
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("could not read WAL at %X/%X",
+						LSN_FORMAT_ARGS(first_record))));
+
 
 	MemSet(values, 0, sizeof(values));
 	MemSet(nulls, 0, sizeof(nulls));
@@ -327,22 +333,14 @@ GetWALRecordsInfo(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
 	MemSet(values, 0, sizeof(values));
 	MemSet(nulls, 0, sizeof(nulls));
 
-	for (;;)
+	while (ReadNextXLogRecord(xlogreader, first_record) &&
+		   xlogreader->EndRecPtr <= end_lsn)
 	{
-		(void) ReadNextXLogRecord(xlogreader, first_record);
+		GetWALRecordInfo(xlogreader, xlogreader->currRecPtr, values, nulls,
+						 PG_GET_WAL_RECORDS_INFO_COLS);
 
-		if (xlogreader->EndRecPtr <= end_lsn)
-		{
-			GetWALRecordInfo(xlogreader, xlogreader->currRecPtr, values, nulls,
-							 PG_GET_WAL_RECORDS_INFO_COLS);
-
-			tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
-								 values, nulls);
-		}
-
-		/* if we read up to end_lsn, we're done */
-		if (xlogreader->EndRecPtr >= end_lsn)
-			break;
+		tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+							 values, nulls);
 
 		CHECK_FOR_INTERRUPTS();
 	}
@@ -555,17 +553,10 @@ GetWalStats(FunctionCallInfo fcinfo, XLogRecPtr start_lsn,
 
 	MemSet(&stats, 0, sizeof(stats));
 
-	for (;;)
+	while (ReadNextXLogRecord(xlogreader, first_record) &&
+		   xlogreader->EndRecPtr <= end_lsn)
 	{
-		(void) ReadNextXLogRecord(xlogreader, first_record);
-
-		if (xlogreader->EndRecPtr <= end_lsn)
-			XLogRecStoreStats(&stats, xlogreader);
-
-		/* if we read up to end_lsn, we're done */
-		if (xlogreader->EndRecPtr >= end_lsn)
-			break;
-
+		XLogRecStoreStats(&stats, xlogreader);
 		CHECK_FOR_INTERRUPTS();
 	}
 
-- 
2.35.1

