Propagate XLogFindNextRecord error to callers

Started by Anthonin Bonnefoy, about 1 month ago, 1 message

anthonin.bonnefoy@datadoghq.com

about 1 month ago

1 attachment(s)

Hi,

Currently, XLogFindNextRecord's errormsg is ignored, and callers only
output a generic 'could not find a valid record' message without details.
Additionally, an invalid page header won't go through XLogReadRecord,
leaving the error in state->errormsg_buf.

This patch propagates XLogFindNextRecord's error message to the callers and
displays it. In case of an invalid page header, the errormsg is filled with
errormsg_buf content.

With this patch, pg_waldump will now have the following output when reading
a file with an invalid header:
pg_waldump: error: could not find a valid record after D80/5C000000:
invalid magic number D116 in WAL segment 0000001400000D8000000017, LSN
D80/5C000000, offset 0

Regards,
Anthonin Bonnefoy

Attachments:

v1-0001-Propage-errormsg-to-XLogFindNextRecord-caller.patch (application/octet-stream; name=v1-0001-Propage-errormsg-to-XLogFindNextRecord-caller.patch) [Download]

From f807ad4e9aaf5e68601f65482cfcd33eeaea5e0c Mon Sep 17 00:00:00 2001
From: Anthonin Bonnefoy <anthonin.bonnefoy@datadoghq.com>
Date: Fri, 12 Dec 2025 09:56:00 +0100
Subject: Propagate errormsg to XLogFindNextRecord caller

Currently, XLogFindNextRecord's errormsg is ignored, and callers only
output a generic 'could not find a valid record' message without
details.
Additionally, an invalid page header won't go through XLogReadRecord,
leaving the error in state->errormsg_buf.

This patch propagates XLogFindNextRecord's error message to the callers
and displays it. In case of an invalid page header, the errormsg is
filled with errormsg_buf content.
---
 contrib/pg_walinspect/pg_walinspect.c   | 16 ++++++++++++----
 src/backend/access/transam/xlogreader.c | 16 +++++++++++++---
 src/backend/postmaster/walsummarizer.c  | 17 ++++++++++++-----
 src/bin/pg_waldump/pg_waldump.c         | 12 +++++++++---
 src/include/access/xlogreader.h         |  2 +-
 5 files changed, 47 insertions(+), 16 deletions(-)

diff --git a/contrib/pg_walinspect/pg_walinspect.c b/contrib/pg_walinspect/pg_walinspect.c
index 6945bac1306..da997e4fd86 100644
--- a/contrib/pg_walinspect/pg_walinspect.c
+++ b/contrib/pg_walinspect/pg_walinspect.c
@@ -97,6 +97,7 @@ InitXLogReaderState(XLogRecPtr lsn)
 	XLogReaderState *xlogreader;
 	ReadLocalXLogPageNoWaitPrivate *private_data;
 	XLogRecPtr	first_valid_record;
+	char	   *errormsg;
 
 	/*
 	 * Reading WAL below the first page of the first segments isn't allowed.
@@ -124,12 +125,19 @@ InitXLogReaderState(XLogRecPtr lsn)
 				 errdetail("Failed while allocating a WAL reading processor.")));
 
 	/* first find a valid recptr to start from */
-	first_valid_record = XLogFindNextRecord(xlogreader, lsn);
+	first_valid_record = XLogFindNextRecord(xlogreader, lsn, &errormsg);
 
 	if (!XLogRecPtrIsValid(first_valid_record))
-		ereport(ERROR,
-				errmsg("could not find a valid record after %X/%08X",
-					   LSN_FORMAT_ARGS(lsn)));
+	{
+		if (errormsg)
+			ereport(ERROR,
+					errmsg("could not find a valid record after %X/%08X: %s",
+						   LSN_FORMAT_ARGS(lsn), errormsg));
+		else
+			ereport(ERROR,
+					errmsg("could not find a valid record after %X/%08X",
+						   LSN_FORMAT_ARGS(lsn)));
+	}
 
 	return xlogreader;
 }
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index 5e5001b2101..269c0ff7f47 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -1391,12 +1391,11 @@ XLogReaderResetError(XLogReaderState *state)
  * XLogReadRecord() will read the next valid record.
  */
 XLogRecPtr
-XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr)
+XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg)
 {
 	XLogRecPtr	tmpRecPtr;
 	XLogRecPtr	found = InvalidXLogRecPtr;
 	XLogPageHeader header;
-	char	   *errormsg;
 
 	Assert(XLogRecPtrIsValid(RecPtr));
 
@@ -1481,7 +1480,7 @@ XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr)
 	 * or we just jumped over the remaining data of a continuation.
 	 */
 	XLogBeginRead(state, tmpRecPtr);
-	while (XLogReadRecord(state, &errormsg) != NULL)
+	while (XLogReadRecord(state, errormsg) != NULL)
 	{
 		/* past the record we've found, break out */
 		if (RecPtr <= state->ReadRecPtr)
@@ -1496,6 +1495,17 @@ XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr)
 err:
 	XLogReaderInvalReadState(state);
 
+	/*
+	 * We may have reported errors due to an invalid WAL page header;
+	 * propagate the error message to the caller.
+	 */
+	if (state->errormsg_deferred)
+	{
+		if (state->errormsg_buf[0] != '\0')
+			*errormsg = state->errormsg_buf;
+		state->errormsg_deferred = false;
+	}
+
 	return InvalidXLogRecPtr;
 }
 
diff --git a/src/backend/postmaster/walsummarizer.c b/src/backend/postmaster/walsummarizer.c
index e7e4d652f97..3e7114b6942 100644
--- a/src/backend/postmaster/walsummarizer.c
+++ b/src/backend/postmaster/walsummarizer.c
@@ -918,6 +918,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
 	WalSummaryIO io;
 	BlockRefTable *brtab = CreateEmptyBlockRefTable();
 	bool		fast_forward = true;
+	char	   *errormsg;
 
 	/* Initialize private data for xlogreader. */
 	private_data = palloc0_object(SummarizerReadLocalXLogPrivate);
@@ -969,7 +970,7 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
 	}
 	else
 	{
-		summary_start_lsn = XLogFindNextRecord(xlogreader, start_lsn);
+		summary_start_lsn = XLogFindNextRecord(xlogreader, start_lsn, &errormsg);
 		if (!XLogRecPtrIsValid(summary_start_lsn))
 		{
 			/*
@@ -998,9 +999,16 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
 				switch_lsn = xlogreader->EndRecPtr;
 			}
 			else
-				ereport(ERROR,
-						errmsg("could not find a valid record after %X/%08X",
-							   LSN_FORMAT_ARGS(start_lsn)));
+			{
+				if (errormsg)
+					ereport(ERROR,
+							errmsg("could not find a valid record after %X/%08X: %s",
+								   LSN_FORMAT_ARGS(start_lsn), errormsg));
+				else
+					ereport(ERROR,
+							errmsg("could not find a valid record after %X/%08X",
+								   LSN_FORMAT_ARGS(start_lsn)));
+			}
 		}
 
 		/* We shouldn't go backward. */
@@ -1013,7 +1021,6 @@ SummarizeWAL(TimeLineID tli, XLogRecPtr start_lsn, bool exact,
 	while (1)
 	{
 		int			block_id;
-		char	   *errormsg;
 		XLogRecord *record;
 		uint8		rmid;
 
diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c
index 1c1ccf59f65..d73f44e6394 100644
--- a/src/bin/pg_waldump/pg_waldump.c
+++ b/src/bin/pg_waldump/pg_waldump.c
@@ -1211,11 +1211,17 @@ main(int argc, char **argv)
 		pg_fatal("out of memory while allocating a WAL reading processor");
 
 	/* first find a valid recptr to start from */
-	first_record = XLogFindNextRecord(xlogreader_state, private.startptr);
+	first_record = XLogFindNextRecord(xlogreader_state, private.startptr, &errormsg);
 
 	if (!XLogRecPtrIsValid(first_record))
-		pg_fatal("could not find a valid record after %X/%08X",
-				 LSN_FORMAT_ARGS(private.startptr));
+	{
+		if (errormsg)
+			pg_fatal("could not find a valid record after %X/%X: %s",
+					 LSN_FORMAT_ARGS(private.startptr), errormsg);
+		else
+			pg_fatal("could not find a valid record after %X/%X",
+					 LSN_FORMAT_ARGS(private.startptr));
+	}
 
 	/*
 	 * Display a message that we're skipping data if `from` wasn't a pointer
diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h
index dfabbbd57d4..4f68fe0e4ea 100644
--- a/src/include/access/xlogreader.h
+++ b/src/include/access/xlogreader.h
@@ -342,7 +342,7 @@ extern void XLogReaderSetDecodeBuffer(XLogReaderState *state,
 
 /* Position the XLogReader to given record */
 extern void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr);
-extern XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr);
+extern XLogRecPtr XLogFindNextRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg);
 
 /* Return values from XLogPageReadCB. */
 typedef enum XLogPageReadResult
-- 
2.51.0