Remove useless arguments in ReadCheckpointRecord().
Hi,
I'd like to propose to remove "whichChkpt" and "report" arguments in ReadCheckpointRecord(). "report" is obviously useless because it's always true, i.e., there are two callers of the function and they always specify true as "report".
"whichChkpt" indicates where the specified checkpoint location came from, pg_control or backup_label. This information is used to log different messages as follows.
switch (whichChkpt)
{
case 1:
ereport(LOG,
(errmsg("invalid primary checkpoint link in control file")));
break;
default:
ereport(LOG,
(errmsg("invalid checkpoint link in backup_label file")));
break;
}
return NULL;
...
switch (whichChkpt)
{
case 1:
ereport(LOG,
(errmsg("invalid primary checkpoint record")));
break;
default:
ereport(LOG,
(errmsg("invalid checkpoint record")));
break;
}
return NULL;
...
But the callers of ReadCheckpointRecord() already output different log messages depending on where the invalid checkpoint record came from. So even if ReadCheckpointRecord() doesn't use "whichChkpt", i.e., use the same log message in both pg_control and backup_label cases, users can still identify where the invalid checkpoint record came from, by reading the log message.
Also when whichChkpt = 0, "primary checkpoint" is used in the log message and sounds confusing because, as far as I recall correctly, we removed the concept of primary and secondary checkpoints before.
Therefore I think that it's better to remove "whichChkpt" argument in ReadCheckpointRecord().
Patch attached. Thought?
Regards,
--
Fujii Masao
Advanced Computing Technology Center
Research and Development Headquarters
NTT DATA CORPORATION
Attachments:
0001-Remove-useless-arguments-in-ReadCheckpointRecord.patchtext/plain; charset=UTF-8; name=0001-Remove-useless-arguments-in-ReadCheckpointRecord.patchDownload
From a33e557a18cf5c45bbcf47f1cf92e26998f1cd67 Mon Sep 17 00:00:00 2001
From: Fujii Masao <fujii@postgresql.org>
Date: Wed, 20 Jul 2022 23:06:44 +0900
Subject: [PATCH] Remove useless arguments in ReadCheckpointRecord().
---
src/backend/access/transam/xlogrecovery.c | 84 ++++-------------------
1 file changed, 15 insertions(+), 69 deletions(-)
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index 5d6f1b5e46..e383c2123a 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -422,8 +422,8 @@ static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,
XLogRecPtr replayLSN,
bool nonblocking);
static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
-static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
- int whichChkpt, bool report, TimeLineID replayTLI);
+static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher,
+ XLogRecPtr RecPtr, TimeLineID replayTLI);
static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
XLogSource source, bool notfoundOk);
@@ -605,7 +605,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
* When a backup_label file is present, we want to roll forward from
* the checkpoint it identifies, rather than using pg_control.
*/
- record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc, 0, true,
+ record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
CheckPointTLI);
if (record != NULL)
{
@@ -744,7 +744,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID;
RedoStartLSN = ControlFile->checkPointCopy.redo;
RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID;
- record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc, 1, true,
+ record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
CheckPointTLI);
if (record != NULL)
{
@@ -3843,13 +3843,10 @@ emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
/*
* Subroutine to try to fetch and validate a prior checkpoint record.
- *
- * whichChkpt identifies the checkpoint (merely for reporting purposes).
- * 1 for "primary", 0 for "other" (backup_label)
*/
static XLogRecord *
ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
- int whichChkpt, bool report, TimeLineID replayTLI)
+ TimeLineID replayTLI)
{
XLogRecord *record;
uint8 info;
@@ -3858,20 +3855,8 @@ ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
if (!XRecOffIsValid(RecPtr))
{
- if (!report)
- return NULL;
-
- switch (whichChkpt)
- {
- case 1:
- ereport(LOG,
- (errmsg("invalid primary checkpoint link in control file")));
- break;
- default:
- ereport(LOG,
- (errmsg("invalid checkpoint link in backup_label file")));
- break;
- }
+ ereport(LOG,
+ (errmsg("invalid checkpoint location")));
return NULL;
}
@@ -3880,67 +3865,28 @@ ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
if (record == NULL)
{
- if (!report)
- return NULL;
-
- switch (whichChkpt)
- {
- case 1:
- ereport(LOG,
- (errmsg("invalid primary checkpoint record")));
- break;
- default:
- ereport(LOG,
- (errmsg("invalid checkpoint record")));
- break;
- }
+ ereport(LOG,
+ (errmsg("invalid checkpoint record")));
return NULL;
}
if (record->xl_rmid != RM_XLOG_ID)
{
- switch (whichChkpt)
- {
- case 1:
- ereport(LOG,
- (errmsg("invalid resource manager ID in primary checkpoint record")));
- break;
- default:
- ereport(LOG,
- (errmsg("invalid resource manager ID in checkpoint record")));
- break;
- }
+ ereport(LOG,
+ (errmsg("invalid resource manager ID in checkpoint record")));
return NULL;
}
info = record->xl_info & ~XLR_INFO_MASK;
if (info != XLOG_CHECKPOINT_SHUTDOWN &&
info != XLOG_CHECKPOINT_ONLINE)
{
- switch (whichChkpt)
- {
- case 1:
- ereport(LOG,
- (errmsg("invalid xl_info in primary checkpoint record")));
- break;
- default:
- ereport(LOG,
- (errmsg("invalid xl_info in checkpoint record")));
- break;
- }
+ ereport(LOG,
+ (errmsg("invalid xl_info in checkpoint record")));
return NULL;
}
if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
{
- switch (whichChkpt)
- {
- case 1:
- ereport(LOG,
- (errmsg("invalid length of primary checkpoint record")));
- break;
- default:
- ereport(LOG,
- (errmsg("invalid length of checkpoint record")));
- break;
- }
+ ereport(LOG,
+ (errmsg("invalid length of checkpoint record")));
return NULL;
}
return record;
--
2.36.0
On Wed, Jul 20, 2022 at 8:21 PM Fujii Masao <masao.fujii@oss.nttdata.com> wrote:
Hi,
I'd like to propose to remove "whichChkpt" and "report" arguments in ReadCheckpointRecord(). "report" is obviously useless because it's always true, i.e., there are two callers of the function and they always specify true as "report".
Yes, the report parameter is obvious to delete. The commit 1d919de5eb
removed the only call with the report parameter as false.
"whichChkpt" indicates where the specified checkpoint location came from, pg_control or backup_label. This information is used to log different messages as follows.
switch (whichChkpt)
{
case 1:
ereport(LOG,
(errmsg("invalid primary checkpoint link in control file")));
break;
default:
ereport(LOG,
(errmsg("invalid checkpoint link in backup_label file")));
break;
}
return NULL;
...
switch (whichChkpt)
{
case 1:
ereport(LOG,
(errmsg("invalid primary checkpoint record")));
break;
default:
ereport(LOG,
(errmsg("invalid checkpoint record")));
break;
}
return NULL;
...But the callers of ReadCheckpointRecord() already output different log messages depending on where the invalid checkpoint record came from. So even if ReadCheckpointRecord() doesn't use "whichChkpt", i.e., use the same log message in both pg_control and backup_label cases, users can still identify where the invalid checkpoint record came from, by reading the log message.
Also when whichChkpt = 0, "primary checkpoint" is used in the log message and sounds confusing because, as far as I recall correctly, we removed the concept of primary and secondary checkpoints before.
Yes, using "primary checkpoint" confuses, as we usually refer primary
in the context of replication and HA.
Therefore I think that it's better to remove "whichChkpt" argument in ReadCheckpointRecord().
Patch attached. Thought?
How about we transform the following messages into something like below?
(errmsg("could not locate a valid checkpoint record"))); after
ReadCheckpointRecord() for control file cases to "could not locate
valid checkpoint record in control file"
(errmsg("could not locate required checkpoint record"), after
ReadCheckpointRecord() for backup_label case to "could not locate
valid checkpoint record in backup_label file"
The above messages give more meaningful and unique info to the users.
May be unrelated, IIRC, for the errors like ereport(PANIC,
(errmsg("could not locate a valid checkpoint record"))); we wanted to
add a hint asking users to consider running pg_resetwal to fix the
issue. The error for ReadCheckpointRecord() in backup_label file
cases, already gives a hint errhint("If you are restoring from a
backup, touch \"%s/recovery.signal\" ...... Adding the hint of running
pg_resetwal (of course with a caution that it can cause inconsistency
in the data and use it as a last resort as described in the docs)
helps users and support engineers a lot to mitigate the server down
cases quickly.
Regards,
Bharath Rupireddy.
On 2022/07/21 0:29, Bharath Rupireddy wrote:
How about we transform the following messages into something like below?
(errmsg("could not locate a valid checkpoint record"))); after
ReadCheckpointRecord() for control file cases to "could not locate
valid checkpoint record in control file"
(errmsg("could not locate required checkpoint record"), after
ReadCheckpointRecord() for backup_label case to "could not locate
valid checkpoint record in backup_label file"The above messages give more meaningful and unique info to the users.
Agreed. Attached is the updated version of the patch.
Thanks for the review!
May be unrelated, IIRC, for the errors like ereport(PANIC,
(errmsg("could not locate a valid checkpoint record"))); we wanted to
add a hint asking users to consider running pg_resetwal to fix the
issue. The error for ReadCheckpointRecord() in backup_label file
cases, already gives a hint errhint("If you are restoring from a
backup, touch \"%s/recovery.signal\" ...... Adding the hint of running
pg_resetwal (of course with a caution that it can cause inconsistency
in the data and use it as a last resort as described in the docs)
helps users and support engineers a lot to mitigate the server down
cases quickly.
+1 to add some hint messages. But I'm not sure if it's good to hint the use of pg_resetwal because, as you're saying, it should be used as a last resort and has some big risks like data loss, corruption, etc. That is, I'm concerned about that some users might quickly run pg_resetwal because hint message says that, without reading the docs nor understanding those risks.
Regards,
--
Fujii Masao
Advanced Computing Technology Center
Research and Development Headquarters
NTT DATA CORPORATION
Attachments:
v2-0001-Remove-useless-arguments-in-ReadCheckpointRecord.patchtext/plain; charset=UTF-8; name=v2-0001-Remove-useless-arguments-in-ReadCheckpointRecord.patchDownload
From a5e45acb753e9d93074a25a2fc409c693c46630c Mon Sep 17 00:00:00 2001
From: Fujii Masao <fujii@postgresql.org>
Date: Wed, 20 Jul 2022 23:06:44 +0900
Subject: [PATCH] Remove useless arguments in ReadCheckpointRecord().
---
src/backend/access/transam/xlogrecovery.c | 88 +++++------------------
1 file changed, 17 insertions(+), 71 deletions(-)
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index 5d6f1b5e46..010f0cd7b2 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -422,8 +422,8 @@ static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,
XLogRecPtr replayLSN,
bool nonblocking);
static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
-static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
- int whichChkpt, bool report, TimeLineID replayTLI);
+static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher,
+ XLogRecPtr RecPtr, TimeLineID replayTLI);
static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
XLogSource source, bool notfoundOk);
@@ -605,7 +605,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
* When a backup_label file is present, we want to roll forward from
* the checkpoint it identifies, rather than using pg_control.
*/
- record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc, 0, true,
+ record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
CheckPointTLI);
if (record != NULL)
{
@@ -638,7 +638,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
else
{
ereport(FATAL,
- (errmsg("could not locate required checkpoint record"),
+ (errmsg("could not locate a valid checkpoint record in backup_label file"),
errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
"If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
"Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
@@ -744,7 +744,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID;
RedoStartLSN = ControlFile->checkPointCopy.redo;
RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID;
- record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc, 1, true,
+ record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
CheckPointTLI);
if (record != NULL)
{
@@ -761,7 +761,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
* simplify processing around checkpoints.
*/
ereport(PANIC,
- (errmsg("could not locate a valid checkpoint record")));
+ (errmsg("could not locate a valid checkpoint record in control file")));
}
memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
@@ -3843,13 +3843,10 @@ emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
/*
* Subroutine to try to fetch and validate a prior checkpoint record.
- *
- * whichChkpt identifies the checkpoint (merely for reporting purposes).
- * 1 for "primary", 0 for "other" (backup_label)
*/
static XLogRecord *
ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
- int whichChkpt, bool report, TimeLineID replayTLI)
+ TimeLineID replayTLI)
{
XLogRecord *record;
uint8 info;
@@ -3858,20 +3855,8 @@ ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
if (!XRecOffIsValid(RecPtr))
{
- if (!report)
- return NULL;
-
- switch (whichChkpt)
- {
- case 1:
- ereport(LOG,
- (errmsg("invalid primary checkpoint link in control file")));
- break;
- default:
- ereport(LOG,
- (errmsg("invalid checkpoint link in backup_label file")));
- break;
- }
+ ereport(LOG,
+ (errmsg("invalid checkpoint location")));
return NULL;
}
@@ -3880,67 +3865,28 @@ ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
if (record == NULL)
{
- if (!report)
- return NULL;
-
- switch (whichChkpt)
- {
- case 1:
- ereport(LOG,
- (errmsg("invalid primary checkpoint record")));
- break;
- default:
- ereport(LOG,
- (errmsg("invalid checkpoint record")));
- break;
- }
+ ereport(LOG,
+ (errmsg("invalid checkpoint record")));
return NULL;
}
if (record->xl_rmid != RM_XLOG_ID)
{
- switch (whichChkpt)
- {
- case 1:
- ereport(LOG,
- (errmsg("invalid resource manager ID in primary checkpoint record")));
- break;
- default:
- ereport(LOG,
- (errmsg("invalid resource manager ID in checkpoint record")));
- break;
- }
+ ereport(LOG,
+ (errmsg("invalid resource manager ID in checkpoint record")));
return NULL;
}
info = record->xl_info & ~XLR_INFO_MASK;
if (info != XLOG_CHECKPOINT_SHUTDOWN &&
info != XLOG_CHECKPOINT_ONLINE)
{
- switch (whichChkpt)
- {
- case 1:
- ereport(LOG,
- (errmsg("invalid xl_info in primary checkpoint record")));
- break;
- default:
- ereport(LOG,
- (errmsg("invalid xl_info in checkpoint record")));
- break;
- }
+ ereport(LOG,
+ (errmsg("invalid xl_info in checkpoint record")));
return NULL;
}
if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
{
- switch (whichChkpt)
- {
- case 1:
- ereport(LOG,
- (errmsg("invalid length of primary checkpoint record")));
- break;
- default:
- ereport(LOG,
- (errmsg("invalid length of checkpoint record")));
- break;
- }
+ ereport(LOG,
+ (errmsg("invalid length of checkpoint record")));
return NULL;
}
return record;
--
2.36.0
I agree to removing the two parameters. And agree to let
ReadCheckpointRecord not conscious of the location source.
At Thu, 21 Jul 2022 11:45:23 +0900, Fujii Masao <masao.fujii@oss.nttdata.com> wrote in
Agreed. Attached is the updated version of the patch.
Thanks for the review!
- (errmsg("could not locate required checkpoint record"),
+ (errmsg("could not locate a valid checkpoint record in backup_label file"),
"in backup_label" there looks *to me* need some verb.. By the way,
this looks like a good chance to remove the (now) extra parens around
errmsg() and friends.
For example:
- (errmsg("could not locate a valid checkpoint record in backup_label file"),
+ errmsg("could not locate checkpoint record spcified in backup_label file"),
- (errmsg("could not locate a valid checkpoint record in control file")));
+ errmsg("could not locate checkpoint record recorded in control file")));
+ (errmsg("invalid checkpoint record")));
Is it useful to show the specified LSN there?
+ (errmsg("invalid resource manager ID in checkpoint record")));
We have a similar message "invalid resource manager ID %u at
%X/%X". Since the caller explains that it is a checkpoint record, we
can share the message here.
+ (errmsg("invalid xl_info in checkpoint record")));
(It is not an issue of this patch, though) I don't think this is
appropriate for user-facing message. Counldn't we say "unexpected
record type: %x" or something like?
+ (errmsg("invalid length of checkpoint record")));
We have "record with invalid length at %X/%X" or "invalid record
length at %X/%X: wanted %u, got %u". Counld we reuse any of them?
May be unrelated, IIRC, for the errors like ereport(PANIC,
(errmsg("could not locate a valid checkpoint record"))); we wanted to
add a hint asking users to consider running pg_resetwal to fix the
issue. The error for ReadCheckpointRecord() in backup_label file
cases, already gives a hint errhint("If you are restoring from a
backup, touch \"%s/recovery.signal\" ...... Adding the hint of running
pg_resetwal (of course with a caution that it can cause inconsistency
in the data and use it as a last resort as described in the docs)
helps users and support engineers a lot to mitigate the server down
cases quickly.+1 to add some hint messages. But I'm not sure if it's good to hint
the use of pg_resetwal because, as you're saying, it should be used as
a last resort and has some big risks like data loss, corruption,
etc. That is, I'm concerned about that some users might quickly run
pg_resetwal because hint message says that, without reading the docs
nor understanding those risks.
I don't think we want to recommend pg_resetwal to those who don't
reach it by themselves by other means. I know of a few instances of
people who had the tool (unrecoverably) break their own clusters.
regards.
--
Kyotaro Horiguchi
NTT Open Source Software Center
On 2022/07/21 14:54, Kyotaro Horiguchi wrote:
I agree to removing the two parameters. And agree to let
ReadCheckpointRecord not conscious of the location source.
Thanks for the review!
At Thu, 21 Jul 2022 11:45:23 +0900, Fujii Masao <masao.fujii@oss.nttdata.com> wrote in
Agreed. Attached is the updated version of the patch.
Thanks for the review!- (errmsg("could not locate required checkpoint record"), + (errmsg("could not locate a valid checkpoint record in backup_label file"),"in backup_label" there looks *to me* need some verb..
Sorry, I failed to understand your point. Could you clarify your point?
By the way,
this looks like a good chance to remove the (now) extra parens around
errmsg() and friends.For example:
- (errmsg("could not locate a valid checkpoint record in backup_label file"), + errmsg("could not locate checkpoint record spcified in backup_label file"),- (errmsg("could not locate a valid checkpoint record in control file"))); + errmsg("could not locate checkpoint record recorded in control file")));
Because it's recommended not to put parenthesis just before errmsg(), you mean? I'm ok to remove such parenthesis, but I'd like understand why before doing that. I was thinking that either having or not having parenthesis in front of errmsg() is ok, so there are many calls to errmsg() with parenthesis, in xlogrecovery.c.
+ (errmsg("invalid checkpoint record")));
Is it useful to show the specified LSN there?
Yes, LSN info would be helpful also for debugging.
I separated the patch into two; one is to remove useless arguments in ReadCheckpointRecord(), another is to improve log messages. I added LSN info in log messages in the second patch.
+ (errmsg("invalid resource manager ID in checkpoint record")));
We have a similar message "invalid resource manager ID %u at
%X/%X". Since the caller explains that it is a checkpoint record, we
can share the message here.
+1
+ (errmsg("invalid xl_info in checkpoint record")));
(It is not an issue of this patch, though) I don't think this is
appropriate for user-facing message. Counldn't we say "unexpected
record type: %x" or something like?
The proposed log message doesn't look like an improvement. But I have no better one. So I left the message as it is, in the patch, for now.
+ (errmsg("invalid length of checkpoint record")));
We have "record with invalid length at %X/%X" or "invalid record
length at %X/%X: wanted %u, got %u". Counld we reuse any of them?
+1
Regards,
--
Fujii Masao
Advanced Computing Technology Center
Research and Development Headquarters
NTT DATA CORPORATION
Attachments:
v3-0001-Remove-useless-arguments-in-ReadCheckpointRecord.patchtext/plain; charset=UTF-8; name=v3-0001-Remove-useless-arguments-in-ReadCheckpointRecord.patchDownload
From 1a7f5b66fed158306ed802cb08b328c2f8d47f20 Mon Sep 17 00:00:00 2001
From: Fujii Masao <fujii@postgresql.org>
Date: Wed, 20 Jul 2022 23:06:44 +0900
Subject: [PATCH 1/2] Remove useless arguments in ReadCheckpointRecord().
---
src/backend/access/transam/xlogrecovery.c | 84 ++++-------------------
1 file changed, 15 insertions(+), 69 deletions(-)
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index 5d6f1b5e46..e383c2123a 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -422,8 +422,8 @@ static XLogPageReadResult WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,
XLogRecPtr replayLSN,
bool nonblocking);
static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
-static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
- int whichChkpt, bool report, TimeLineID replayTLI);
+static XLogRecord *ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher,
+ XLogRecPtr RecPtr, TimeLineID replayTLI);
static bool rescanLatestTimeLine(TimeLineID replayTLI, XLogRecPtr replayLSN);
static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
XLogSource source, bool notfoundOk);
@@ -605,7 +605,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
* When a backup_label file is present, we want to roll forward from
* the checkpoint it identifies, rather than using pg_control.
*/
- record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc, 0, true,
+ record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
CheckPointTLI);
if (record != NULL)
{
@@ -744,7 +744,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID;
RedoStartLSN = ControlFile->checkPointCopy.redo;
RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID;
- record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc, 1, true,
+ record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc,
CheckPointTLI);
if (record != NULL)
{
@@ -3843,13 +3843,10 @@ emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
/*
* Subroutine to try to fetch and validate a prior checkpoint record.
- *
- * whichChkpt identifies the checkpoint (merely for reporting purposes).
- * 1 for "primary", 0 for "other" (backup_label)
*/
static XLogRecord *
ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
- int whichChkpt, bool report, TimeLineID replayTLI)
+ TimeLineID replayTLI)
{
XLogRecord *record;
uint8 info;
@@ -3858,20 +3855,8 @@ ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
if (!XRecOffIsValid(RecPtr))
{
- if (!report)
- return NULL;
-
- switch (whichChkpt)
- {
- case 1:
- ereport(LOG,
- (errmsg("invalid primary checkpoint link in control file")));
- break;
- default:
- ereport(LOG,
- (errmsg("invalid checkpoint link in backup_label file")));
- break;
- }
+ ereport(LOG,
+ (errmsg("invalid checkpoint location")));
return NULL;
}
@@ -3880,67 +3865,28 @@ ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
if (record == NULL)
{
- if (!report)
- return NULL;
-
- switch (whichChkpt)
- {
- case 1:
- ereport(LOG,
- (errmsg("invalid primary checkpoint record")));
- break;
- default:
- ereport(LOG,
- (errmsg("invalid checkpoint record")));
- break;
- }
+ ereport(LOG,
+ (errmsg("invalid checkpoint record")));
return NULL;
}
if (record->xl_rmid != RM_XLOG_ID)
{
- switch (whichChkpt)
- {
- case 1:
- ereport(LOG,
- (errmsg("invalid resource manager ID in primary checkpoint record")));
- break;
- default:
- ereport(LOG,
- (errmsg("invalid resource manager ID in checkpoint record")));
- break;
- }
+ ereport(LOG,
+ (errmsg("invalid resource manager ID in checkpoint record")));
return NULL;
}
info = record->xl_info & ~XLR_INFO_MASK;
if (info != XLOG_CHECKPOINT_SHUTDOWN &&
info != XLOG_CHECKPOINT_ONLINE)
{
- switch (whichChkpt)
- {
- case 1:
- ereport(LOG,
- (errmsg("invalid xl_info in primary checkpoint record")));
- break;
- default:
- ereport(LOG,
- (errmsg("invalid xl_info in checkpoint record")));
- break;
- }
+ ereport(LOG,
+ (errmsg("invalid xl_info in checkpoint record")));
return NULL;
}
if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
{
- switch (whichChkpt)
- {
- case 1:
- ereport(LOG,
- (errmsg("invalid length of primary checkpoint record")));
- break;
- default:
- ereport(LOG,
- (errmsg("invalid length of checkpoint record")));
- break;
- }
+ ereport(LOG,
+ (errmsg("invalid length of checkpoint record")));
return NULL;
}
return record;
--
2.36.0
v3-0002-Improve-log-messages-when-invalid-checkpoint-record-.patchtext/plain; charset=UTF-8; name=v3-0002-Improve-log-messages-when-invalid-checkpoint-record-.patchDownload
From 4358d1585121ad5685c08f19516b49d4e3045640 Mon Sep 17 00:00:00 2001
From: Fujii Masao <fujii@postgresql.org>
Date: Fri, 22 Jul 2022 10:05:33 +0900
Subject: [PATCH 2/2] Improve log messages when invalid checkpoint record was
found.
---
src/backend/access/transam/xlogrecovery.c | 22 ++++++++++++++--------
1 file changed, 14 insertions(+), 8 deletions(-)
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index e383c2123a..07bb1f03b8 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -638,7 +638,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
else
{
ereport(FATAL,
- (errmsg("could not locate required checkpoint record"),
+ (errmsg("could not locate a valid checkpoint record in backup_label file"),
errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
"If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
"Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
@@ -761,7 +761,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
* simplify processing around checkpoints.
*/
ereport(PANIC,
- (errmsg("could not locate a valid checkpoint record")));
+ (errmsg("could not locate a valid checkpoint record in control file")));
}
memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
@@ -3856,7 +3856,8 @@ ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
if (!XRecOffIsValid(RecPtr))
{
ereport(LOG,
- (errmsg("invalid checkpoint location")));
+ (errmsg("invalid checkpoint location: %X/%X",
+ LSN_FORMAT_ARGS(RecPtr))));
return NULL;
}
@@ -3866,13 +3867,15 @@ ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
if (record == NULL)
{
ereport(LOG,
- (errmsg("invalid checkpoint record")));
+ (errmsg("invalid checkpoint record at %X/%X",
+ LSN_FORMAT_ARGS(RecPtr))));
return NULL;
}
if (record->xl_rmid != RM_XLOG_ID)
{
ereport(LOG,
- (errmsg("invalid resource manager ID in checkpoint record")));
+ (errmsg("invalid resource manager ID %u in checkpoint record at %X/%X",
+ record->xl_rmid, LSN_FORMAT_ARGS(RecPtr))));
return NULL;
}
info = record->xl_info & ~XLR_INFO_MASK;
@@ -3880,13 +3883,16 @@ ReadCheckpointRecord(XLogPrefetcher *xlogprefetcher, XLogRecPtr RecPtr,
info != XLOG_CHECKPOINT_ONLINE)
{
ereport(LOG,
- (errmsg("invalid xl_info in checkpoint record")));
+ (errmsg("invalid xl_info in checkpoint record at %X/%X",
+ LSN_FORMAT_ARGS(RecPtr))));
return NULL;
}
- if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
+ if (record->xl_tot_len != SizeOfXLogRecord +
+ SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
{
ereport(LOG,
- (errmsg("invalid length of checkpoint record")));
+ (errmsg("checkpoint record with invalid length at %X/%X",
+ LSN_FORMAT_ARGS(RecPtr))));
return NULL;
}
return record;
--
2.36.0
Fujii Masao <masao.fujii@oss.nttdata.com> writes:
On 2022/07/21 14:54, Kyotaro Horiguchi wrote:
At Thu, 21 Jul 2022 11:45:23 +0900, Fujii Masao <masao.fujii@oss.nttdata.com> wrote in
- (errmsg("could not locate required checkpoint record"), + (errmsg("could not locate a valid checkpoint record in backup_label file"),
"in backup_label" there looks *to me* need some verb..
Sorry, I failed to understand your point. Could you clarify your point?
FWIW, the proposed change looks like perfectly good English to me.
"locate" is the verb. It's been way too many years since high
school grammar for me to remember the exact term for auxiliary
clauses like "in backup_label file", but that doesn't need its
own verb. Possibly Kyotaro-san is feeling that it should be
like "... checkpoint record in the backup_label file". That'd
be more formal, but in the telegraphic style that we prefer for
primary error messages, omitting the "the" is fine.
regards, tom lane
At Thu, 21 Jul 2022 23:10:04 -0400, Tom Lane <tgl@sss.pgh.pa.us> wrote in
Fujii Masao <masao.fujii@oss.nttdata.com> writes:
On 2022/07/21 14:54, Kyotaro Horiguchi wrote:
At Thu, 21 Jul 2022 11:45:23 +0900, Fujii Masao <masao.fujii@oss.nttdata.com> wrote in
- (errmsg("could not locate required checkpoint record"), + (errmsg("could not locate a valid checkpoint record in backup_label file"),"in backup_label" there looks *to me* need some verb..
Sorry, I failed to understand your point. Could you clarify your point?
FWIW, the proposed change looks like perfectly good English to me.
"locate" is the verb. It's been way too many years since high
school grammar for me to remember the exact term for auxiliary
clauses like "in backup_label file", but that doesn't need its
own verb. Possibly Kyotaro-san is feeling that it should be
like "... checkpoint record in the backup_label file". That'd
be more formal, but in the telegraphic style that we prefer for
primary error messages, omitting the "the" is fine.
Maybe a little different. I thought that a checkpoint record cannot
be located in backup_label file. In other words what is in
backup_label file is a pointer to the record and the record is in a
WAL file.
I'm fine with the proposed sentsnse if the it makes the correct
sense. (And sorry for the noise)
By the way, I learned that the style is called "telegraphic style".
regards.
--
Kyotaro Horiguchi
NTT Open Source Software Center
At Fri, 22 Jul 2022 11:50:14 +0900, Fujii Masao <masao.fujii@oss.nttdata.com> wrote in
Sorry, I failed to understand your point. Could you clarify your
point?
Wrote as a reply to Tom's message.
By the way, this looks like a good chance to remove the (now) extra parens around errmsg() and friends. For example: - (errmsg("could not locate a valid checkpoint record in backup_label - file"), + errmsg("could not locate checkpoint record spcified in backup_label file"), - (errmsg("could not locate a valid checkpoint record in control file"))); + errmsg("could not locate checkpoint record recorded in control file")));Because it's recommended not to put parenthesis just before errmsg(),
you mean? I'm ok to remove such parenthesis, but I'd like understand
why before doing that. I was thinking that either having or not having
parenthesis in front of errmsg() is ok, so there are many calls to
errmsg() with parenthesis, in xlogrecovery.c.
I believed that it is recommended to move to the style not having the
outmost parens. That style has been introduced by e3a87b4991.
* The extra parentheses around the auxiliary function calls are now
optional. Aside from being a bit less ugly, this removes a common
gotcha for new contributors, because in some cases the compiler errors
you got from forgetting them were unintelligible.
...
While new code can be written either way, code intended to be
back-patched will need to use extra parens for awhile yet. It seems
worth back-patching this change into v12, so as to reduce the window
where we have to be careful about that by one year. Hence, this patch
is careful to preserve ABI compatibility; a followup HEAD-only patch
will make some additional simplifications.
So I thought that if we modify an error message, its ererpot can be
rewritten.
+ (errmsg("invalid checkpoint record")));
Is it useful to show the specified LSN there?Yes, LSN info would be helpful also for debugging.
I separated the patch into two; one is to remove useless arguments in
ReadCheckpointRecord(), another is to improve log messages. I added
LSN info in log messages in the second patch.
Thanks!
+ (errmsg("invalid xl_info in checkpoint record")));
(It is not an issue of this patch, though) I don't think this is
appropriate for user-facing message. Counldn't we say "unexpected
record type: %x" or something like?The proposed log message doesn't look like an improvement. But I have
no better one. So I left the message as it is, in the patch, for now.
Understood.
regards
--
Kyotaro Horiguchi
NTT Open Source Software Center
On 2022/07/22 17:31, Kyotaro Horiguchi wrote:
I believed that it is recommended to move to the style not having the
outmost parens. That style has been introduced by e3a87b4991.
I read the commit log, but I'm not sure yet if it's really recommended to remove extra parens even from the existing calls to errmsg(). Removing extra parens can interfere with back-patching of the changes around those errmsg(), can't it?
Anyway, at first I pushed the 0001 patch that removes useless arguments in ReadCheckpointRecord().
Regards,
--
Fujii Masao
Advanced Computing Technology Center
Research and Development Headquarters
NTT DATA CORPORATION
Fujii Masao <masao.fujii@oss.nttdata.com> writes:
On 2022/07/22 17:31, Kyotaro Horiguchi wrote:
I believed that it is recommended to move to the style not having the
outmost parens. That style has been introduced by e3a87b4991.
I read the commit log, but I'm not sure yet if it's really recommended to remove extra parens even from the existing calls to errmsg(). Removing extra parens can interfere with back-patching of the changes around those errmsg(), can't it?
Right, so I wouldn't be in a hurry to change existing calls. If you're
editing an ereport call for some other reason, it's fine to remove the
excess parens in it, because you're creating a backpatch hazard there
anyway. But otherwise, I think such changes are make-work in themselves
and risk creating more make-work for somebody else later.
regards, tom lane
On Thu, Jul 21, 2022 at 11:24 AM Kyotaro Horiguchi
<horikyota.ntt@gmail.com> wrote:
May be unrelated, IIRC, for the errors like ereport(PANIC,
(errmsg("could not locate a valid checkpoint record"))); we wanted to
add a hint asking users to consider running pg_resetwal to fix the
issue. The error for ReadCheckpointRecord() in backup_label file
cases, already gives a hint errhint("If you are restoring from a
backup, touch \"%s/recovery.signal\" ...... Adding the hint of running
pg_resetwal (of course with a caution that it can cause inconsistency
in the data and use it as a last resort as described in the docs)
helps users and support engineers a lot to mitigate the server down
cases quickly.+1 to add some hint messages. But I'm not sure if it's good to hint
the use of pg_resetwal because, as you're saying, it should be used as
a last resort and has some big risks like data loss, corruption,
etc. That is, I'm concerned about that some users might quickly run
pg_resetwal because hint message says that, without reading the docs
nor understanding those risks.I don't think we want to recommend pg_resetwal to those who don't
reach it by themselves by other means. I know of a few instances of
people who had the tool (unrecoverably) break their own clusters.
Agree. We might want to take this topic separately as it needs more
careful study of common issues such as PANICs and then adding hints
with proven ways to repair the server and bring it back online.
Regards,
Bharath Rupireddy.
At Sun, 24 Jul 2022 22:40:16 -0400, Tom Lane <tgl@sss.pgh.pa.us> wrote in
Fujii Masao <masao.fujii@oss.nttdata.com> writes:
On 2022/07/22 17:31, Kyotaro Horiguchi wrote:
I believed that it is recommended to move to the style not having the
outmost parens. That style has been introduced by e3a87b4991.I read the commit log, but I'm not sure yet if it's really recommended to remove extra parens even from the existing calls to errmsg(). Removing extra parens can interfere with back-patching of the changes around those errmsg(), can't it?
Right, so I wouldn't be in a hurry to change existing calls. If you're
editing an ereport call for some other reason, it's fine to remove the
excess parens in it, because you're creating a backpatch hazard there
anyway. But otherwise, I think such changes are make-work in themselves
and risk creating more make-work for somebody else later.
So, I meant to propose to remove extra parens for errmsg()'s where the
message string is edited. Is it fine in that criteria?
regards.
--
Kyotaro Horiguchi
NTT Open Source Software Center
On 2022/07/26 9:42, Kyotaro Horiguchi wrote:
At Sun, 24 Jul 2022 22:40:16 -0400, Tom Lane <tgl@sss.pgh.pa.us> wrote in
Fujii Masao <masao.fujii@oss.nttdata.com> writes:
On 2022/07/22 17:31, Kyotaro Horiguchi wrote:
I believed that it is recommended to move to the style not having the
outmost parens. That style has been introduced by e3a87b4991.I read the commit log, but I'm not sure yet if it's really recommended to remove extra parens even from the existing calls to errmsg(). Removing extra parens can interfere with back-patching of the changes around those errmsg(), can't it?
Right, so I wouldn't be in a hurry to change existing calls. If you're
editing an ereport call for some other reason, it's fine to remove the
excess parens in it, because you're creating a backpatch hazard there
anyway. But otherwise, I think such changes are make-work in themselves
and risk creating more make-work for somebody else later.So, I meant to propose to remove extra parens for errmsg()'s where the
message string is edited. Is it fine in that criteria?
Even in that case, removing parens may interfere with the back-patch. For example, please imagine the case where wasShutdown is changed to be set to true in the following code and this changed is back-patched to v15. If we modify only the log message in the following errmsg() and leave the parens around that, git cherry-pick of the change of wasShutdown to v15 would be completed successfully. But if we remove the parens, git cherry-pick would fail.
ereport(FATAL,
(errmsg("could not locate required checkpoint record"),
errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
"If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
"Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
DataDir, DataDir, DataDir)));
wasShutdown = false; /* keep compiler quiet */
Regards,
--
Fujii Masao
Advanced Computing Technology Center
Research and Development Headquarters
NTT DATA CORPORATION