Print physical file path when checksum check fails

Started by Hubert Zhangalmost 6 years ago12 messages

Hubert Zhang

hzhang@pivotal.io

almost 6 years ago

1 attachment(s)

Hi hacker,

Currently we only print block number and relation path when checksum check
fails. See example below:

ERROR: invalid page in block 333571 of relation base/65959/656195

DBA complains that she needs additional work to calculate which physical
file is broken, since one physical file can only contain `RELSEG_SIZE`
number of blocks. For large tables, we need to use many physical files with
additional suffix, e.g. 656195.1, 656195.2 ...

Is that a good idea to also print the physical file path in error message?
Like below:

ERROR: invalid page in block 333571 of relation base/65959/656195, file
path base/65959/656195.2

Patch is attached.
--
Thanks

Hubert Zhang

Attachments:

0001-Print-physical-file-path-when-checksum-check-fails.patchapplication/octet-stream; name=0001-Print-physical-file-path-when-checksum-check-fails.patchDownload

From 785ebf3b029b7b79a90eb15ae2ddb9e984389fd7 Mon Sep 17 00:00:00 2001
From: Hubert Zhang <hzhang@pivotal.io>
Date: Mon, 10 Feb 2020 07:54:51 +0000
Subject: [PATCH] Print physical file path when checksum check fails

---
 src/backend/storage/buffer/bufmgr.c | 11 +++++++----
 src/common/relpath.c                | 27 +++++++++++++++++++++++++++
 src/include/common/relpath.h        | 21 +++++++++++++++++++++
 3 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index aba3960481..84308bd7a4 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -912,17 +912,20 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 				{
 					ereport(WARNING,
 							(errcode(ERRCODE_DATA_CORRUPTED),
-							 errmsg("invalid page in block %u of relation %s; zeroing out page",
+							 errmsg("invalid page in block %u of relation %s, "
+									"file path %s; zeroing out page",
 									blockNum,
-									relpath(smgr->smgr_rnode, forkNum))));
+									relpath(smgr->smgr_rnode, forkNum),
+									relfilepath(smgr->smgr_rnode, forkNum, blockNum))));
 					MemSet((char *) bufBlock, 0, BLCKSZ);
 				}
 				else
 					ereport(ERROR,
 							(errcode(ERRCODE_DATA_CORRUPTED),
-							 errmsg("invalid page in block %u of relation %s",
+							 errmsg("invalid page in block %u of relation %s, file path %s",
 									blockNum,
-									relpath(smgr->smgr_rnode, forkNum))));
+									relpath(smgr->smgr_rnode, forkNum),
+									relfilepath(smgr->smgr_rnode, forkNum, blockNum))));
 			}
 		}
 	}
diff --git a/src/common/relpath.c b/src/common/relpath.c
index ad733d1363..8b39c4ac4f 100644
--- a/src/common/relpath.c
+++ b/src/common/relpath.c
@@ -208,3 +208,30 @@ GetRelationPath(Oid dbNode, Oid spcNode, Oid relNode,
 	}
 	return path;
 }
+
+/*
+ * GetRelationFilePath - construct path to a relation's physical file
+ * given its block number.
+ */
+	char *
+GetRelationFilePath(Oid dbNode, Oid spcNode, Oid relNode,
+					int backendId, ForkNumber forkNumber, BlockNumber blkno)
+{
+	char	   *path;
+	char	   *fullpath;
+	BlockNumber	segno;
+
+	path = GetRelationPath(dbNode, spcNode, relNode, backendId, forkNumber);
+
+	segno = blkno / ((BlockNumber) RELSEG_SIZE);
+
+	if (segno > 0)
+	{
+		fullpath = psprintf("%s.%u", path, segno);
+		pfree(path);
+	}
+	else
+		fullpath = path;
+
+	return fullpath;
+}
diff --git a/src/include/common/relpath.h b/src/include/common/relpath.h
index 869cabcc0d..7e036f4086 100644
--- a/src/include/common/relpath.h
+++ b/src/include/common/relpath.h
@@ -13,6 +13,8 @@
 #ifndef RELPATH_H
 #define RELPATH_H
 
+#include "storage/block.h"
+
 /*
  *	'pgrminclude ignore' needed here because CppAsString2() does not throw
  *	an error if the symbol is not defined.
@@ -69,6 +71,9 @@ extern char *GetDatabasePath(Oid dbNode, Oid spcNode);
 extern char *GetRelationPath(Oid dbNode, Oid spcNode, Oid relNode,
 							 int backendId, ForkNumber forkNumber);
 
+extern char *GetRelationFilePath(Oid dbNode, Oid spcNode, Oid relNode,
+								 int backendId, ForkNumber forkNumber, BlockNumber blkno);
+
 /*
  * Wrapper macros for GetRelationPath.  Beware of multiple
  * evaluation of the RelFileNode or RelFileNodeBackend argument!
@@ -87,4 +92,20 @@ extern char *GetRelationPath(Oid dbNode, Oid spcNode, Oid relNode,
 #define relpath(rnode, forknum) \
 	relpathbackend((rnode).node, (rnode).backend, forknum)
 
+/*
+ * File path of a relation given the block number.
+ * Note that a file can contain at most RELSEG_SIZE number
+ * of blocks. We supply relfilepath macro to display the
+ * physical file name for a relation given the block number.
+ */
+
+/* First argument is a RelFileNode */
+#define relfilepathbackend(rnode, backend, forknum, blkno) \
+	GetRelationFilePath((rnode).dbNode, (rnode).spcNode, (rnode).relNode, \
+						backend, forknum, blkno)
+
+/* First argument is a RelFileNodeBackend */
+#define relfilepath(rnode, forknum, blkno) \
+	relfilepathbackend((rnode).node, (rnode).backend, forknum, blkno)
+
 #endif							/* RELPATH_H */
-- 
2.16.6

Andres Freund

andres@anarazel.de

almost 6 years ago

In reply to: Hubert Zhang (#1)

Re: Print physical file path when checksum check fails

HHi,

On 2020-02-10 16:04:21 +0800, Hubert Zhang wrote:

Currently we only print block number and relation path when checksum check
fails. See example below:

ERROR: invalid page in block 333571 of relation base/65959/656195

DBA complains that she needs additional work to calculate which physical
file is broken, since one physical file can only contain `RELSEG_SIZE`
number of blocks. For large tables, we need to use many physical files with
additional suffix, e.g. 656195.1, 656195.2 ...

Is that a good idea to also print the physical file path in error message?
Like below:

ERROR: invalid page in block 333571 of relation base/65959/656195, file
path base/65959/656195.2

I think that'd be a nice improvement. But:

I don't think the way you did it is right architecturally. The
segmenting is really something that lives within md.c, and we shouldn't
further expose it outside of that. And e.g. the undo patchset uses files
with different segmentation - but still goes through bufmgr.c.

I wonder if this partially signals that the checksum verification piece
is architecturally done in the wrong place currently? It's imo not good
that every place doing an smgrread() needs to separately verify
checksums. OTOH, it doesn't really belong inside smgr.c.

This layering issue was also encountered in
https://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=3eb77eba5a51780d5cf52cd66a9844cd4d26feb0
so perhaps we should work to reuse the FileTag it introduces to
represent segments, without hardcoding the specific segment size?

Regards,

Andres

@@ -912,17 +912,20 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
{
ereport(WARNING,
(errcode(ERRCODE_DATA_CORRUPTED),
-							 errmsg("invalid page in block %u of relation %s; zeroing out page",
+							 errmsg("invalid page in block %u of relation %s, "
+									"file path %s; zeroing out page",
blockNum,
-									relpath(smgr->smgr_rnode, forkNum))));
+									relpath(smgr->smgr_rnode, forkNum),
+									relfilepath(smgr->smgr_rnode, forkNum, blockNum))));
MemSet((char *) bufBlock, 0, BLCKSZ);
}
else
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
-							 errmsg("invalid page in block %u of relation %s",
+							 errmsg("invalid page in block %u of relation %s, file path %s",
blockNum,
-									relpath(smgr->smgr_rnode, forkNum))));
+									relpath(smgr->smgr_rnode, forkNum),
+									relfilepath(smgr->smgr_rnode, forkNum, blockNum))));
}
}
}
diff --git a/src/common/relpath.c b/src/common/relpath.c
index ad733d1363..8b39c4ac4f 100644
--- a/src/common/relpath.c
+++ b/src/common/relpath.c
@@ -208,3 +208,30 @@ GetRelationPath(Oid dbNode, Oid spcNode, Oid relNode,
}
return path;
}
+
+/*
+ * GetRelationFilePath - construct path to a relation's physical file
+ * given its block number.
+ */
+	char *
+GetRelationFilePath(Oid dbNode, Oid spcNode, Oid relNode,
+					int backendId, ForkNumber forkNumber, BlockNumber blkno)
+{
+	char	   *path;
+	char	   *fullpath;
+	BlockNumber	segno;
+
+	path = GetRelationPath(dbNode, spcNode, relNode, backendId, forkNumber);
+
+	segno = blkno / ((BlockNumber) RELSEG_SIZE);
+
+	if (segno > 0)
+	{
+		fullpath = psprintf("%s.%u", path, segno);
+		pfree(path);
+	}
+	else
+		fullpath = path;
+
+	return fullpath;
+}

Greetings,

Andres Freund

Hubert Zhang

hzhang@pivotal.io

almost 6 years ago

In reply to: Andres Freund (#2)

Re: Print physical file path when checksum check fails

Thanks Andres,

On Tue, Feb 11, 2020 at 5:30 AM Andres Freund <andres@anarazel.de> wrote:

HHi,

On 2020-02-10 16:04:21 +0800, Hubert Zhang wrote:

Currently we only print block number and relation path when checksum

check

fails. See example below:

ERROR: invalid page in block 333571 of relation base/65959/656195

DBA complains that she needs additional work to calculate which physical
file is broken, since one physical file can only contain `RELSEG_SIZE`
number of blocks. For large tables, we need to use many physical files

with

additional suffix, e.g. 656195.1, 656195.2 ...

Is that a good idea to also print the physical file path in error

message?

Like below:

ERROR: invalid page in block 333571 of relation base/65959/656195, file
path base/65959/656195.2

I think that'd be a nice improvement. But:

I don't think the way you did it is right architecturally. The
segmenting is really something that lives within md.c, and we shouldn't
further expose it outside of that. And e.g. the undo patchset uses files
with different segmentation - but still goes through bufmgr.c.

I wonder if this partially signals that the checksum verification piece
is architecturally done in the wrong place currently? It's imo not good
that every place doing an smgrread() needs to separately verify
checksums. OTOH, it doesn't really belong inside smgr.c.

This layering issue was also encountered in

https://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=3eb77eba5a51780d5cf52cd66a9844cd4d26feb0
so perhaps we should work to reuse the FileTag it introduces to
represent segments, without hardcoding the specific segment size?

I checked the FileTag commit. It calls `register_xxx_segment` inside md.c
to store the sync request into a hashtable and used by checkpointer later.

Checksum verify is simpler. We could move the `PageIsVerified` into md.c
(mdread). But we can not elog error inside md.c because read buffer mode
RBM_ZERO_ON_ERROR is at bugmgr.c level.

One idea is to change save the error message(or the FileTag) at (e.g. a
static string in bufmgr.c) by calling `register_checksum_failure` inside
mdread in md.c.

As for your concern about the need to do checksum verify after every
smgrread, we now move the checksum verify logic into md.c, but we still
need to check the checksum verify result after smgrread and reset buffer to
zero if mode is RBM_ZERO_ON_ERROR.

If this idea is OK, I will submit the new PR.

Thanks

Hubert Zhang

hzhang@pivotal.io

almost 6 years ago

In reply to: Hubert Zhang (#3)

1 attachment(s)

Re: Print physical file path when checksum check fails

On Wed, Feb 12, 2020 at 5:22 PM Hubert Zhang <hzhang@pivotal.io> wrote:

Thanks Andres,

On Tue, Feb 11, 2020 at 5:30 AM Andres Freund <andres@anarazel.de> wrote:

HHi,

On 2020-02-10 16:04:21 +0800, Hubert Zhang wrote:

Currently we only print block number and relation path when checksum

check

fails. See example below:

ERROR: invalid page in block 333571 of relation base/65959/656195

DBA complains that she needs additional work to calculate which physical
file is broken, since one physical file can only contain `RELSEG_SIZE`
number of blocks. For large tables, we need to use many physical files

with

additional suffix, e.g. 656195.1, 656195.2 ...

Is that a good idea to also print the physical file path in error

message?

Like below:

ERROR: invalid page in block 333571 of relation base/65959/656195, file
path base/65959/656195.2

I think that'd be a nice improvement. But:

I don't think the way you did it is right architecturally. The
segmenting is really something that lives within md.c, and we shouldn't
further expose it outside of that. And e.g. the undo patchset uses files
with different segmentation - but still goes through bufmgr.c.

I wonder if this partially signals that the checksum verification piece
is architecturally done in the wrong place currently? It's imo not good
that every place doing an smgrread() needs to separately verify
checksums. OTOH, it doesn't really belong inside smgr.c.

This layering issue was also encountered in

https://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=3eb77eba5a51780d5cf52cd66a9844cd4d26feb0
so perhaps we should work to reuse the FileTag it introduces to
represent segments, without hardcoding the specific segment size?

I checked the FileTag commit. It calls `register_xxx_segment` inside md.c
to store the sync request into a hashtable and used by checkpointer later.

Checksum verify is simpler. We could move the `PageIsVerified` into md.c
(mdread). But we can not elog error inside md.c because read buffer mode
RBM_ZERO_ON_ERROR is at bugmgr.c level.

One idea is to change save the error message(or the FileTag) at (e.g. a
static string in bufmgr.c) by calling `register_checksum_failure` inside
mdread in md.c.

As for your concern about the need to do checksum verify after every
smgrread, we now move the checksum verify logic into md.c, but we still
need to check the checksum verify result after smgrread and reset buffer to
zero if mode is RBM_ZERO_ON_ERROR.

If this idea is OK, I will submit the new PR.

Based on Andres's comments, here is the new patch for moving checksum
verify logic into mdread() instead of call PageIsVerified in every
smgrread(). Also using FileTag to print the physical file name when
checksum verify failed, which handle segmenting inside md.c as well.

--
Thanks

Hubert Zhang

Attachments:

0002-Print-physical-file-path-when-verify-checksum-failed.patchapplication/octet-stream; name=0002-Print-physical-file-path-when-verify-checksum-failed.patchDownload

From 6930c51691e9f612c8167d567543f735fc193eb7 Mon Sep 17 00:00:00 2001
From: Hubert Zhang <hzhang@pivotal.io>
Date: Mon, 17 Feb 2020 09:16:44 +0000
Subject: [PATCH] Print physical file path when verify checksum failed

Move the verify checksum logic into md.c to avoid every smgrread()
needs to separately verify checksums.
This also enables printing physical file path when verify checksum
failed, since file segmenting is also handled in md.c.
---
 src/backend/catalog/storage.c       | 14 +++++--------
 src/backend/storage/buffer/bufmgr.c | 25 ++++++----------------
 src/backend/storage/page/bufpage.c  | 41 +++++++++++++++++++++++++++++++++++--
 src/backend/storage/smgr/md.c       | 17 +++++++++++++++
 src/common/relpath.c                | 24 ++++++++++++++++++++++
 src/include/common/relpath.h        | 21 +++++++++++++++++++
 src/include/storage/bufpage.h       |  7 ++++---
 src/include/storage/sync.h          |  1 +
 8 files changed, 117 insertions(+), 33 deletions(-)

diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index fddfbf1d8c..f845e696ce 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -367,17 +367,13 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
 		/* If we got a cancel signal during the copy of the data, quit */
 		CHECK_FOR_INTERRUPTS();
 
+		/*
+		 * Disable zero damaged page in checksum veirfy
+		 */
+		SetZeroDamagedPageInChecksum(false);
+		
 		smgrread(src, forkNum, blkno, buf.data);
 
-		if (!PageIsVerified(page, blkno))
-			ereport(ERROR,
-					(errcode(ERRCODE_DATA_CORRUPTED),
-					 errmsg("invalid page in block %u of relation %s",
-							blkno,
-							relpathbackend(src->smgr_rnode.node,
-										   src->smgr_rnode.backend,
-										   forkNum))));
-
 		/*
 		 * WAL-log the copied page. Unfortunately we don't know what kind of a
 		 * page this is, so we have to log the full page including any unused
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 5880054245..6d84a9950a 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -895,6 +895,12 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			if (track_io_timing)
 				INSTR_TIME_SET_CURRENT(io_start);
 
+			/*
+			 * Set whether zero damaged page in checksum veirfy
+			 */
+			if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
+				SetZeroDamagedPageInChecksum(true);
+
 			smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
 
 			if (track_io_timing)
@@ -905,25 +911,6 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 				INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
 			}
 
-			/* check for garbage data */
-			if (!PageIsVerified((Page) bufBlock, blockNum))
-			{
-				if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
-				{
-					ereport(WARNING,
-							(errcode(ERRCODE_DATA_CORRUPTED),
-							 errmsg("invalid page in block %u of relation %s; zeroing out page",
-									blockNum,
-									relpath(smgr->smgr_rnode, forkNum))));
-					MemSet((char *) bufBlock, 0, BLCKSZ);
-				}
-				else
-					ereport(ERROR,
-							(errcode(ERRCODE_DATA_CORRUPTED),
-							 errmsg("invalid page in block %u of relation %s",
-									blockNum,
-									relpath(smgr->smgr_rnode, forkNum))));
-			}
 		}
 	}
 
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 4ea6ea7a3d..e1abfea82a 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -26,6 +26,8 @@
 /* GUC variable */
 bool		ignore_checksum_failure = false;
 
+/* whether to zero damaged page when checksum verify failed */
+static bool zero_damaged_page_in_checksum = false;
 
 /* ----------------------------------------------------------------
  *						Page support functions
@@ -61,13 +63,17 @@ PageInit(Page page, Size pageSize, Size specialSize)
 
 
 /*
- * PageIsVerified
+ * VerifyPage
  *		Check that the page header and checksum (if any) appear valid.
  *
  * This is called when a page has just been read in from disk.  The idea is
  * to cheaply detect trashed pages before we go nuts following bogus line
  * pointers, testing invalid transaction identifiers, etc.
  *
+ * FileTag contains the segment file information which the page belongs to.
+ * Note that for large heap table, it may across many physical files. Better
+ * to print the exact file path when checksum fails.
+ *
  * It turns out to be necessary to allow zeroed pages here too.  Even though
  * this routine is *not* called when deliberately adding a page to a relation,
  * there are scenarios in which a zeroed page might be found in a table.
@@ -79,7 +85,7 @@ PageInit(Page page, Size pageSize, Size specialSize)
  * will clean up such a page and make it usable.
  */
 bool
-PageIsVerified(Page page, BlockNumber blkno)
+VerifyPage(const FileTag *ftag, Page page, BlockNumber blkno)
 {
 	PageHeader	p = (PageHeader) page;
 	size_t	   *pagebytes;
@@ -151,6 +157,28 @@ PageIsVerified(Page page, BlockNumber blkno)
 			return true;
 	}
 
+	/*
+	 * Throw out ERROR or WARNING based on whether zero_damaged_pages_in_checksum is set
+	 *
+	 * Note that using GUC zero_damaged_pages is not suffient here, since we should also
+	 * zero damaged page when ReadBufferMode is RBM_ZERO_ON_ERROR.
+	 */
+	if (zero_damaged_page_in_checksum)
+	{
+		ereport(WARNING,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("invalid page in block %u of relation file %s; zeroing out page",
+						blkno,
+						relfilepathbackend(ftag->rnode, ftag->backend, ftag->forknum, ftag->segno))));
+		MemSet((char *) page, 0, BLCKSZ);
+	}
+	else
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("invalid page in block %u of relation file %s",
+						blkno,
+						relfilepathbackend(ftag->rnode, ftag->backend, ftag->forknum, ftag->segno))));
+
 	return false;
 }
 
@@ -1196,3 +1224,12 @@ PageSetChecksumInplace(Page page, BlockNumber blkno)
 
 	((PageHeader) page)->pd_checksum = pg_checksum_page((char *) page, blkno);
 }
+
+/*
+ * Zero damage page when vefirying checksum failed
+ */
+void
+SetZeroDamagedPageInChecksum(bool isZeroDamagedPage)
+{
+	zero_damaged_page_in_checksum = isZeroDamagedPage;
+}
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index c5b771c531..2a4b8e5b42 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -94,6 +94,17 @@ static MemoryContext MdCxt;		/* context for all MdfdVec objects */
 	memset(&(a), 0, sizeof(FileTag)), \
 	(a).handler = SYNC_HANDLER_MD, \
 	(a).rnode = (xx_rnode), \
+	(a).backend = InvalidBackendId, \
+	(a).forknum = (xx_forknum), \
+	(a).segno = (xx_segno) \
+)
+
+#define INIT_MD_FILETAG_WITH_BACKEND(a,xx_rnode,xx_backend,xx_forknum,xx_segno) \
+( \
+	memset(&(a), 0, sizeof(FileTag)), \
+	(a).handler = SYNC_HANDLER_MD, \
+	(a).rnode = (xx_rnode), \
+	(a).backend = (xx_backend), \
 	(a).forknum = (xx_forknum), \
 	(a).segno = (xx_segno) \
 )
@@ -604,6 +615,7 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	off_t		seekpos;
 	int			nbytes;
 	MdfdVec    *v;
+	FileTag		tag;
 
 	TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
 										reln->smgr_rnode.node.spcNode,
@@ -653,6 +665,11 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 							blocknum, FilePathName(v->mdfd_vfd),
 							nbytes, BLCKSZ)));
 	}
+
+	/* verify the read page content satisfy page checksum */
+	INIT_MD_FILETAG_WITH_BACKEND(tag, reln->smgr_rnode.node, reln->smgr_rnode.backend, forknum, v->mdfd_segno);
+
+	VerifyPage(&tag, buffer, blocknum);
 }
 
 /*
diff --git a/src/common/relpath.c b/src/common/relpath.c
index ad733d1363..eae1d25fe4 100644
--- a/src/common/relpath.c
+++ b/src/common/relpath.c
@@ -208,3 +208,27 @@ GetRelationPath(Oid dbNode, Oid spcNode, Oid relNode,
 	}
 	return path;
 }
+
+/*
+ * GetRelationFilePath - construct path to a relation's physical file
+ * given its block number.
+ */
+char *
+GetRelationFilePath(Oid dbNode, Oid spcNode, Oid relNode,
+					int backendId, ForkNumber forkNumber, uint32 segno)
+{
+	char       *path;
+	char       *fullpath;
+
+	path = GetRelationPath(dbNode, spcNode, relNode, backendId, forkNumber);
+
+	if (segno > 0)
+	{
+		fullpath = psprintf("%s.%u", path, segno);
+		pfree(path);
+	}
+	else
+		fullpath = path;
+
+	return fullpath;
+}
diff --git a/src/include/common/relpath.h b/src/include/common/relpath.h
index 869cabcc0d..5db63c5197 100644
--- a/src/include/common/relpath.h
+++ b/src/include/common/relpath.h
@@ -13,6 +13,8 @@
 #ifndef RELPATH_H
 #define RELPATH_H
 
+#include "storage/block.h"
+
 /*
  *	'pgrminclude ignore' needed here because CppAsString2() does not throw
  *	an error if the symbol is not defined.
@@ -68,6 +70,8 @@ extern char *GetDatabasePath(Oid dbNode, Oid spcNode);
 
 extern char *GetRelationPath(Oid dbNode, Oid spcNode, Oid relNode,
 							 int backendId, ForkNumber forkNumber);
+extern char *GetRelationFilePath(Oid dbNode, Oid spcNode, Oid relNode,
+								 int backendId, ForkNumber forkNumber, BlockNumber blkno);
 
 /*
  * Wrapper macros for GetRelationPath.  Beware of multiple
@@ -87,4 +91,21 @@ extern char *GetRelationPath(Oid dbNode, Oid spcNode, Oid relNode,
 #define relpath(rnode, forknum) \
 	relpathbackend((rnode).node, (rnode).backend, forknum)
 
+
+/*
+ * File path of a relation given the block number.
+ * Note that a file can contain at most RELSEG_SIZE number
+ * of blocks. We supply relfilepath macro to display the
+ * physical file name for a relation given the block number.
+ */
+
+/* First argument is a RelFileNode */
+#define relfilepathbackend(rnode, backend, forknum, segno) \
+	GetRelationFilePath((rnode).dbNode, (rnode).spcNode, (rnode).relNode, \
+						backend, forknum, segno)
+
+/* First argument is a RelFileNodeBackend */
+#define relfilepath(rnode, forknum, segno) \
+	relfilepathbackend((rnode).node, (rnode).backend, forknum, segno)
+
 #endif							/* RELPATH_H */
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index 3f88683a05..b88e15b76e 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -18,6 +18,7 @@
 #include "storage/block.h"
 #include "storage/item.h"
 #include "storage/off.h"
+#include "storage/sync.h"		/* For FileTag */
 
 /*
  * A postgres disk page is an abstraction layered on top of a postgres
@@ -419,7 +420,7 @@ do { \
 						((is_heap) ? PAI_IS_HEAP : 0))
 
 /*
- * Check that BLCKSZ is a multiple of sizeof(size_t).  In PageIsVerified(),
+ * Check that BLCKSZ is a multiple of sizeof(size_t).  In VerifyPage(),
  * it is much faster to check if a page is full of zeroes using the native
  * word size.  Note that this assertion is kept within a header to make
  * sure that StaticAssertDecl() works across various combinations of
@@ -429,7 +430,7 @@ StaticAssertDecl(BLCKSZ == ((BLCKSZ / sizeof(size_t)) * sizeof(size_t)),
 				 "BLCKSZ has to be a multiple of sizeof(size_t)");
 
 extern void PageInit(Page page, Size pageSize, Size specialSize);
-extern bool PageIsVerified(Page page, BlockNumber blkno);
+extern bool VerifyPage(const FileTag *ftag, Page page, BlockNumber blkno);
 extern OffsetNumber PageAddItemExtended(Page page, Item item, Size size,
 										OffsetNumber offsetNumber, int flags);
 extern Page PageGetTempPage(Page page);
@@ -448,5 +449,5 @@ extern bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
 									Item newtup, Size newsize);
 extern char *PageSetChecksumCopy(Page page, BlockNumber blkno);
 extern void PageSetChecksumInplace(Page page, BlockNumber blkno);
-
+extern void SetZeroDamagedPageInChecksum(bool isZeroDamagedPage);
 #endif							/* BUFPAGE_H */
diff --git a/src/include/storage/sync.h b/src/include/storage/sync.h
index e16ab8e711..2c34579eec 100644
--- a/src/include/storage/sync.h
+++ b/src/include/storage/sync.h
@@ -47,6 +47,7 @@ typedef struct FileTag
 	int16		handler;		/* SyncRequestHandler value, saving space */
 	int16		forknum;		/* ForkNumber, saving space */
 	RelFileNode rnode;
+	BackendId	backend;
 	uint32		segno;
 } FileTag;
 
-- 
2.16.6

Kyotaro Horiguchi

horikyota.ntt@gmail.com

almost 6 years ago

In reply to: Hubert Zhang (#4)

Re: Print physical file path when checksum check fails

Hello. Thank you for the new patch.

At Tue, 18 Feb 2020 09:27:39 +0800, Hubert Zhang <hzhang@pivotal.io> wrote in

On Wed, Feb 12, 2020 at 5:22 PM Hubert Zhang <hzhang@pivotal.io> wrote:

Thanks Andres,

On Tue, Feb 11, 2020 at 5:30 AM Andres Freund <andres@anarazel.de> wrote:

HHi,

On 2020-02-10 16:04:21 +0800, Hubert Zhang wrote:

Currently we only print block number and relation path when checksum

check

fails. See example below:

ERROR: invalid page in block 333571 of relation base/65959/656195

DBA complains that she needs additional work to calculate which physical
file is broken, since one physical file can only contain `RELSEG_SIZE`
number of blocks. For large tables, we need to use many physical files

with

additional suffix, e.g. 656195.1, 656195.2 ...

Is that a good idea to also print the physical file path in error

message?

Like below:

ERROR: invalid page in block 333571 of relation base/65959/656195, file
path base/65959/656195.2

I think that'd be a nice improvement. But:

I don't think the way you did it is right architecturally. The
segmenting is really something that lives within md.c, and we shouldn't
further expose it outside of that. And e.g. the undo patchset uses files
with different segmentation - but still goes through bufmgr.c.

I wonder if this partially signals that the checksum verification piece
is architecturally done in the wrong place currently? It's imo not good
that every place doing an smgrread() needs to separately verify
checksums. OTOH, it doesn't really belong inside smgr.c.

This layering issue was also encountered in

https://git.postgresql.org/gitweb/?p=postgresql.git;a=commit;h=3eb77eba5a51780d5cf52cd66a9844cd4d26feb0
so perhaps we should work to reuse the FileTag it introduces to
represent segments, without hardcoding the specific segment size?

I checked the FileTag commit. It calls `register_xxx_segment` inside md.c
to store the sync request into a hashtable and used by checkpointer later.

Checksum verify is simpler. We could move the `PageIsVerified` into md.c
(mdread). But we can not elog error inside md.c because read buffer mode
RBM_ZERO_ON_ERROR is at bugmgr.c level.

One idea is to change save the error message(or the FileTag) at (e.g. a
static string in bufmgr.c) by calling `register_checksum_failure` inside
mdread in md.c.

As for your concern about the need to do checksum verify after every
smgrread, we now move the checksum verify logic into md.c, but we still
need to check the checksum verify result after smgrread and reset buffer to
zero if mode is RBM_ZERO_ON_ERROR.

If this idea is OK, I will submit the new PR.

Based on Andres's comments, here is the new patch for moving checksum
verify logic into mdread() instead of call PageIsVerified in every
smgrread(). Also using FileTag to print the physical file name when
checksum verify failed, which handle segmenting inside md.c as well.

The patch doesn't address the first comment from Andres. It still
expose the notion of segment to the upper layer, but bufmgr (bufmgr.c
and bufpage.c) or Realation (relpath.c) layer shouldn't know of
segment. So the two layers should ask smgr without using segment
number for the real file name for the block.

For example, I think the following structure works. (Without moving
checksum verification.)

======
md.c:
char *mdfname(SmgrRelation reln, Forknumber forkNum, BlockNumber blocknum);
smgr.c:
char *smgrfname(SMgrRelation reln, ForkNumber forkNum, BlockNumber Blocknum);

bufmgr.c:
ReadBuffer_common()
{
..
/* check for garbage data */
if (!PageIsVerified((Page) bufBlock, blockNum))
if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
ereport(WARNING,
errmsg("invalid page in block %u in file %s; zeroing out page",
blockNum,
smgrfname(smgr, forkNum, blockNum))));
MemSet((char *) bufBlock, 0, BLCKSZ);
====

However, the block number in the error messages looks odd as it is
actually not the block number in the segment. We could convert
BlockNum into relative block number or offset in the file but it would
be overkill. So something like this works?

"invalid page in block %u in relation %u, file \"%s\"",
blockNum, smgr->smgr_rnode.node.relNode, smgrfname()

If we also verify checksum in md layer, callback is overkill since the
immediate caller consumes the event immediately. We can signal the
error by somehow returning a file tag.

======
md.c:
FileTag *mdread(...) /* or void mdread(..., FileTag*)? */
{
if (!VerfyPage())
return ftag;
....
return NULL;
}
char *mdfname(FileTag *ftag);
smgr.c:
FileTag *smgrread(...);
char *smgrfname(FileTag *ftag);

bufmgr.c:
ReadBuffer_common()
{
FileTag *errtag;
..
errftag = smgrread();

if (errtag))
/* page verification failed */
if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
ereport(WARNING,
errmsg("invalid page in block %u in file %s; zeroing out page",
blockNum, smgrfname(errftag)));
MemSet((char *) bufBlock, 0, BLCKSZ);
====

But it is uneasy that smgrread just returning a filetag signals
checksum error..

regards.

--
Kyotaro Horiguchi
NTT Open Source Software Center

Michael Paquier

michael@paquier.xyz

almost 6 years ago

In reply to: Kyotaro Horiguchi (#5)

Re: Print physical file path when checksum check fails

On Wed, Feb 19, 2020 at 01:07:36PM +0900, Kyotaro Horiguchi wrote:

If we also verify checksum in md layer, callback is overkill since the
immediate caller consumes the event immediately. We can signal the
error by somehow returning a file tag.

FWIW, I am wondering if there is any need for a change here and
complicate more the code. If you know the block number, the page size
and the segment file size you can immediately guess where is the
damaged block. The first information is already part of the error
message, and the two other ones are constants defined at
compile-time.
--
Michael

Kyotaro Horiguchi

horikyota.ntt@gmail.com

almost 6 years ago

In reply to: Michael Paquier (#6)

Re: Print physical file path when checksum check fails

At Wed, 19 Feb 2020 13:28:04 +0900, Michael Paquier <michael@paquier.xyz> wrote in

On Wed, Feb 19, 2020 at 01:07:36PM +0900, Kyotaro Horiguchi wrote:

If we also verify checksum in md layer, callback is overkill since the
immediate caller consumes the event immediately. We can signal the
error by somehow returning a file tag.

FWIW, I am wondering if there is any need for a change here and
complicate more the code. If you know the block number, the page size
and the segment file size you can immediately guess where is the
damaged block. The first information is already part of the error

I have had support requests related to broken block several times, and
(I think) most of *them* had hard time to locate the broken block or
even broken file. I don't think it is useles at all, but I'm not sure
it is worth the additional complexity.

damaged block. The first information is already part of the error
message, and the two other ones are constants defined at
compile-time.

May you have misread the snippet?

What Hubert proposed is:

"invalid page in block %u of relation file %s; zeroing out page",
blkno, <filename>

The second format in my messages just before is:
"invalid page in block %u in relation %u, file \"%s\"",
blockNum, smgr->smgr_rnode.node.relNode, smgrfname()

All of them are not compile-time constant at all.

regards.

--
Kyotaro Horiguchi
NTT Open Source Software Center

Michael Paquier

michael@paquier.xyz

almost 6 years ago

In reply to: Kyotaro Horiguchi (#7)

Re: Print physical file path when checksum check fails

On Wed, Feb 19, 2020 at 03:00:54PM +0900, Kyotaro Horiguchi wrote:

I have had support requests related to broken block several times, and
(I think) most of *them* had hard time to locate the broken block or
even broken file. I don't think it is useles at all, but I'm not sure
it is worth the additional complexity.

I handle stuff like that from time to time, and any reports usually
go down to people knowledgeable about PostgreSQL enough to know the
difference. My point is that in order to know where a broken block is
physically located on disk, you need to know four things:
- The block number.
- The physical location of the relation.
- The size of the block.
- The length of a file segment.
The first two items are printed in the error message, and you can
guess easily the actual location (file, offset) with the two others.

I am not necessarily against improving the error message here, but
FWIW I think that we need to consider seriously if the code
complications and the maintenance cost involved are really worth
saving from one simple calculation. Particularly, quickly reading
through the patch, I am rather unhappy about the shape of the second
patch which pushes down the segment number knowledge into relpath.c,
and creates more complication around the handling of
zero_damaged_pages and zero'ed pages.
--
Michael

Andres Freund

andres@anarazel.de

almost 6 years ago

In reply to: Michael Paquier (#8)

Re: Print physical file path when checksum check fails

Hi,

On 2020-02-19 16:48:45 +0900, Michael Paquier wrote:

On Wed, Feb 19, 2020 at 03:00:54PM +0900, Kyotaro Horiguchi wrote:

I have had support requests related to broken block several times, and
(I think) most of *them* had hard time to locate the broken block or
even broken file. I don't think it is useles at all, but I'm not sure
it is worth the additional complexity.

I handle stuff like that from time to time, and any reports usually
go down to people knowledgeable about PostgreSQL enough to know the
difference. My point is that in order to know where a broken block is
physically located on disk, you need to know four things:
- The block number.
- The physical location of the relation.
- The size of the block.
- The length of a file segment.
The first two items are printed in the error message, and you can
guess easily the actual location (file, offset) with the two others.

I am not necessarily against improving the error message here, but
FWIW I think that we need to consider seriously if the code
complications and the maintenance cost involved are really worth
saving from one simple calculation.

I don't think it's that simple for most.

And if we e.g. ever get the undo stuff merged, it'd get more
complicated, because they segment entirely differently. Similar, if we
ever manage to move SLRUs into the buffer pool and checksummed, it'd
again work differently.

Nor is it architecturally appealing to handle checksums in multiple
places above the smgr layer: For one, that requires multiple places to
compute verify them. But also, as the way checksums are computed depends
on the page format etc, it'll likely change for things like undo/slru -
which then again will require additional smarts if done above the smgr
layer.

Particularly, quickly reading through the patch, I am rather unhappy
about the shape of the second patch which pushes down the segment
number knowledge into relpath.c, and creates more complication around
the handling of zero_damaged_pages and zero'ed pages. -- Michael

I do not like the SetZeroDamagedPageInChecksum stuff at all however.

Greetings,

Andres Freund

#10

Hubert Zhang

hzhang@pivotal.io

almost 6 years ago

In reply to: Andres Freund (#9)

Re: Print physical file path when checksum check fails

Thanks,

On Thu, Feb 20, 2020 at 11:36 AM Andres Freund <andres@anarazel.de> wrote:

Hi,

On 2020-02-19 16:48:45 +0900, Michael Paquier wrote:

On Wed, Feb 19, 2020 at 03:00:54PM +0900, Kyotaro Horiguchi wrote:

I have had support requests related to broken block several times, and
(I think) most of *them* had hard time to locate the broken block or
even broken file. I don't think it is useles at all, but I'm not sure
it is worth the additional complexity.

I handle stuff like that from time to time, and any reports usually
go down to people knowledgeable about PostgreSQL enough to know the
difference. My point is that in order to know where a broken block is
physically located on disk, you need to know four things:
- The block number.
- The physical location of the relation.
- The size of the block.
- The length of a file segment.
The first two items are printed in the error message, and you can
guess easily the actual location (file, offset) with the two others.

I am not necessarily against improving the error message here, but
FWIW I think that we need to consider seriously if the code
complications and the maintenance cost involved are really worth
saving from one simple calculation.

I don't think it's that simple for most.

And if we e.g. ever get the undo stuff merged, it'd get more
complicated, because they segment entirely differently. Similar, if we
ever manage to move SLRUs into the buffer pool and checksummed, it'd
again work differently.

Nor is it architecturally appealing to handle checksums in multiple
places above the smgr layer: For one, that requires multiple places to
compute verify them. But also, as the way checksums are computed depends
on the page format etc, it'll likely change for things like undo/slru -
which then again will require additional smarts if done above the smgr
layer.

So considering undo staff, it's better to move checksum logic into md.c
I will keep it in the new patch.

On 2020-02-19 16:48:45 +0900, Michael Paquier wrote:

Particularly, quickly reading through the patch, I am rather unhappy

about the shape of the second patch which pushes down the segment
number knowledge into relpath.c, and creates more complication around
the handling of zero_damaged_pages and zero'ed pages. -- Michael

I do not like the SetZeroDamagedPageInChecksum stuff at all however.

I'm +1 on the first concern, and I will delete the new added function
`GetRelationFilePath`
in relpath.c and append the segno directly in error message inside function
`VerifyPage`

As for SetZeroDamagedPageInChecksum, the reason why I introduced it is that
when we are doing
smgrread() and one of the damaged page failed to pass the checksum check,
we could not directly error
out, since the caller of smgrread() may tolerate this error and just zero
all the damaged page plus a warning message.
Also, we could not just use GUC zero_damaged_pages to do this branch, since
we also have ReadBufferMode(i.e. RBM_ZERO_ON_ERROR) to control it.

To get rid of SetZeroDamagedPageInChecksum, one idea is to pass
zero_damaged_page flag into smgrread(), something like below:
==

extern void smgrread(SMgrRelation reln, ForkNumber forknum,

BlockNumber blocknum, char *buffer, int flag);

===

Any comments?

--
Thanks

Hubert Zhang

#11

Hubert Zhang

hzhang@pivotal.io

almost 6 years ago

In reply to: Kyotaro Horiguchi (#7)

Re: Print physical file path when checksum check fails

Thanks Kyotaro,

On Wed, Feb 19, 2020 at 2:02 PM Kyotaro Horiguchi <horikyota.ntt@gmail.com>
wrote:

At Wed, 19 Feb 2020 13:28:04 +0900, Michael Paquier <michael@paquier.xyz>
wrote in

On Wed, Feb 19, 2020 at 01:07:36PM +0900, Kyotaro Horiguchi wrote:

If we also verify checksum in md layer, callback is overkill since the
immediate caller consumes the event immediately. We can signal the
error by somehow returning a file tag.

FWIW, I am wondering if there is any need for a change here and
complicate more the code. If you know the block number, the page size
and the segment file size you can immediately guess where is the
damaged block. The first information is already part of the error

I have had support requests related to broken block several times, and
(I think) most of *them* had hard time to locate the broken block or
even broken file. I don't think it is useles at all, but I'm not sure
it is worth the additional complexity.

damaged block. The first information is already part of the error
message, and the two other ones are constants defined at
compile-time.

May you have misread the snippet?

What Hubert proposed is:

"invalid page in block %u of relation file %s; zeroing out page",
blkno, <filename>

The second format in my messages just before is:
"invalid page in block %u in relation %u, file \"%s\"",
blockNum, smgr->smgr_rnode.node.relNode, smgrfname()

All of them are not compile-time constant at all.

I like your error message, the block number is relation level not file
level.
I 'll change the error message to
"invalid page in block %u of relation %u, file %s"

--
Thanks

Hubert Zhang

#12

Hubert Zhang

hzhang@pivotal.io

almost 6 years ago

In reply to: Hubert Zhang (#10)

1 attachment(s)

Re: Print physical file path when checksum check fails

I have updated the patch based on the previous comments. Sorry for the late
patch.

I removed `SetZeroDamagedPageInChecksum` and add `zeroDamagePage` flag in
smgrread to determine whether we should zero damage page when an error
happens. It depends on the caller.

`GetRelationFilePath` is removed as well. We print segno on the fly.

On Thu, Feb 20, 2020 at 2:33 PM Hubert Zhang <hzhang@pivotal.io> wrote:

Thanks,

On Thu, Feb 20, 2020 at 11:36 AM Andres Freund <andres@anarazel.de> wrote:

Hi,

On 2020-02-19 16:48:45 +0900, Michael Paquier wrote:

On Wed, Feb 19, 2020 at 03:00:54PM +0900, Kyotaro Horiguchi wrote:

I have had support requests related to broken block several times, and
(I think) most of *them* had hard time to locate the broken block or
even broken file. I don't think it is useles at all, but I'm not sure
it is worth the additional complexity.

I handle stuff like that from time to time, and any reports usually
go down to people knowledgeable about PostgreSQL enough to know the
difference. My point is that in order to know where a broken block is
physically located on disk, you need to know four things:
- The block number.
- The physical location of the relation.
- The size of the block.
- The length of a file segment.
The first two items are printed in the error message, and you can
guess easily the actual location (file, offset) with the two others.

I am not necessarily against improving the error message here, but
FWIW I think that we need to consider seriously if the code
complications and the maintenance cost involved are really worth
saving from one simple calculation.

I don't think it's that simple for most.

And if we e.g. ever get the undo stuff merged, it'd get more
complicated, because they segment entirely differently. Similar, if we
ever manage to move SLRUs into the buffer pool and checksummed, it'd
again work differently.

Nor is it architecturally appealing to handle checksums in multiple
places above the smgr layer: For one, that requires multiple places to
compute verify them. But also, as the way checksums are computed depends
on the page format etc, it'll likely change for things like undo/slru -
which then again will require additional smarts if done above the smgr
layer.

So considering undo staff, it's better to move checksum logic into md.c
I will keep it in the new patch.

On 2020-02-19 16:48:45 +0900, Michael Paquier wrote:

Particularly, quickly reading through the patch, I am rather unhappy

about the shape of the second patch which pushes down the segment
number knowledge into relpath.c, and creates more complication around
the handling of zero_damaged_pages and zero'ed pages. -- Michael

I do not like the SetZeroDamagedPageInChecksum stuff at all however.

I'm +1 on the first concern, and I will delete the new added function
`GetRelationFilePath`
in relpath.c and append the segno directly in error message inside
function `VerifyPage`

As for SetZeroDamagedPageInChecksum, the reason why I introduced it is
that when we are doing
smgrread() and one of the damaged page failed to pass the checksum check,
we could not directly error
out, since the caller of smgrread() may tolerate this error and just zero
all the damaged page plus a warning message.
Also, we could not just use GUC zero_damaged_pages to do this branch,
since we also have ReadBufferMode(i.e. RBM_ZERO_ON_ERROR) to control it.

To get rid of SetZeroDamagedPageInChecksum, one idea is to pass
zero_damaged_page flag into smgrread(), something like below:
==

extern void smgrread(SMgrRelation reln, ForkNumber forknum,

BlockNumber blocknum, char *buffer, int flag);

===

Any comments?

--
Thanks

Hubert Zhang

--
Thanks

Hubert Zhang

Attachments:

0003-Print-physical-file-path-when-verify-checksum-failed.patchapplication/octet-stream; name=0003-Print-physical-file-path-when-verify-checksum-failed.patchDownload

From 0e27920a7a7eabecdc2eb8d578e45964f5a77b51 Mon Sep 17 00:00:00 2001
From: Hubert Zhang <hzhang@pivotal.io>
Date: Mon, 17 Feb 2020 09:16:44 +0000
Subject: [PATCH 1/1] Print physical file path when verify checksum failed

Move the verify checksum logic into md.c to avoid every smgrread()
needs to separately verify checksums.
This also enables printing physical file path(including segno) when verify checksum
failed. It should not handle outside md.c, since undo and redo may have different
file segmentation.
---
 contrib/pg_prewarm/pg_prewarm.c     |  2 +-
 src/backend/catalog/storage.c       | 11 +----------
 src/backend/storage/buffer/bufmgr.c | 28 ++++++++--------------------
 src/backend/storage/page/bufpage.c  | 33 ++++++++++++++++++++++++++++++---
 src/backend/storage/smgr/md.c       | 23 ++++++++++++++++++++---
 src/backend/storage/smgr/smgr.c     |  6 +++---
 src/include/storage/bufpage.h       |  6 +++---
 src/include/storage/md.h            |  2 +-
 src/include/storage/smgr.h          |  2 +-
 src/include/storage/sync.h          |  1 +
 10 files changed, 69 insertions(+), 45 deletions(-)

diff --git a/contrib/pg_prewarm/pg_prewarm.c b/contrib/pg_prewarm/pg_prewarm.c
index 33e2d28b27..93059ef581 100644
--- a/contrib/pg_prewarm/pg_prewarm.c
+++ b/contrib/pg_prewarm/pg_prewarm.c
@@ -178,7 +178,7 @@ pg_prewarm(PG_FUNCTION_ARGS)
 		for (block = first_block; block <= last_block; ++block)
 		{
 			CHECK_FOR_INTERRUPTS();
-			smgrread(rel->rd_smgr, forkNumber, block, blockbuffer.data);
+			smgrread(rel->rd_smgr, forkNumber, block, blockbuffer.data, false);
 			++blocks_done;
 		}
 	}
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index fddfbf1d8c..8cb482cf1e 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -367,16 +367,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
 		/* If we got a cancel signal during the copy of the data, quit */
 		CHECK_FOR_INTERRUPTS();
 
-		smgrread(src, forkNum, blkno, buf.data);
-
-		if (!PageIsVerified(page, blkno))
-			ereport(ERROR,
-					(errcode(ERRCODE_DATA_CORRUPTED),
-					 errmsg("invalid page in block %u of relation %s",
-							blkno,
-							relpathbackend(src->smgr_rnode.node,
-										   src->smgr_rnode.backend,
-										   forkNum))));
+		smgrread(src, forkNum, blkno, buf.data, false);
 
 		/*
 		 * WAL-log the copied page. Unfortunately we don't know what kind of a
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 5880054245..12bbf9899c 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -891,11 +891,18 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 		{
 			instr_time	io_start,
 						io_time;
+			bool	zeroDamagePage = false;
 
 			if (track_io_timing)
 				INSTR_TIME_SET_CURRENT(io_start);
 
-			smgrread(smgr, forkNum, blockNum, (char *) bufBlock);
+			/*
+			 * Set whether zero damaged page in checksum veirfy
+			 */
+			if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
+				zeroDamagePage = true;
+
+			smgrread(smgr, forkNum, blockNum, (char *) bufBlock, zeroDamagePage);
 
 			if (track_io_timing)
 			{
@@ -905,25 +912,6 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 				INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
 			}
 
-			/* check for garbage data */
-			if (!PageIsVerified((Page) bufBlock, blockNum))
-			{
-				if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
-				{
-					ereport(WARNING,
-							(errcode(ERRCODE_DATA_CORRUPTED),
-							 errmsg("invalid page in block %u of relation %s; zeroing out page",
-									blockNum,
-									relpath(smgr->smgr_rnode, forkNum))));
-					MemSet((char *) bufBlock, 0, BLCKSZ);
-				}
-				else
-					ereport(ERROR,
-							(errcode(ERRCODE_DATA_CORRUPTED),
-							 errmsg("invalid page in block %u of relation %s",
-									blockNum,
-									relpath(smgr->smgr_rnode, forkNum))));
-			}
 		}
 	}
 
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 4ea6ea7a3d..0b1d4fb39c 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -26,7 +26,6 @@
 /* GUC variable */
 bool		ignore_checksum_failure = false;
 
-
 /* ----------------------------------------------------------------
  *						Page support functions
  * ----------------------------------------------------------------
@@ -61,13 +60,17 @@ PageInit(Page page, Size pageSize, Size specialSize)
 
 
 /*
- * PageIsVerified
+ * VerifyPage
  *		Check that the page header and checksum (if any) appear valid.
  *
  * This is called when a page has just been read in from disk.  The idea is
  * to cheaply detect trashed pages before we go nuts following bogus line
  * pointers, testing invalid transaction identifiers, etc.
  *
+ * FileTag contains the segment file information which the page belongs to.
+ * Note that for large heap table, it may across many physical files. Better
+ * to print the exact file path when checksum fails.
+ *
  * It turns out to be necessary to allow zeroed pages here too.  Even though
  * this routine is *not* called when deliberately adding a page to a relation,
  * there are scenarios in which a zeroed page might be found in a table.
@@ -79,7 +82,7 @@ PageInit(Page page, Size pageSize, Size specialSize)
  * will clean up such a page and make it usable.
  */
 bool
-PageIsVerified(Page page, BlockNumber blkno)
+VerifyPage(const FileTag *ftag, Page page, BlockNumber blkno, bool zeroDamagePage)
 {
 	PageHeader	p = (PageHeader) page;
 	size_t	   *pagebytes;
@@ -151,6 +154,30 @@ PageIsVerified(Page page, BlockNumber blkno)
 			return true;
 	}
 
+	/*
+	 * Throw out ERROR or WARNING based on whether zero_damaged_pages_in_checksum is set
+	 *
+	 * Note that using GUC zero_damaged_pages is not suffient here, since we should also
+	 * zero damaged page when ReadBufferMode is RBM_ZERO_ON_ERROR.
+	 */
+	if (zeroDamagePage)
+	{
+		ereport(WARNING,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("invalid page in block %u of relation file %s.%u; zeroing out page",
+						blkno,
+						relpathbackend(ftag->rnode, ftag->backend, ftag->forknum),
+						ftag->segno)));
+		MemSet((char *) page, 0, BLCKSZ);
+	}
+	else
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("invalid page in block %u of relation file %s.%u",
+						blkno,
+						relpathbackend(ftag->rnode, ftag->backend, ftag->forknum),
+						ftag->segno)));
+
 	return false;
 }
 
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index c5b771c531..242b9f846c 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -94,6 +94,17 @@ static MemoryContext MdCxt;		/* context for all MdfdVec objects */
 	memset(&(a), 0, sizeof(FileTag)), \
 	(a).handler = SYNC_HANDLER_MD, \
 	(a).rnode = (xx_rnode), \
+	(a).backend = InvalidBackendId, \
+	(a).forknum = (xx_forknum), \
+	(a).segno = (xx_segno) \
+)
+
+#define INIT_MD_FILETAG_WITH_BACKEND(a,xx_rnode,xx_backend,xx_forknum,xx_segno) \
+( \
+	memset(&(a), 0, sizeof(FileTag)), \
+	(a).handler = SYNC_HANDLER_MD, \
+	(a).rnode = (xx_rnode), \
+	(a).backend = (xx_backend), \
 	(a).forknum = (xx_forknum), \
 	(a).segno = (xx_segno) \
 )
@@ -599,11 +610,12 @@ mdwriteback(SMgrRelation reln, ForkNumber forknum,
  */
 void
 mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-	   char *buffer)
+	   char *buffer, bool zeroDamagePage)
 {
 	off_t		seekpos;
 	int			nbytes;
 	MdfdVec    *v;
+	FileTag		tag;
 
 	TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
 										reln->smgr_rnode.node.spcNode,
@@ -639,12 +651,12 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		/*
 		 * Short read: we are at or past EOF, or we read a partial block at
 		 * EOF.  Normally this is an error; upper levels should never try to
-		 * read a nonexistent block.  However, if zero_damaged_pages is ON or
+		 * read a nonexistent block.  However, if zeroDamagePage is ON or
 		 * we are InRecovery, we should instead return zeroes without
 		 * complaining.  This allows, for example, the case of trying to
 		 * update a block that was later truncated away.
 		 */
-		if (zero_damaged_pages || InRecovery)
+		if (zeroDamagePage || InRecovery)
 			MemSet(buffer, 0, BLCKSZ);
 		else
 			ereport(ERROR,
@@ -653,6 +665,11 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 							blocknum, FilePathName(v->mdfd_vfd),
 							nbytes, BLCKSZ)));
 	}
+
+	/* verify the read page content satisfy page checksum */
+	INIT_MD_FILETAG_WITH_BACKEND(tag, reln->smgr_rnode.node, reln->smgr_rnode.backend, forknum, v->mdfd_segno);
+
+	VerifyPage(&tag, buffer, blocknum, zeroDamagePage);
 }
 
 /*
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 360b5bf5bf..a7048d44c4 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -52,7 +52,7 @@ typedef struct f_smgr
 	void		(*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
 								  BlockNumber blocknum);
 	void		(*smgr_read) (SMgrRelation reln, ForkNumber forknum,
-							  BlockNumber blocknum, char *buffer);
+							  BlockNumber blocknum, char *buffer, bool zeroDamagePage);
 	void		(*smgr_write) (SMgrRelation reln, ForkNumber forknum,
 							   BlockNumber blocknum, char *buffer, bool skipFsync);
 	void		(*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
@@ -506,9 +506,9 @@ smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
  */
 void
 smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-		 char *buffer)
+		 char *buffer, bool zeroDamagePage)
 {
-	smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer);
+	smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer, zeroDamagePage);
 }
 
 /*
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index 3f88683a05..0037819b1c 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -18,6 +18,7 @@
 #include "storage/block.h"
 #include "storage/item.h"
 #include "storage/off.h"
+#include "storage/sync.h"		/* For FileTag */
 
 /*
  * A postgres disk page is an abstraction layered on top of a postgres
@@ -419,7 +420,7 @@ do { \
 						((is_heap) ? PAI_IS_HEAP : 0))
 
 /*
- * Check that BLCKSZ is a multiple of sizeof(size_t).  In PageIsVerified(),
+ * Check that BLCKSZ is a multiple of sizeof(size_t).  In VerifyPage(),
  * it is much faster to check if a page is full of zeroes using the native
  * word size.  Note that this assertion is kept within a header to make
  * sure that StaticAssertDecl() works across various combinations of
@@ -429,7 +430,7 @@ StaticAssertDecl(BLCKSZ == ((BLCKSZ / sizeof(size_t)) * sizeof(size_t)),
 				 "BLCKSZ has to be a multiple of sizeof(size_t)");
 
 extern void PageInit(Page page, Size pageSize, Size specialSize);
-extern bool PageIsVerified(Page page, BlockNumber blkno);
+extern bool VerifyPage(const FileTag *ftag, Page page, BlockNumber blkno, bool zeroDamagePage);
 extern OffsetNumber PageAddItemExtended(Page page, Item item, Size size,
 										OffsetNumber offsetNumber, int flags);
 extern Page PageGetTempPage(Page page);
@@ -448,5 +449,4 @@ extern bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
 									Item newtup, Size newsize);
 extern char *PageSetChecksumCopy(Page page, BlockNumber blkno);
 extern void PageSetChecksumInplace(Page page, BlockNumber blkno);
-
 #endif							/* BUFPAGE_H */
diff --git a/src/include/storage/md.h b/src/include/storage/md.h
index ec7630ce3b..7537c4f490 100644
--- a/src/include/storage/md.h
+++ b/src/include/storage/md.h
@@ -31,7 +31,7 @@ extern void mdextend(SMgrRelation reln, ForkNumber forknum,
 extern void mdprefetch(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum);
 extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-				   char *buffer);
+				   char *buffer, bool zeroDamagePage);
 extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
 					BlockNumber blocknum, char *buffer, bool skipFsync);
 extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 243822137c..e8eda56f14 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -95,7 +95,7 @@ extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
 extern void smgrprefetch(SMgrRelation reln, ForkNumber forknum,
 						 BlockNumber blocknum);
 extern void smgrread(SMgrRelation reln, ForkNumber forknum,
-					 BlockNumber blocknum, char *buffer);
+					 BlockNumber blocknum, char *buffer, bool zeroDamagePage);
 extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
 					  BlockNumber blocknum, char *buffer, bool skipFsync);
 extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum,
diff --git a/src/include/storage/sync.h b/src/include/storage/sync.h
index e16ab8e711..2c34579eec 100644
--- a/src/include/storage/sync.h
+++ b/src/include/storage/sync.h
@@ -47,6 +47,7 @@ typedef struct FileTag
 	int16		handler;		/* SyncRequestHandler value, saving space */
 	int16		forknum;		/* ForkNumber, saving space */
 	RelFileNode rnode;
+	BackendId	backend;
 	uint32		segno;
 } FileTag;
 
-- 
2.16.6