From fc850bada4a93523a69eb9b824432209d6eee344 Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Fri, 30 Oct 2020 10:19:14 +0900
Subject: [PATCH v8 2/2] Fix page verification in base backups

---
 src/backend/replication/basebackup.c         | 177 ++++++++++---------
 src/bin/pg_basebackup/t/010_pg_basebackup.pl |  20 ++-
 2 files changed, 111 insertions(+), 86 deletions(-)

diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index 50ae1f16d0..9a9acbe818 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -1566,21 +1566,24 @@ sendFile(const char *readfilename, const char *tarfilename,
 {
 	FILE	   *fp;
 	BlockNumber blkno = 0;
-	bool		block_retry = false;
+	int			block_attempts = 0;
 	char		buf[TAR_SEND_SIZE];
-	uint16		checksum;
 	int			checksum_failures = 0;
 	off_t		cnt;
 	int			i;
 	pgoff_t		len = 0;
 	char	   *page;
 	size_t		pad;
-	PageHeader	phdr;
 	int			segmentno = 0;
 	char	   *segmentpath;
 	bool		verify_checksum = false;
 	pg_checksum_context checksum_ctx;
 
+	/* Maximum number of checksum failures reported for this file */
+#define CHECKSUM_REPORT_THRESHOLD		5
+	/* maximum number of retries done for a single page */
+#define PAGE_RETRY_THRESHOLD			5
+
 	pg_checksum_init(&checksum_ctx, manifest->checksum_type);
 
 	fp = AllocateFile(readfilename, "rb");
@@ -1639,9 +1642,7 @@ sendFile(const char *readfilename, const char *tarfilename,
 		if (verify_checksum && (cnt % BLCKSZ != 0))
 		{
 			ereport(WARNING,
-					(errmsg("could not verify checksum in file \"%s\", block "
-							"%d: read buffer size %d and page size %d "
-							"differ",
+					(errmsg("could not verify checksum in file \"%s\", block %u: read buffer size %d and page size %d differ",
 							readfilename, blkno, (int) cnt, BLCKSZ)));
 			verify_checksum = false;
 		}
@@ -1652,96 +1653,102 @@ sendFile(const char *readfilename, const char *tarfilename,
 			{
 				page = buf + BLCKSZ * i;
 
+				elog(DEBUG1, "checking block %u of file %s, attempt %d",
+					 blkno, readfilename, block_attempts);
+
 				/*
-				 * Only check pages which have not been modified since the
-				 * start of the base backup. Otherwise, they might have been
-				 * written only halfway and the checksum would not be valid.
-				 * However, replaying WAL would reinstate the correct page in
-				 * this case. We also skip completely new pages, since they
-				 * don't have a checksum yet.
+				 * Check on-disk pages with the same set of verification
+				 * conditions used before loading pages into shared buffers.
+				 * Note that PageIsVerifiedExtended() considers pages filled
+				 * only with zeros as valid, since they don't have a checksum
+				 * yet.  Failures are not reported to pgstat yet, as these are
+				 * reported in a single batch once a file is completed.  It
+				 * could be possible, that a page is written halfway while
+				 * doing this check, causing a false positive.  If that
+				 * happens, a page is retried multiple times, with an error
+				 * reported if the second attempt also fails.
 				 */
-				if (!PageIsNew(page) && PageGetLSN(page) < startptr)
+				if (PageIsVerifiedExtended(page, blkno, 0))
 				{
-					checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE);
-					phdr = (PageHeader) page;
-					if (phdr->pd_checksum != checksum)
+					/* success, move on to the next block */
+					blkno++;
+					elog(DEBUG1, "check success for block %u of file %s, attempt %d",
+						 blkno, readfilename, block_attempts);
+					block_attempts = 0;
+					continue;
+				}
+
+				/* The verification of a page has failed, retry again */
+				if (block_attempts < PAGE_RETRY_THRESHOLD)
+				{
+					/* Reread the failed block */
+					if (fseek(fp, -(cnt - BLCKSZ * i), SEEK_CUR) == -1)
+					{
+						ereport(ERROR,
+								(errcode_for_file_access(),
+								 errmsg("could not fseek in file \"%s\": %m",
+										readfilename)));
+					}
+
+					if (fread(buf + BLCKSZ * i, 1, BLCKSZ, fp) != BLCKSZ)
 					{
 						/*
-						 * Retry the block on the first failure.  It's
-						 * possible that we read the first 4K page of the
-						 * block just before postgres updated the entire block
-						 * so it ends up looking torn to us.  We only need to
-						 * retry once because the LSN should be updated to
-						 * something we can ignore on the next pass.  If the
-						 * error happens again then it is a true validation
-						 * failure.
+						 * If we hit end-of-file, a concurrent truncation must
+						 * have occurred, so break out of this loop just as if
+						 * the initial fread() returned 0. We'll drop through
+						 * to the same code that handles that case. (We must
+						 * fix up cnt first, though.)
 						 */
-						if (block_retry == false)
+						if (feof(fp))
 						{
-							/* Reread the failed block */
-							if (fseek(fp, -(cnt - BLCKSZ * i), SEEK_CUR) == -1)
-							{
-								ereport(ERROR,
-										(errcode_for_file_access(),
-										 errmsg("could not fseek in file \"%s\": %m",
-												readfilename)));
-							}
-
-							if (fread(buf + BLCKSZ * i, 1, BLCKSZ, fp) != BLCKSZ)
-							{
-								/*
-								 * If we hit end-of-file, a concurrent
-								 * truncation must have occurred, so break out
-								 * of this loop just as if the initial fread()
-								 * returned 0. We'll drop through to the same
-								 * code that handles that case. (We must fix
-								 * up cnt first, though.)
-								 */
-								if (feof(fp))
-								{
-									cnt = BLCKSZ * i;
-									break;
-								}
-
-								ereport(ERROR,
-										(errcode_for_file_access(),
-										 errmsg("could not reread block %d of file \"%s\": %m",
-												blkno, readfilename)));
-							}
-
-							if (fseek(fp, cnt - BLCKSZ * i - BLCKSZ, SEEK_CUR) == -1)
-							{
-								ereport(ERROR,
-										(errcode_for_file_access(),
-										 errmsg("could not fseek in file \"%s\": %m",
-												readfilename)));
-							}
-
-							/* Set flag so we know a retry was attempted */
-							block_retry = true;
-
-							/* Reset loop to validate the block again */
-							i--;
-							continue;
+							cnt = BLCKSZ * i;
+							break;
 						}
 
-						checksum_failures++;
-
-						if (checksum_failures <= 5)
-							ereport(WARNING,
-									(errmsg("checksum verification failed in "
-											"file \"%s\", block %d: calculated "
-											"%X but expected %X",
-											readfilename, blkno, checksum,
-											phdr->pd_checksum)));
-						if (checksum_failures == 5)
-							ereport(WARNING,
-									(errmsg("further checksum verification "
-											"failures in file \"%s\" will not "
-											"be reported", readfilename)));
+						ereport(ERROR,
+								(errcode_for_file_access(),
+								 errmsg("could not reread block %d of file \"%s\": %m",
+										blkno, readfilename)));
 					}
+
+
+					if (fseek(fp, cnt - BLCKSZ * i - BLCKSZ, SEEK_CUR) == -1)
+					{
+						ereport(ERROR,
+								(errcode_for_file_access(),
+								 errmsg("could not fseek in file \"%s\": %m",
+										readfilename)));
+					}
+
+					/* Set the counter so we know a retry was attempted */
+					block_attempts++;
+
+					elog(DEBUG1, "check failure for block %u of file %s, attempt %d",
+						 blkno, readfilename, block_attempts);
+
+					/*
+					 * Wait for 100 ms to give some room to any parallel page
+					 * write that may have caused this retry to finish.
+					 */
+					pg_usleep(100000L);
+
+					/* Reset loop to validate the block again */
+					i--;
+					continue;
 				}
-				block_retry = false;
+
+				checksum_failures++;
+				if (checksum_failures <= CHECKSUM_REPORT_THRESHOLD)
+					ereport(WARNING,
+							(errmsg("checksum verification failed in block %u of file \"%s\"",
+									blkno, readfilename)));
+				if (checksum_failures == CHECKSUM_REPORT_THRESHOLD)
+					ereport(WARNING,
+							(errmsg("further checksum verification failures in file \"%s\" will not be reported",
+									readfilename)));
+
+				/* move to next block */
+				block_attempts = 0;
 				blkno++;
 			}
 		}
diff --git a/src/bin/pg_basebackup/t/010_pg_basebackup.pl b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
index 208df557b8..01cc2dceb2 100644
--- a/src/bin/pg_basebackup/t/010_pg_basebackup.pl
+++ b/src/bin/pg_basebackup/t/010_pg_basebackup.pl
@@ -6,7 +6,7 @@ use File::Basename qw(basename dirname);
 use File::Path qw(rmtree);
 use PostgresNode;
 use TestLib;
-use Test::More tests => 109;
+use Test::More tests => 112;
 
 program_help_ok('pg_basebackup');
 program_version_ok('pg_basebackup');
@@ -514,6 +514,24 @@ $node->command_checks_all(
 	'pg_basebackup reports checksum mismatch');
 rmtree("$tempdir/backup_corrupt");
 
+# Create a new type of corruption and zero out one page header
+# completely.
+open $file, '+<', "$pgdata/$file_corrupt1";
+seek($file, $pageheader_size, 0);
+system_or_bail 'pg_ctl', '-D', $pgdata, 'stop';
+my $zero_data = '\0' x $pageheader_size;
+syswrite($file, $zero_data);
+close $file;
+system_or_bail 'pg_ctl', '-D', $pgdata, 'start';
+
+$node->command_checks_all(
+	[ 'pg_basebackup', '-D', "$tempdir/backup_corrupt1a" ],
+	1,
+	[qr{^$}],
+	[qr/^WARNING.*checksum verification failed/s],
+	"pg_basebackup reports checksum mismatch for zeroed page headers");
+rmtree("$tempdir/backup_corrupt1a");
+
 # induce further corruption in 5 more blocks
 system_or_bail 'pg_ctl', '-D', $pgdata, 'stop';
 open $file, '+<', "$pgdata/$file_corrupt1";
-- 
2.29.1

