From 87baf73f56f688f4532ef78f6684934a47be3ba2 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyotaro@lab.ntt.co.jp>
Date: Thu, 25 Oct 2018 16:49:46 +0900
Subject: [PATCH 2/2] Common'ize file type checker for checksums

pg_verify_checksums.c and basebackup.c have the same notion of 'files
that have checksums'. This patch moves the core logic into src/common
so that both files share it.
---
 src/backend/replication/basebackup.c              |  43 +++--
 src/bin/pg_verify_checksums/Makefile              |   3 +-
 src/bin/pg_verify_checksums/pg_verify_checksums.c | 220 +---------------------
 src/common/Makefile                               |   3 +-
 src/common/file_checksums.c                       | 197 +++++++++++++++++++
 src/include/common/file_checksums.h               |  42 +++++
 6 files changed, 273 insertions(+), 235 deletions(-)
 create mode 100644 src/common/file_checksums.c
 create mode 100644 src/include/common/file_checksums.h

diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index b20f6c379c..4ebc969f3d 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -19,6 +19,7 @@
 #include "access/xlog_internal.h"	/* for pg_start/stop_backup */
 #include "catalog/pg_type.h"
 #include "common/file_perm.h"
+#include "common/file_checksums.h"
 #include "lib/stringinfo.h"
 #include "libpq/libpq.h"
 #include "libpq/pqformat.h"
@@ -187,18 +188,6 @@ static const char *excludeFiles[] =
 	NULL
 };
 
-/*
- * List of files excluded from checksum validation.
- */
-static const char *const noChecksumFiles[] = {
-	"pg_control",
-	"pg_filenode.map",
-	"pg_internal.init",
-	"PG_VERSION",
-	NULL,
-};
-
-
 /*
  * Called when ERROR or FATAL happens in perform_base_backup() after
  * we have started the backup - make sure we end it!
@@ -1321,22 +1310,36 @@ sendDir(const char *path, int basepathlen, bool sizeonly, List *tablespaces,
 static bool
 is_checksummed_file(const char *fullpath, const char *filename)
 {
-	const char *const *f;
+	checksum_scan_context ctx;
 
 	/* Check that the file is in a tablespace */
 	if (strncmp(fullpath, "./global/", 9) == 0 ||
 		strncmp(fullpath, "./base/", 7) == 0 ||
 		strncmp(fullpath, "/", 1) == 0)
 	{
-		/* Compare file against noChecksumFiles skiplist */
-		for (f = noChecksumFiles; *f; f++)
-			if (strcmp(*f, filename) == 0)
-				return false;
+		/* check if the file has checksums */
+		switch (checksum_find_file_type(fullpath, NULL, &ctx))
+		{
+		case HEAP_TO_SCAN:
+			return true;
+		case STAT_FAILED:
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("failed to stat \"%s\": %m",
+							fullpath)));
 
-		return true;
+		case ENTRY_TO_IGNORE:
+		case FILE_TO_SKIP:
+		case DIR_TO_SKIP:
+		case DIR_TO_SCAN:
+			break;
+		case FILE_UNKNOWN:
+			elog(DEBUG1, "checksum verification was skipped for unknown file: %s", fullpath);
+			break;
+		}
 	}
-	else
-		return false;
+
+	return false;
 }
 
 /*****
diff --git a/src/bin/pg_verify_checksums/Makefile b/src/bin/pg_verify_checksums/Makefile
index cfe4ab1b8b..3d0a9baf24 100644
--- a/src/bin/pg_verify_checksums/Makefile
+++ b/src/bin/pg_verify_checksums/Makefile
@@ -15,7 +15,8 @@ subdir = src/bin/pg_verify_checksums
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS= pg_verify_checksums.o $(WIN32RES)
+OBJS= pg_verify_checksums.o $(top_builddir)/src/common/file_checksums.o \
+	$(WIN32RES)
 
 all: pg_verify_checksums
 
diff --git a/src/bin/pg_verify_checksums/pg_verify_checksums.c b/src/bin/pg_verify_checksums/pg_verify_checksums.c
index 4b527913c1..dc2143ea65 100644
--- a/src/bin/pg_verify_checksums/pg_verify_checksums.c
+++ b/src/bin/pg_verify_checksums/pg_verify_checksums.c
@@ -15,7 +15,7 @@
 
 #include "catalog/pg_control.h"
 #include "common/controldata_utils.h"
-#include "common/relpath.h"
+#include "common/file_checksums.h"
 #include "getopt_long.h"
 #include "pg_getopt.h"
 #include "storage/bufpage.h"
@@ -36,46 +36,6 @@ static bool verbose = false;
 
 static const char *progname;
 
-/* struct for checksum verification paremter*/
-typedef struct
-{
-	union
-	{
-		struct
-		{
-			BlockNumber	segmentno;
-		} heap_param;
-	} params;
-} checksum_scan_context;
-
-/* enum for return value of find_file_type */
-typedef enum
-{
-	ENTRY_TO_IGNORE,
-	DIR_TO_SCAN,
-	HEAP_TO_SCAN,
-	FILE_TO_SKIP,
-	DIR_TO_SKIP,
-	FILE_UNKNOWN
-} checksum_file_types;
-
-/* black (explisit exclusion) list for checksum verification */
-static const char *const checksum_known_to_skip[] = {
-	"pg_control",
- 	"pg_internal.init",
-	"pg_filenode.map",
-	"PG_VERSION",
-	"config_exec_params",
-	"config_exec_params.new",
-	"pgsql_tmp",					/* this is a directory */
-	NULL
-};
-
-static checksum_file_types find_file_type(const char *fn,
-										  const char *relfilenode,
-										  checksum_scan_context *ctx);
-
-
 static void
 usage(void)
 {
@@ -93,71 +53,6 @@ usage(void)
 	printf(_("Report bugs to <pgsql-bugs@postgresql.org>.\n"));
 }
 
-/*
- * isRelFileName
- *
- * Check if the given file name is authorized for checksum verification.
- */
-static bool
-isRelFileName(const char *fn)
-{
-	int			pos;
-
-	/*----------
-	 * Only files including data checksums are authorized for verification.
-	 * This is guessed based on the file name by reverse-engineering
-	 * GetRelationPath() so make sure to update both code paths if any
-	 * updates are done.  The following file name formats are allowed:
-	 * <digits>
-	 * <digits>.<segment>
-	 * <digits>_<forkname>
-	 * <digits>_<forkname>.<segment>
-	 *
-	 * Note that temporary files, beginning with 't', are also skipped.
-	 *
-	 *----------
-	 */
-
-	/* A non-empty string of digits should follow */
-	for (pos = 0; isdigit((unsigned char) fn[pos]); ++pos)
-		;
-	/* leave if no digits */
-	if (pos == 0)
-		return false;
-	/* good to go if only digits */
-	if (fn[pos] == '\0')
-		return true;
-
-	/* Authorized fork files can be scanned */
-	if (fn[pos] == '_')
-	{
-		int			forkchar = forkname_chars(&fn[pos + 1], NULL);
-
-		if (forkchar <= 0)
-			return false;
-
-		pos += forkchar + 1;
-	}
-
-	/* Check for an optional segment number */
-	if (fn[pos] == '.')
-	{
-		int			segchar;
-
-		for (segchar = 1; isdigit((unsigned char) fn[pos + segchar]); ++segchar)
-			;
-
-		if (segchar <= 1)
-			return false;
-		pos += segchar;
-	}
-
-	/* Now this should be the end */
-	if (fn[pos] != '\0')
-		return false;
-	return true;
-}
-
 static void
 scan_heap_file(const char *fn, checksum_scan_context *ctx)
 {
@@ -234,7 +129,7 @@ scan_directory(const char *basedir, const char *subdir)
 		checksum_scan_context ctx;
 
 		snprintf(fn, sizeof(fn), "%s/%s", path, de->d_name);
-		switch (find_file_type(fn, only_relfilenode, &ctx))
+		switch (checksum_find_file_type(fn, only_relfilenode, &ctx))
 		{
 		case ENTRY_TO_IGNORE:
 			continue;		/* ignore completely silently */
@@ -262,118 +157,16 @@ scan_directory(const char *basedir, const char *subdir)
 		case DIR_TO_SCAN:
 			scan_directory(path, de->d_name);
 			break;
+		case STAT_FAILED:
+			fprintf(stderr, _("%s: could not stat file \"%s\": %s\n"),
+					progname, fn, strerror(errno));
+			exit(1);
 		}
 	}
 
 	closedir(dir);
 }
 
-/*
- * find_file_type: identify what to do on a file
- *
- * fn is a file path in full path or relative down from the current directory.
- * relfilenode is filter string of file. Only specified files of node number or
- * databaseid/filenodenum will be verified checksum.
- * ctx is the parameter needed for following checksum scan.
- */
-static checksum_file_types
-find_file_type(const char *fn, const char *relfilenode,
-			   checksum_scan_context *ctx)
-{
-	struct stat st;
-	char		fnonly[MAXPGPATH];
-	const char *fname;
-	char	   *forkpath;
-	char	   *segmentpath;
-	const char *const *p;
-	bool		is_subdir = false;
-
-	/* find file name the full path */
-	fname = strrchr(fn, '/');
-	if (fname)
-		fname++;
-	else
-		fname = fn;
-
-	if (strcmp(fname, ".") == 0 ||
-		strcmp(fname, "..") == 0)
-		return ENTRY_TO_IGNORE;
-
-	if (lstat(fn, &st) < 0)
-	{
-		fprintf(stderr, _("%s: could not stat file \"%s\": %s\n"),
-				progname, fn, strerror(errno));
-		exit(1);
-	}
-
-#ifndef WIN32
-	if (S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode))
-#else
-	if (S_ISDIR(st.st_mode) || pgwin32_is_junction(fn))
-#endif
-		is_subdir = true;
-
-	/* exluded by blacklist */
-
-	for (p = checksum_known_to_skip ; *p ; p++)
-	{
-		if (strcmp(*p, fname) != 0)
-			continue;
-
-		if (!is_subdir)
-			return FILE_TO_SKIP;
-		else
-			return DIR_TO_SKIP;
-	}
-
-	if (is_subdir)
-		return DIR_TO_SCAN;
-
-	/* we now know only of relfiles */
-	if (isRelFileName(fname))
-	{
-		/* copy the path so that we can scribble on it */
-		strlcpy(fnonly, fn, sizeof(fnonly));
-		ctx->params.heap_param.segmentno = 0;
-		segmentpath = strchr(fnonly, '.');
-
-		/* make sure that the dot is in the last segment in the path  */
-		if (segmentpath != NULL && strchr(segmentpath, '/') == NULL)
-		{
-			*segmentpath++ = '\0';
-			ctx->params.heap_param.segmentno = atoi(segmentpath);
-
-			/* something's wrong, treat it as unknown file  */
-			if (ctx->params.heap_param.segmentno == 0)
-				return FILE_UNKNOWN;
-		}
-	
-		if (only_relfilenode)
-		{
-			char *p;
-
-			/* find file suffix if any */
-			forkpath = strrchr(fnonly, '_');
-
-			/* the underscore must be in the last segment in the path */
-			if (forkpath != NULL && strchr(forkpath, '/') == NULL)
-				*forkpath++ = '\0';
-
-			/* make a tail match with only_relfilenode */
-			p = fnonly + strlen(fnonly) - strlen(relfilenode);
-			if (fnonly > p ||					 /* cannot match*/
-				(fnonly < p && *(p-1) != '/') || /* avoid false match */
-				strcmp(relfilenode, p) != 0)
-				/* Relfilenode not to be included */
-				return FILE_TO_SKIP;
-		}
-
-		return HEAP_TO_SCAN;
-	}
-
-	return FILE_UNKNOWN;
-}
-
 int
 main(int argc, char *argv[])
 {
@@ -397,6 +190,7 @@ main(int argc, char *argv[])
 		if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
 		{
 			usage();
+
 			exit(0);
 		}
 		if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
diff --git a/src/common/Makefile b/src/common/Makefile
index ec8139f014..54b7c9f440 100644
--- a/src/common/Makefile
+++ b/src/common/Makefile
@@ -44,7 +44,8 @@ override CPPFLAGS += -DVAL_LIBS="\"$(LIBS)\""
 override CPPFLAGS := -DFRONTEND $(CPPFLAGS)
 LIBS += $(PTHREAD_LIBS)
 
-OBJS_COMMON = base64.o config_info.o controldata_utils.o exec.o file_perm.o \
+OBJS_COMMON = base64.o config_info.o controldata_utils.o exec.o \
+	file_checksums.o file_perm.o \
 	ip.o keywords.o link-canary.o md5.o pg_lzcompress.o \
 	pgfnames.o psprintf.o relpath.o \
 	rmtree.o saslprep.o scram-common.o string.o unicode_norm.o \
diff --git a/src/common/file_checksums.c b/src/common/file_checksums.c
new file mode 100644
index 0000000000..f83bb52c1d
--- /dev/null
+++ b/src/common/file_checksums.c
@@ -0,0 +1,197 @@
+/*-------------------------------------------------------------------------
+ * file_checksums.c
+ *		checksumming files
+ *
+ * Routines to classify data directory entries by whether they carry
+ * data checksums, shared by basebackup.c and pg_verify_checksums.c.
+ *
+ * Portions Copyright (c) 2018, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/common/file_checksums.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include <sys/stat.h>
+
+#include "c.h"
+#include "common/file_checksums.h"
+#include "common/relpath.h"
+
+/* names explicitly excluded from checksum verification (matched exactly) */
+static const char *const checksum_known_to_skip[] = {
+	"pg_control",
+ 	"pg_internal.init",
+	"pg_filenode.map",
+	"PG_VERSION",
+	"config_exec_params",
+	"config_exec_params.new",
+	"pgsql_tmp",		/* directory */
+	NULL
+};
+
+/*
+ * isRelFileName
+ *
+ * Return true if fn (a file name without directory part) names a relation file.
+ */
+static bool
+isRelFileName(const char *fn)
+{
+	int			pos;
+
+	/*----------
+	 * Only files including data checksums are authorized for verification.
+	 * This is guessed based on the file name by reverse-engineering
+	 * GetRelationPath() so make sure to update both code paths if any
+	 * updates are done.  The following file name formats are allowed:
+	 * <digits>
+	 * <digits>.<segment>
+	 * <digits>_<forkname>
+	 * <digits>_<forkname>.<segment>
+	 *
+	 * Temporary files, beginning with 't', fail the leading-digit test below.
+	 *
+	 *----------
+	 */
+
+	/* The name must begin with a non-empty string of digits */
+	for (pos = 0; isdigit((unsigned char) fn[pos]); ++pos)
+		;
+	/* leave if no digits */
+	if (pos == 0)
+		return false;
+	/* good to go if only digits */
+	if (fn[pos] == '\0')
+		return true;
+
+	/* After '_', forkname_chars() must report a positive length to skip */
+	if (fn[pos] == '_')
+	{
+		int			forkchar = forkname_chars(&fn[pos + 1], NULL);
+
+		if (forkchar <= 0)
+			return false;
+
+		pos += forkchar + 1;
+	}
+
+	/* Check for an optional segment number: '.' followed by digits */
+	if (fn[pos] == '.')
+	{
+		int			segchar;
+
+		for (segchar = 1; isdigit((unsigned char) fn[pos + segchar]); ++segchar)
+			;
+
+		if (segchar <= 1)
+			return false;
+		pos += segchar;
+	}
+
+	/* Anything left over means the name is not in one of the formats above */
+	if (fn[pos] != '\0')
+		return false;
+	return true;
+}
+
+/*
+ * checksum_find_file_type: classify a path for checksum verification
+ *
+ * fn is the path to check; only the part after its last '/' is taken as the
+ * file name, and a path containing no '/' is ignored.  relfilenode, if not
+ * NULL, restricts HEAP_TO_SCAN to relation files of that relfilenode.  ctx
+ * receives the segment number.  STAT_FAILED leaves errno set by lstat().
+ */
+checksum_file_types
+checksum_find_file_type(const char *fn,
+						const char *relfilenode, checksum_scan_context *ctx)
+{
+	struct stat st;
+	char		fnonly[MAXPGPATH];
+	const char *fname;
+	char	   *forkpath;
+	char	   *segmentpath;
+	const char *const *p;
+	bool		is_subdir = false;
+
+	fname = strrchr(fn, '/');
+	if (fname == NULL)
+		return ENTRY_TO_IGNORE;
+	fname++;
+
+	if (strcmp(fname, ".") == 0 ||
+		strcmp(fname, "..") == 0)
+		return ENTRY_TO_IGNORE;
+
+	if (lstat(fn, &st) < 0)
+		return STAT_FAILED;
+
+#ifndef WIN32
+	if (S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode))
+#else
+	if (S_ISDIR(st.st_mode) || pgwin32_is_junction(fn))
+#endif
+		is_subdir = true;
+
+	/* entries named in the skip list are never verified */
+	for (p = checksum_known_to_skip; *p; p++)
+	{
+		if (strcmp(*p, fname) != 0)
+			continue;
+
+		if (is_subdir)
+			return DIR_TO_SKIP;
+
+		return FILE_TO_SKIP;
+	}
+
+	if (is_subdir)
+		return DIR_TO_SCAN;
+
+	/* beyond this point only relation files are recognized */
+	if (isRelFileName(fname))
+	{
+		/* copy the path so that we can scribble on it */
+		strlcpy(fnonly, fn, sizeof(fnonly));
+		ctx->params.heap_param.segmentno = 0;
+
+		/* search backwards: directory names may contain dots as well */
+		segmentpath = strrchr(fnonly, '.');
+
+		/* make sure that the dot is in the last segment in the path */
+		if (segmentpath != NULL && strchr(segmentpath, '/') == NULL)
+		{
+			*segmentpath++ = '\0';
+			ctx->params.heap_param.segmentno = atoi(segmentpath);
+
+			/* something's wrong, treat it as unknown file */
+			if (ctx->params.heap_param.segmentno == 0)
+				return FILE_UNKNOWN;
+		}
+
+		if (relfilenode)
+		{
+			size_t		pathlen;
+			size_t		filterlen = strlen(relfilenode);
+
+			/* find file suffix if any */
+			forkpath = strrchr(fnonly, '_');
+
+			/* the underscore must be in the last segment in the path */
+			if (forkpath != NULL && strchr(forkpath, '/') == NULL)
+				*forkpath = '\0';
+
+			/* tail-match relfilenode; compare lengths, not pointers */
+			pathlen = strlen(fnonly);
+			if (filterlen > pathlen ||	/* cannot match */
+				(filterlen < pathlen && fnonly[pathlen - filterlen - 1] != '/') ||
+				strcmp(relfilenode, fnonly + pathlen - filterlen) != 0)
+				return FILE_TO_SKIP;	/* not the requested relfilenode */
+		}
+
+		return HEAP_TO_SCAN;
+	}
+
+	return FILE_UNKNOWN;
+}
diff --git a/src/include/common/file_checksums.h b/src/include/common/file_checksums.h
new file mode 100644
index 0000000000..3ead25c97f
--- /dev/null
+++ b/src/include/common/file_checksums.h
@@ -0,0 +1,42 @@
+/*
+ *	file_checksums.h
+ *		checksumming files
+ *
+ *	Copyright (c) 2018, PostgreSQL Global Development Group
+ *
+ *	src/include/common/file_checksums.h
+ */
+#ifndef FILE_CHECKSUMS_H
+#define FILE_CHECKSUMS_H
+
+#include "storage/block.h"
+
+/* parameters filled in by checksum_find_file_type() for the checksum scanner */
+typedef struct
+{
+	union
+	{
+		struct
+		{
+			BlockNumber	segmentno;	/* from the ".<segment>" suffix, else 0 */
+		} heap_param;
+	} params;
+} checksum_scan_context;
+
+/* return values of checksum_find_file_type() */
+typedef enum
+{
+	ENTRY_TO_IGNORE,			/* "." or "..", or a path without any '/' */
+	DIR_TO_SCAN,				/* directory/symlink/junction, not skip-listed */
+	HEAP_TO_SCAN,				/* relation file passing the relfilenode filter */
+	FILE_TO_SKIP,				/* skip-listed, or excluded by the filter */
+	DIR_TO_SKIP,				/* skip-listed directory/symlink/junction */
+	FILE_UNKNOWN,				/* not a relation name, or segment parsed as 0 */
+	STAT_FAILED					/* lstat() failed */
+} checksum_file_types;
+
+checksum_file_types checksum_find_file_type(const char *fn,
+											const char *relfilenode,
+											checksum_scan_context *ctx);
+
+#endif							/* FILE_CHECKSUMS_H */
-- 
2.16.3

