From 309649dfa2eea9d696e43271057da7e643d771b4 Mon Sep 17 00:00:00 2001
From: Jeremie Grauer <jgrauer@cosium.com>
Date: Mon, 7 Nov 2022 23:50:43 +0100
Subject: [PATCH] adds the option --no-ensure-full-page-writes to pg_rewind

---
 doc/src/sgml/ref/pg_rewind.sgml  | 27 +++++++++++++++++++-
 src/bin/pg_rewind/libpq_source.c | 22 ++++++++++------
 src/bin/pg_rewind/pg_rewind.c    | 43 +++++++++++++++++++-------------
 src/bin/pg_rewind/pg_rewind.h    |  1 +
 4 files changed, 67 insertions(+), 26 deletions(-)

diff --git a/doc/src/sgml/ref/pg_rewind.sgml b/doc/src/sgml/ref/pg_rewind.sgml
index 69d6924b3a..64ee600c2b 100644
--- a/doc/src/sgml/ref/pg_rewind.sgml
+++ b/doc/src/sgml/ref/pg_rewind.sgml
@@ -99,7 +99,13 @@ PostgreSQL documentation
    in <filename>postgresql.conf</filename> or data checksums enabled when
    the cluster was initialized with <application>initdb</application>.  Neither of these
    are currently on by default.  <xref linkend="guc-full-page-writes"/>
-   must also be set to <literal>on</literal>, but is enabled by default.
+   should also be set to <literal>on</literal> (which is the default) when the
+   underlying file system can write data in an increment of less than the page size.
+   Some file systems (like ZFS when configured with a recordsize as a multiple of
+   the page size) don't need this parameter since they will never write less than a
+   complete page to disk. If you are running such a system, you can add the option
+   --no-enforce-full-page-writes so that <application>pg_rewind</application> will run
+   even when <xref linkend="guc-full-page-writes"/> is set to <literal>off</literal>.
   </para>
 
   <warning>
@@ -283,6 +289,25 @@ PostgreSQL documentation
       </listitem>
      </varlistentry>
 
+     <varlistentry>
+      <term><option>--no-ensure-full-page-writes</option></term>
+      <listitem>
+       <para>
+        <application>pg_rewind</application> by default requires that <xref linkend="guc-full-page-writes"/>
+        is set to on. This ensures that <application>pg_rewind</application> do
+        not run into torn pages while running. This option is necessary most of
+        the time since very few filesystems enforce the flush of full page to disk.
+        One such system is ZFS where the page is always flushed to disk in its
+        entirety. Other storage system may also have the same warranty. This option
+        is here to allow pg_rewind to run on such systems without enforcing that
+        <xref linkend="guc-full-page-writes"/> is on. It's important to know what
+        you are doing before setting this setting because in case your system does
+        not really guarantee that the page is persisted to disk in its entirety, you
+        may run into data corruption.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry>
       <term><option>-V</option></term>
       <term><option>--version</option></term>
diff --git a/src/bin/pg_rewind/libpq_source.c b/src/bin/pg_rewind/libpq_source.c
index 011c9cce6e..ad7f745c06 100644
--- a/src/bin/pg_rewind/libpq_source.c
+++ b/src/bin/pg_rewind/libpq_source.c
@@ -132,15 +132,23 @@ init_libpq_conn(PGconn *conn)
 	PQclear(res);
 
 	/*
-	 * Also check that full_page_writes is enabled.  We can get torn pages if
-	 * a page is modified while we read it with pg_read_binary_file(), and we
-	 * rely on full page images to fix them.
+	 * If ensure_full_page_writes is true (the default), check that
+	 * full_page_writes is enabled. The goal here is to not run into a torn
+	 * page. Torn page can happend if a page is modified while we read it
+	 * with pg_read_binary_file(). Some file systems are immune to torn page,
+	 * this is why there is an option to disable this check. Outside of those
+	 * specific systems, we rely on full_page_writes to avoid it.
 	 */
-	str = run_simple_query(conn, "SHOW full_page_writes");
-	if (strcmp(str, "on") != 0)
-		pg_fatal("full_page_writes must be enabled in the source server");
+	if (ensure_full_page_writes)
+	{
+		str = run_simple_query(conn, "SHOW full_page_writes");
+		if (strcmp(str, "on") != 0)
+			pg_fatal("full_page_writes is currently disabled on the "
+				"source server, use the option --no-ensure-full"
+				"-page-write to bypass this check if you kown "
+				"what you are doing");
+	}
 	pg_free(str);
-
 	/* Prepare a statement we'll use to fetch files */
 	res = PQprepare(conn, "fetch_chunks_stmt",
 					"SELECT path, begin,\n"
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 3cd77c09b1..4307d984fa 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -68,6 +68,7 @@ bool		showprogress = false;
 bool		dry_run = false;
 bool		do_sync = true;
 bool		restore_wal = false;
+bool		ensure_full_page_writes = true;
 
 /* Target history */
 TimeLineHistoryEntry *targetHistory;
@@ -86,23 +87,24 @@ usage(const char *progname)
 	printf(_("%s resynchronizes a PostgreSQL cluster with another copy of the cluster.\n\n"), progname);
 	printf(_("Usage:\n  %s [OPTION]...\n\n"), progname);
 	printf(_("Options:\n"));
-	printf(_("  -c, --restore-target-wal       use restore_command in target configuration to\n"
-			 "                                 retrieve WAL files from archives\n"));
-	printf(_("  -D, --target-pgdata=DIRECTORY  existing data directory to modify\n"));
-	printf(_("      --source-pgdata=DIRECTORY  source data directory to synchronize with\n"));
-	printf(_("      --source-server=CONNSTR    source server to synchronize with\n"));
-	printf(_("  -n, --dry-run                  stop before modifying anything\n"));
-	printf(_("  -N, --no-sync                  do not wait for changes to be written\n"
-			 "                                 safely to disk\n"));
-	printf(_("  -P, --progress                 write progress messages\n"));
-	printf(_("  -R, --write-recovery-conf      write configuration for replication\n"
-			 "                                 (requires --source-server)\n"));
-	printf(_("      --config-file=FILENAME     use specified main server configuration\n"
-			 "                                 file when running target cluster\n"));
-	printf(_("      --debug                    write a lot of debug messages\n"));
-	printf(_("      --no-ensure-shutdown       do not automatically fix unclean shutdown\n"));
-	printf(_("  -V, --version                  output version information, then exit\n"));
-	printf(_("  -?, --help                     show this help, then exit\n"));
+	printf(_("  -c, --restore-target-wal               use restore_command in target configuration to\n"
+			 "                                         retrieve WAL files from archives\n"));
+	printf(_("  -D, --target-pgdata=DIRECTORY          existing data directory to modify\n"));
+	printf(_("      --source-pgdata=DIRECTORY          source data directory to synchronize with\n"));
+	printf(_("      --source-server=CONNSTR            source server to synchronize with\n"));
+	printf(_("  -n, --dry-run                          stop before modifying anything\n"));
+	printf(_("  -N, --no-sync                          do not wait for changes to be written\n"
+			 "                                         safely to disk\n"));
+	printf(_("  -P, --progress                         write progress messages\n"));
+	printf(_("  -R, --write-recovery-conf              write configuration for replication\n"
+			 "                                         (requires --source-server)\n"));
+	printf(_("      --config-file=FILENAME             use specified main server configuration\n"
+			 "                                         file when running target cluster\n"));
+	printf(_("      --debug                            write a lot of debug messages\n"));
+	printf(_("      --no-ensure-shutdown               do not automatically fix unclean shutdown\n"));
+	printf(_("      --no-ensure-full-page-writes       do not make sure that full_page_writes is on\n"));
+	printf(_("  -V, --version                          output version information, then exit\n"));
+	printf(_("  -?, --help                             show this help, then exit\n"));
 	printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
 	printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL);
 }
@@ -118,7 +120,8 @@ main(int argc, char **argv)
 		{"source-pgdata", required_argument, NULL, 1},
 		{"source-server", required_argument, NULL, 2},
 		{"no-ensure-shutdown", no_argument, NULL, 4},
-		{"config-file", required_argument, NULL, 5},
+		{"no-ensure-full-page-writes", no_argument, NULL, 5},
+		{"config-file", required_argument, NULL, 6},
 		{"version", no_argument, NULL, 'V'},
 		{"restore-target-wal", no_argument, NULL, 'c'},
 		{"dry-run", no_argument, NULL, 'n'},
@@ -207,6 +210,10 @@ main(int argc, char **argv)
 				break;
 
 			case 5:
+				ensure_full_page_writes = false;
+				break;
+
+			case 6:
 				config_file = pg_strdup(optarg);
 				break;
 
diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h
index 80c276285a..f63426256a 100644
--- a/src/bin/pg_rewind/pg_rewind.h
+++ b/src/bin/pg_rewind/pg_rewind.h
@@ -23,6 +23,7 @@ extern char *datadir_target;
 extern bool showprogress;
 extern bool dry_run;
 extern bool do_sync;
+extern bool ensure_full_page_writes;
 extern int	WalSegSz;
 
 /* Target history */
-- 
2.38.1

