From ac914bf0e227b3cb62918954a7d7567a3f09f3b7 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Fri, 4 Mar 2022 14:14:21 -0500
Subject: [PATCH v6 2/4] Avoid immediate fsync for unbuffered IO

Data written to WAL-logged relations is durable once the WAL entries are
on permanent storage; however, the XLOG Redo pointer cannot be moved
past the associated WAL until the page data is safely on permanent
storage. If a crash were to occur before the data is fsync'd, the WAL
wouldn't be replayed during recovery, and the data would be lost.

This is not a problem with pages written in shared buffers because the
checkpointer will block until FlushBuffer() is complete for all buffers
that were dirtied before it began. Therefore it will not move the Redo
pointer past their associated WAL entries until it has fsync'd the data.

A backend writing data outside of shared buffers must ensure that the
data has reached permanent storage itself or that the Redo pointer has
not moved while it was writing the data.

In the common case, the backend should not have to do this fsync itself
and can instead request that the checkpointer do it.

To ensure this is safe, the backend can save the XLOG Redo pointer
location before doing the write or extend. Then it can add an fsync
request for the page to the checkpointer's pending-ops table using the
existing mechanism. After doing the write or extend, if the Redo pointer
has moved (meaning a checkpoint has started since it last saved it),
then the backend can simply fsync the page itself. Otherwise, the
checkpointer takes care of fsync'ing the page the next time it processes
the pending-ops table.

This commit adds the optimization option to the directmgr API but does
not add any users, so there is no behavior change.
---
 contrib/bloom/blinsert.c               |  2 +-
 src/backend/access/gist/gistbuild.c    |  2 +-
 src/backend/access/heap/rewriteheap.c  |  3 +--
 src/backend/access/nbtree/nbtree.c     |  2 +-
 src/backend/access/nbtree/nbtsort.c    |  2 +-
 src/backend/access/spgist/spginsert.c  |  2 +-
 src/backend/access/transam/xlog.c      | 13 +++++++++++++
 src/backend/catalog/storage.c          |  2 +-
 src/backend/storage/direct/directmgr.c | 14 +++++++++++++-
 src/include/access/xlog.h              |  1 +
 src/include/storage/directmgr.h        | 11 ++++++++++-
 11 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c
index 7954a17e2d..fdde2bd88a 100644
--- a/contrib/bloom/blinsert.c
+++ b/contrib/bloom/blinsert.c
@@ -173,7 +173,7 @@ blbuildempty(Relation index)
 	 * internally. However, were this to be replaced with unbuffered_extend(),
 	 * do_wal must be true to ensure the data is logged and fsync'd.
 	 */
-	unbuffered_prep(&wstate, true);
+	unbuffered_prep(&wstate, true, false);
 
 	/* Construct metapage. */
 	metapage = (Page) palloc(BLCKSZ);
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index fc09938f80..8de19199a6 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -412,7 +412,7 @@ gist_indexsortbuild(GISTBuildState *state)
 	state->pages_written = 0;
 	state->ready_num_pages = 0;
 
-	unbuffered_prep(&state->ub_wstate, RelationNeedsWAL(state->indexrel));
+	unbuffered_prep(&state->ub_wstate, RelationNeedsWAL(state->indexrel), false);
 
 	/*
 	 * Write an empty page as a placeholder for the root page. It will be
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index c8fa8bb27c..70b7a0f269 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -267,8 +267,7 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 	state->rs_cxt = rw_cxt;
 
 	unbuffered_prep(&state->rs_unbuffered_wstate,
-			RelationNeedsWAL(state->rs_new_rel));
-
+			RelationNeedsWAL(state->rs_new_rel), false);
 
 	/* Initialize hash tables used to track update chains */
 	hash_ctl.keysize = sizeof(TidHashKey);
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index c7bf971917..6b78acefbe 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -161,7 +161,7 @@ btbuildempty(Relation index)
 	 * internally. However, were this to be replaced with unbuffered_extend(),
 	 * do_wal must be true to ensure the data is logged and fsync'd.
 	 */
-	unbuffered_prep(&wstate, true);
+	unbuffered_prep(&wstate, true, false);
 
 	/* Construct metapage. */
 	metapage = (Page) palloc(BLCKSZ);
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index e280253127..d6d0d4b361 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -1189,7 +1189,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 	int64		tuples_done = 0;
 	bool		deduplicate;
 
-	unbuffered_prep(&wstate->ub_wstate, wstate->btws_use_wal);
+	unbuffered_prep(&wstate->ub_wstate, wstate->btws_use_wal, false);
 
 	deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
 		BTGetDeduplicateItems(wstate->index);
diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c
index 318dbee823..e30f3623f5 100644
--- a/src/backend/access/spgist/spginsert.c
+++ b/src/backend/access/spgist/spginsert.c
@@ -165,7 +165,7 @@ spgbuildempty(Relation index)
 	 * internally. However, were this to be replaced with unbuffered_extend(),
 	 * do_wal must be true to ensure the data is logged and fsync'd.
 	 */
-	unbuffered_prep(&wstate, true);
+	unbuffered_prep(&wstate, true, false);
 
 	/* Construct metapage. */
 	page = (Page) palloc(BLCKSZ);
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 0d2bd7a357..db7b33ec67 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -5971,6 +5971,19 @@ GetLastImportantRecPtr(void)
 	return res;
 }
 
+bool RedoRecPtrChanged(XLogRecPtr comparator_ptr)
+{
+	XLogRecPtr ptr;
+	SpinLockAcquire(&XLogCtl->info_lck);
+	ptr = XLogCtl->RedoRecPtr;
+	SpinLockRelease(&XLogCtl->info_lck);
+
+	if (RedoRecPtr < ptr)
+		RedoRecPtr = ptr;
+
+	return RedoRecPtr != comparator_ptr;
+}
+
 /*
  * Get the time and LSN of the last xlog segment switch
  */
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 0b211895c1..2fd5a31ffd 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -443,7 +443,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
 	use_wal = XLogIsNeeded() &&
 		(relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
 
-	unbuffered_prep(&wstate, use_wal);
+	unbuffered_prep(&wstate, use_wal, false);
 
 	nblocks = smgrnblocks(src, forkNum);
 
diff --git a/src/backend/storage/direct/directmgr.c b/src/backend/storage/direct/directmgr.c
index 42c37daa7a..4c3a5a2f74 100644
--- a/src/backend/storage/direct/directmgr.c
+++ b/src/backend/storage/direct/directmgr.c
@@ -15,14 +15,23 @@
 #include "postgres.h"
 
 
+#include "access/xlog.h"
 #include "access/xlogdefs.h"
 #include "access/xloginsert.h"
 #include "storage/directmgr.h"
 
 void
-unbuffered_prep(UnBufferedWriteState *wstate, bool do_wal)
+unbuffered_prep(UnBufferedWriteState *wstate, bool do_wal,
+				bool do_optimization)
 {
+	/*
+	 * There is no valid fsync optimization if no WAL is being written anyway
+	 */
+	Assert(!do_optimization || do_wal);
+
 	wstate->do_wal = do_wal;
+	wstate->do_optimization = do_optimization;
+	wstate->redo = do_optimization ? GetRedoRecPtr() : InvalidXLogRecPtr;
 }
 
 void
@@ -94,5 +103,8 @@ unbuffered_finish(UnBufferedWriteState *wstate, SMgrRelation smgrrel,
 	if (!wstate->do_wal)
 		return;
 
+	if (wstate->do_optimization && !RedoRecPtrChanged(wstate->redo))
+		return;
+
 	smgrimmedsync(smgrrel, forknum);
 }
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 4b45ac64db..71fe99a28d 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -241,6 +241,7 @@ extern XLogRecPtr GetInsertRecPtr(void);
 extern XLogRecPtr GetFlushRecPtr(TimeLineID *insertTLI);
 extern TimeLineID GetWALInsertionTimeLine(void);
 extern XLogRecPtr GetLastImportantRecPtr(void);
+extern bool RedoRecPtrChanged(XLogRecPtr comparator_ptr);
 
 extern void SetWalWriterSleeping(bool sleeping);
 
diff --git a/src/include/storage/directmgr.h b/src/include/storage/directmgr.h
index db5e3b1cac..e5454a3296 100644
--- a/src/include/storage/directmgr.h
+++ b/src/include/storage/directmgr.h
@@ -14,6 +14,7 @@
 #ifndef DIRECTMGR_H
 #define DIRECTMGR_H
 
+#include "access/xlogdefs.h"
 #include "common/relpath.h"
 #include "storage/block.h"
 #include "storage/bufpage.h"
@@ -28,14 +29,22 @@ typedef struct UnBufferedWriteState
 	 * must fsync the pages they have written themselves. This is necessary
 	 * only if the relation is WAL-logged or in special cases such as the init
 	 * fork of an unlogged index.
+	 *
+	 * These callers can optionally use the following optimization: attempt to
+	 * use the sync request queue and fall back to fsync'ing the pages
+	 * themselves if the Redo pointer moves between the start and finish of
+	 * their write. In order to do this, they must set do_optimization to true
+	 * so that the redo pointer is saved before the write begins.
 	 */
 	bool do_wal;
+	bool do_optimization;
+	XLogRecPtr redo;
 } UnBufferedWriteState;
 /*
  * prototypes for functions in directmgr.c
  */
 extern void
-unbuffered_prep(UnBufferedWriteState *wstate, bool do_wal);
+unbuffered_prep(UnBufferedWriteState *wstate, bool do_wal, bool do_optimization);
 extern void
 unbuffered_extend(UnBufferedWriteState *wstate, SMgrRelation smgrrel,
 		ForkNumber forknum, BlockNumber blocknum, Page page, bool empty);
-- 
2.30.2

