From 72c528a913ed4cae1cd11789439bfe1208dd379a Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 8 Feb 2022 19:01:27 -0500
Subject: [PATCH v5 2/4] Avoid immediate fsync for unbuffered IO

Data written to WAL-logged table forks is durable once the WAL entries
are on permanent storage; however, the XLOG Redo pointer cannot be moved
past the associated WAL until the page data is safely on permanent
storage. If a crash were to occur before the data is fsync'd, the WAL
wouldn't be replayed during recovery, and the data would be lost.

This is not a problem with pages written in shared buffers because the
checkpointer will block until FlushBuffer() is complete for all buffers
that were dirtied before it began. Therefore it will not move the Redo
pointer past their associated WAL entries until it has fsync'd the data.

A backend writing data outside of shared buffers must ensure that the
data has reached permanent storage itself or that the Redo pointer has
not moved while it was writing the data.

In the common case, the backend should not have to do this fsync itself
and can instead request that the checkpointer do it.

To ensure this is safe, the backend can save the XLOG Redo pointer
location before doing the write or extend. Then it can add an fsync
request for the page to the checkpointer's pending-ops table using the
existing mechanism. After doing the write or extend, if the Redo pointer
has moved (meaning a checkpoint has started since the pointer was saved),
then the backend can simply fsync the page itself. Otherwise, the
checkpointer takes care of fsync'ing the page the next time it processes
the pending-ops table.

This commit adds the optimization option to the directmgr API but does
not add any users, so there is no behavior change.
---
 src/backend/access/gist/gistbuild.c       |  4 ++--
 src/backend/access/hash/hashpage.c        |  2 +-
 src/backend/access/heap/heapam_handler.c  |  2 +-
 src/backend/access/heap/rewriteheap.c     |  2 +-
 src/backend/access/heap/visibilitymap.c   |  2 +-
 src/backend/access/nbtree/nbtree.c        |  2 +-
 src/backend/access/nbtree/nbtsort.c       |  5 ++++-
 src/backend/access/spgist/spginsert.c     |  2 +-
 src/backend/access/transam/xlog.c         | 13 +++++++++++++
 src/backend/catalog/storage.c             |  2 +-
 src/backend/storage/direct/directmgr.c    | 18 ++++++++++++++++--
 src/backend/storage/freespace/freespace.c |  2 +-
 src/include/access/xlog.h                 |  1 +
 src/include/storage/directmgr.h           | 13 +++++++++++--
 14 files changed, 55 insertions(+), 15 deletions(-)

diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 42a5e61f8e..53226e45bf 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -411,7 +411,7 @@ gist_indexsortbuild(GISTBuildState *state)
 	state->pages_allocated = 0;
 	state->pages_written = 0;
 	state->ready_num_pages = 0;
-	unbuffered_prep(&state->ub_wstate, false, false);
+	unbuffered_prep(&state->ub_wstate, false, false, false);
 
 	/*
 	 * Write an empty page as a placeholder for the root page. It will be
@@ -638,7 +638,7 @@ gist_indexsortbuild_flush_ready_pages(GISTBuildState *state)
 	if (state->ready_num_pages == 0)
 		return;
 
-	unbuffered_prep(&state->ub_wstate, false, false);
+	unbuffered_prep(&state->ub_wstate, false, false, false);
 
 	for (int i = 0; i < state->ready_num_pages; i++)
 	{
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 6096604438..0c5533e632 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -1003,7 +1003,7 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
 	if (lastblock < firstblock || lastblock == InvalidBlockNumber)
 		return false;
 
-	unbuffered_prep(&ub_wstate, false, true);
+	unbuffered_prep(&ub_wstate, false, false, true);
 
 	page = (Page) zerobuf.data;
 
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 9fd6a6f447..f9f6527507 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -578,7 +578,7 @@ heapam_relation_set_new_filenode(Relation rel,
 	SMgrRelation srel;
 	UnBufferedWriteState ub_wstate;
 
-	unbuffered_prep(&ub_wstate, true, false);
+	unbuffered_prep(&ub_wstate, false, true, false);
 	/*
 	 * Initialize to the minimum XID that could put tuples in the table. We
 	 * know that no xacts older than RecentXmin are still running, so that
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 12bdd6ff60..b103a62135 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -267,7 +267,7 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 	state->rs_cutoff_multi = cutoff_multi;
 	state->rs_cxt = rw_cxt;
 
-	unbuffered_prep(&state->rs_unbuffered_wstate,
+	unbuffered_prep(&state->rs_unbuffered_wstate, false,
 			RelationNeedsWAL(state->rs_new_rel), false);
 
 	/* Initialize hash tables used to track update chains */
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 897de5ec1f..d844767abc 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -633,7 +633,7 @@ vm_extend(Relation rel, BlockNumber vm_nblocks)
 	 * by the time we get the lock.
 	 */
 	LockRelationForExtension(rel, ExclusiveLock);
-	unbuffered_prep(&ub_wstate, false, true);
+	unbuffered_prep(&ub_wstate, false, false, true);
 
 	/*
 	 * Caution: re-using this smgr pointer could fail if the relcache entry
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 1ec7493ad3..843c9e2362 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -155,7 +155,7 @@ btbuildempty(Relation index)
 	Page		metapage;
 	UnBufferedWriteState wstate;
 
-	unbuffered_prep(&wstate, true, false);
+	unbuffered_prep(&wstate, false, true, false);
 
 	/* Construct metapage. */
 	metapage = (Page) palloc(BLCKSZ);
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index c7a65a9972..a67770f3fd 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -1186,7 +1186,10 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 	bool		deduplicate;
 
 
-	unbuffered_prep(&wstate->ub_wstate, wstate->btws_use_wal, false);
+	/*
+	 * Only bother fsync'ing the data to permanent storage if WAL logging is enabled
+	 */
+	unbuffered_prep(&wstate->ub_wstate, false, wstate->btws_use_wal, false);
 
 	deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
 		BTGetDeduplicateItems(wstate->index);
diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c
index e232ba4b86..e45f1f5db9 100644
--- a/src/backend/access/spgist/spginsert.c
+++ b/src/backend/access/spgist/spginsert.c
@@ -159,7 +159,7 @@ spgbuildempty(Relation index)
 	Page		page;
 	UnBufferedWriteState wstate;
 
-	unbuffered_prep(&wstate, true, false);
+	unbuffered_prep(&wstate, false, true, false);
 
 	/* Construct metapage. */
 	page = (Page) palloc(BLCKSZ);
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 958220c495..c1434d8f85 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -8774,6 +8774,19 @@ GetLastImportantRecPtr(void)
 	return res;
 }
 
+/* Has a checkpoint started since comparator_ptr's Redo pointer was read? */
+bool
+RedoRecPtrChanged(XLogRecPtr comparator_ptr)
+{
+	XLogRecPtr	ptr;
+	SpinLockAcquire(&XLogCtl->info_lck);
+	ptr = XLogCtl->RedoRecPtr;
+	SpinLockRelease(&XLogCtl->info_lck);
+	if (RedoRecPtr < ptr)
+		RedoRecPtr = ptr;
+	return RedoRecPtr != comparator_ptr;
+}
+
 /*
  * Get the time and LSN of the last xlog segment switch
  */
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 1ec90e00ab..307f32ab8c 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -443,7 +443,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
 	use_wal = XLogIsNeeded() &&
 		(relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
 
-	unbuffered_prep(&wstate, (use_wal || copying_initfork), false);
+	unbuffered_prep(&wstate, false, (use_wal || copying_initfork), false);
 
 	nblocks = smgrnblocks(src, forkNum);
 
diff --git a/src/backend/storage/direct/directmgr.c b/src/backend/storage/direct/directmgr.c
index 371ff5602f..120b7c06a7 100644
--- a/src/backend/storage/direct/directmgr.c
+++ b/src/backend/storage/direct/directmgr.c
@@ -15,15 +15,26 @@
 #include "postgres.h"
 
 
+#include "access/xlog.h"
 #include "access/xloginsert.h"
 #include "storage/directmgr.h"
 
+/* TODO: do_optimization can be derived from request_fsync and fsync_self, I
+ * think. But is that true in all cases, and is it also confusing? */
 void
-unbuffered_prep(UnBufferedWriteState *wstate, bool fsync_self, bool
-		request_fsync)
+unbuffered_prep(UnBufferedWriteState *wstate, bool do_optimization, bool
+		fsync_self, bool request_fsync)
 {
+	/*
+	 * No reason to do the optimization when not required to fsync self.
+	 */
+	Assert(!do_optimization || fsync_self);
+
+	wstate->do_optimization = do_optimization;
 	wstate->fsync_self = fsync_self;
 	wstate->request_fsync = request_fsync;
+
+	wstate->redo = do_optimization ? GetRedoRecPtr() : InvalidXLogRecPtr;
 }
 
 void
@@ -75,5 +86,8 @@ unbuffered_finish(UnBufferedWriteState *wstate, SMgrRelation smgrrel,
 	if (!wstate->fsync_self)
 		return;
 
+	if (wstate->do_optimization && !RedoRecPtrChanged(wstate->redo))
+		return;
+
 	smgrimmedsync(smgrrel, forknum);
 }
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c
index 4326ea8f01..7c79983ff9 100644
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -627,7 +627,7 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks)
 	 */
 	LockRelationForExtension(rel, ExclusiveLock);
 
-	unbuffered_prep(&ub_wstate, false, true);
+	unbuffered_prep(&ub_wstate, false, false, true);
 
 	/*
 	 * Caution: re-using this smgr pointer could fail if the relcache entry
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index a4b1c1286f..c2ae0ce304 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -310,6 +310,7 @@ extern XLogRecPtr GetInsertRecPtr(void);
 extern XLogRecPtr GetFlushRecPtr(TimeLineID *insertTLI);
 extern TimeLineID GetWALInsertionTimeLine(void);
 extern XLogRecPtr GetLastImportantRecPtr(void);
+extern bool RedoRecPtrChanged(XLogRecPtr comparator_ptr);
 extern void RemovePromoteSignalFiles(void);
 
 extern bool PromoteIsTriggered(void);
diff --git a/src/include/storage/directmgr.h b/src/include/storage/directmgr.h
index 47653d0d1b..1ff0c9b0c8 100644
--- a/src/include/storage/directmgr.h
+++ b/src/include/storage/directmgr.h
@@ -14,6 +14,7 @@
 #ifndef DIRECTMGR_H
 #define DIRECTMGR_H
 
+#include "access/xlogdefs.h"
 #include "common/relpath.h"
 #include "storage/block.h"
 #include "storage/bufpage.h"
@@ -31,19 +32,27 @@ typedef struct UnBufferedWriteState
 	 * associated WAL entries. To avoid this, callers in this situation must
 	 * fsync the pages they have written themselves.
 	 *
+	 * These callers can optionally use the following optimization:
+	 * attempt to use the sync request queue and fall back to fsync'ing the
+	 * pages themselves if the redo pointer moves between the start and finish
+	 * of their write. In order to do this, they must set do_optimization to
+	 * true so that the redo pointer is saved before the write begins.
+	 *
 	 * Callers able to use the checkpointer's sync request queue when writing
 	 * data outside shared buffers (like fsm and vm) can set request_fsync to
 	 * true so that these fsync requests are added to the queue.
 	 */
+	bool do_optimization;
 	bool fsync_self;
 	bool request_fsync;
+	XLogRecPtr redo;
 } UnBufferedWriteState;
 /*
  * prototypes for functions in directmgr.c
  */
 extern void
-unbuffered_prep(UnBufferedWriteState *wstate, bool fsync_self, bool
-		request_fsync);
+unbuffered_prep(UnBufferedWriteState *wstate, bool do_optimization, bool
+		fsync_self, bool request_fsync);
 extern void
 unbuffered_write(UnBufferedWriteState *wstate, bool do_wal, SMgrRelation
 		smgrrel, ForkNumber forknum, BlockNumber blocknum, Page page);
-- 
2.30.2

