From 8985cd63425e79bd3be641770b61d17e9b9caa00 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Tue, 8 Feb 2022 19:01:27 -0500
Subject: [PATCH 2/4] Avoid immediate fsync for unbuffered IO

Data written to WAL-logged table forks is durable once the WAL entries
are on permanent storage; however, the XLOG Redo pointer cannot be moved
past the associated WAL until the page data is safely on permanent
storage. If the Redo pointer were moved past that WAL and a crash
occurred before the data was fsync'd, the WAL would not be replayed
during recovery, and the data would be lost.

This is not a problem for pages written through shared buffers because
the checkpointer waits until FlushBuffer() has completed for all buffers
dirtied before it began. Therefore it will not move the Redo pointer
past their associated WAL entries until it has fsync'd the data.

A backend writing data outside of shared buffers must either fsync the
data to permanent storage itself or ensure that the Redo pointer has not
moved while it was writing the data.

In the common case, the backend should not have to do this fsync itself
and can instead request that the checkpointer do it.

To ensure this is safe, the backend can save the XLOG Redo pointer
location before doing the write or extend. Then it can add an fsync
request for the page to the checkpointer's pending-ops table using the
existing mechanism. After doing the write or extend, if the Redo pointer
has moved (meaning a checkpoint has started since the pointer was saved),
then the backend can simply fsync the page itself. Otherwise, the
checkpointer takes care of fsync'ing the page the next time it processes
the pending-ops table.
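
As an illustration, a caller opting into the optimization would look
roughly like the sketch below (hypothetical caller: the variable names
are placeholders and error handling is omitted; unbuffered_prep(),
unbuffered_write(), and unbuffered_finish() are the directmgr functions
from this patch series):

	UnBufferedWriteState wstate;

	/*
	 * Save the current Redo pointer; with request_fsync set, each write
	 * will also queue an fsync request with the checkpointer.
	 */
	unbuffered_prep(&wstate, true /* do_optimization */,
					true /* fsync_self */, true /* request_fsync */);

	for (blkno = 0; blkno < nblocks; blkno++)
		unbuffered_write(&wstate, use_wal, smgrrel, forknum, blkno, page);

	/*
	 * If the Redo pointer moved during the writes, fsync the pages here;
	 * otherwise the checkpointer will fsync them when it processes the
	 * pending-ops table.
	 */
	unbuffered_finish(&wstate, smgrrel, forknum);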

This commit adds the optimization option to the directmgr API but does
not add any users, so there is no behavior change.
---
 src/backend/access/gist/gistbuild.c       |  4 ++--
 src/backend/access/hash/hashpage.c        |  2 +-
 src/backend/access/heap/heapam_handler.c  |  2 +-
 src/backend/access/heap/rewriteheap.c     |  2 +-
 src/backend/access/heap/visibilitymap.c   |  2 +-
 src/backend/access/nbtree/nbtree.c        |  2 +-
 src/backend/access/nbtree/nbtsort.c       |  5 ++++-
 src/backend/access/spgist/spginsert.c     |  2 +-
 src/backend/access/transam/xlog.c         | 19 +++++++++++++++++++
 src/backend/catalog/storage.c             |  2 +-
 src/backend/storage/direct/directmgr.c    | 18 ++++++++++++++++--
 src/backend/storage/freespace/freespace.c |  2 +-
 src/include/access/xlog.h                 |  1 +
 src/include/storage/directmgr.h           | 13 +++++++++++--
 14 files changed, 61 insertions(+), 15 deletions(-)

diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 8fabc2a42d7..3d23d5b74d6 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -411,7 +411,7 @@ gist_indexsortbuild(GISTBuildState *state)
 	state->pages_allocated = 0;
 	state->pages_written = 0;
 	state->ready_num_pages = 0;
-	unbuffered_prep(&state->ub_wstate, false, false);
+	unbuffered_prep(&state->ub_wstate, false, false, false);
 
 	/*
 	 * Write an empty page as a placeholder for the root page. It will be
@@ -650,7 +650,7 @@ gist_indexsortbuild_flush_ready_pages(GISTBuildState *state)
 	if (state->ready_num_pages == 0)
 		return;
 
-	unbuffered_prep(&state->ub_wstate, false, false);
+	unbuffered_prep(&state->ub_wstate, false, false, false);
 
 	for (int i = 0; i < state->ready_num_pages; i++)
 	{
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 6096604438e..0c5533e632e 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -1003,7 +1003,7 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
 	if (lastblock < firstblock || lastblock == InvalidBlockNumber)
 		return false;
 
-	unbuffered_prep(&ub_wstate, false, true);
+	unbuffered_prep(&ub_wstate, false, false, true);
 
 	page = (Page) zerobuf.data;
 
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 9fd6a6f4474..f9f6527507c 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -578,7 +578,7 @@ heapam_relation_set_new_filenode(Relation rel,
 	SMgrRelation srel;
 	UnBufferedWriteState ub_wstate;
 
-	unbuffered_prep(&ub_wstate, true, false);
+	unbuffered_prep(&ub_wstate, false, true, false);
 	/*
 	 * Initialize to the minimum XID that could put tuples in the table. We
 	 * know that no xacts older than RecentXmin are still running, so that
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 12bdd6ff601..b103a62135e 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -267,7 +267,7 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 	state->rs_cutoff_multi = cutoff_multi;
 	state->rs_cxt = rw_cxt;
 
-	unbuffered_prep(&state->rs_unbuffered_wstate,
+	unbuffered_prep(&state->rs_unbuffered_wstate, false,
 			RelationNeedsWAL(state->rs_new_rel), false);
 
 	/* Initialize hash tables used to track update chains */
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 897de5ec1fc..d844767abc9 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -633,7 +633,7 @@ vm_extend(Relation rel, BlockNumber vm_nblocks)
 	 * by the time we get the lock.
 	 */
 	LockRelationForExtension(rel, ExclusiveLock);
-	unbuffered_prep(&ub_wstate, false, true);
+	unbuffered_prep(&ub_wstate, false, false, true);
 
 	/*
 	 * Caution: re-using this smgr pointer could fail if the relcache entry
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 1ec7493ad39..843c9e23625 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -155,7 +155,7 @@ btbuildempty(Relation index)
 	Page		metapage;
 	UnBufferedWriteState wstate;
 
-	unbuffered_prep(&wstate, true, false);
+	unbuffered_prep(&wstate, false, true, false);
 
 	/* Construct metapage. */
 	metapage = (Page) palloc(BLCKSZ);
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index c7a65a99727..a67770f3fd3 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -1186,7 +1186,10 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 	bool		deduplicate;
 
 
-	unbuffered_prep(&wstate->ub_wstate, wstate->btws_use_wal, false);
+	/*
+	 * Only fsync the data to permanent storage if the index is WAL-logged.
+	 */
+	unbuffered_prep(&wstate->ub_wstate, false, wstate->btws_use_wal, false);
 
 	deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
 		BTGetDeduplicateItems(wstate->index);
diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c
index e232ba4b866..e45f1f5db9f 100644
--- a/src/backend/access/spgist/spginsert.c
+++ b/src/backend/access/spgist/spginsert.c
@@ -159,7 +159,7 @@ spgbuildempty(Relation index)
 	Page		page;
 	UnBufferedWriteState wstate;
 
-	unbuffered_prep(&wstate, true, false);
+	unbuffered_prep(&wstate, false, true, false);
 
 	/* Construct metapage. */
 	page = (Page) palloc(BLCKSZ);
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 0d2bd7a3576..db7b33ec673 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -5971,6 +5971,25 @@ GetLastImportantRecPtr(void)
 	return res;
 }
 
+/*
+ * RedoRecPtrChanged -- report whether the Redo pointer has changed since
+ * comparator_ptr was saved, updating the backend-local RedoRecPtr copy.
+ */
+bool
+RedoRecPtrChanged(XLogRecPtr comparator_ptr)
+{
+	XLogRecPtr	ptr;
+
+	SpinLockAcquire(&XLogCtl->info_lck);
+	ptr = XLogCtl->RedoRecPtr;
+	SpinLockRelease(&XLogCtl->info_lck);
+
+	if (RedoRecPtr < ptr)
+		RedoRecPtr = ptr;
+
+	return RedoRecPtr != comparator_ptr;
+}
+
 /*
  * Get the time and LSN of the last xlog segment switch
  */
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 1ec90e00abf..307f32ab8cc 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -443,7 +443,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
 	use_wal = XLogIsNeeded() &&
 		(relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
 
-	unbuffered_prep(&wstate, (use_wal || copying_initfork), false);
+	unbuffered_prep(&wstate, false, (use_wal || copying_initfork), false);
 
 	nblocks = smgrnblocks(src, forkNum);
 
diff --git a/src/backend/storage/direct/directmgr.c b/src/backend/storage/direct/directmgr.c
index 371ff5602fe..120b7c06a74 100644
--- a/src/backend/storage/direct/directmgr.c
+++ b/src/backend/storage/direct/directmgr.c
@@ -15,15 +15,26 @@
 #include "postgres.h"
 
 
+#include "access/xlog.h"
 #include "access/xloginsert.h"
 #include "storage/directmgr.h"
 
+/* TODO: do_optimization may be derivable from fsync_self and
+ * request_fsync; confirm that this holds in all cases. */
 void
-unbuffered_prep(UnBufferedWriteState *wstate, bool fsync_self, bool
-		request_fsync)
+unbuffered_prep(UnBufferedWriteState *wstate, bool do_optimization, bool
+		fsync_self, bool request_fsync)
 {
+	/*
+	 * No reason to do the optimization unless the caller must fsync itself.
+	 */
+	Assert(!do_optimization || fsync_self);
+
+	wstate->do_optimization = do_optimization;
 	wstate->fsync_self = fsync_self;
 	wstate->request_fsync = request_fsync;
+
+	wstate->redo = do_optimization ? GetRedoRecPtr() : InvalidXLogRecPtr;
 }
 
 void
@@ -75,5 +86,8 @@ unbuffered_finish(UnBufferedWriteState *wstate, SMgrRelation smgrrel,
 	if (!wstate->fsync_self)
 		return;
 
+	if (wstate->do_optimization && !RedoRecPtrChanged(wstate->redo))
+		return;
+
 	smgrimmedsync(smgrrel, forknum);
 }
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c
index 4326ea8f015..7c79983ff97 100644
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -627,7 +627,7 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks)
 	 */
 	LockRelationForExtension(rel, ExclusiveLock);
 
-	unbuffered_prep(&ub_wstate, false, true);
+	unbuffered_prep(&ub_wstate, false, false, true);
 
 	/*
 	 * Caution: re-using this smgr pointer could fail if the relcache entry
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 4b45ac64db8..71fe99a28d9 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -241,6 +241,7 @@ extern XLogRecPtr GetInsertRecPtr(void);
 extern XLogRecPtr GetFlushRecPtr(TimeLineID *insertTLI);
 extern TimeLineID GetWALInsertionTimeLine(void);
 extern XLogRecPtr GetLastImportantRecPtr(void);
+extern bool RedoRecPtrChanged(XLogRecPtr comparator_ptr);
 
 extern void SetWalWriterSleeping(bool sleeping);
 
diff --git a/src/include/storage/directmgr.h b/src/include/storage/directmgr.h
index 47653d0d1bb..1ff0c9b0c83 100644
--- a/src/include/storage/directmgr.h
+++ b/src/include/storage/directmgr.h
@@ -14,6 +14,7 @@
 #ifndef DIRECTMGR_H
 #define DIRECTMGR_H
 
+#include "access/xlogdefs.h"
 #include "common/relpath.h"
 #include "storage/block.h"
 #include "storage/bufpage.h"
@@ -31,19 +32,27 @@ typedef struct UnBufferedWriteState
 	 * associated WAL entries. To avoid this, callers in this situation must
 	 * fsync the pages they have written themselves.
 	 *
+	 * These callers can optionally use the following optimization:
+	 * attempt to use the sync request queue and fall back to fsync'ing the
+	 * pages themselves if the redo pointer moves between the start and finish
+	 * of their write. In order to do this, they must set do_optimization to
+	 * true so that the redo pointer is saved before the write begins.
+	 *
 	 * Callers able to use the checkpointer's sync request queue when writing
 	 * data outside shared buffers (like fsm and vm) can set request_fsync to
 	 * true so that these fsync requests are added to the queue.
 	 */
+	bool do_optimization;
 	bool fsync_self;
 	bool request_fsync;
+	XLogRecPtr redo;
 } UnBufferedWriteState;
 /*
  * prototypes for functions in directmgr.c
  */
 extern void
-unbuffered_prep(UnBufferedWriteState *wstate, bool fsync_self, bool
-		request_fsync);
+unbuffered_prep(UnBufferedWriteState *wstate, bool do_optimization, bool
+		fsync_self, bool request_fsync);
 extern void
 unbuffered_write(UnBufferedWriteState *wstate, bool do_wal, SMgrRelation
 		smgrrel, ForkNumber forknum, BlockNumber blocknum, Page page);
-- 
2.17.1

