From 50060e5a0ed66762680ddee9e30acbad905c6e98 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 1 Mar 2022 17:34:43 +1300
Subject: [PATCH v2 3/3] Use condition variable to wait when sync request queue
 is full.

Previously, in the (hopefully) rare case that we need to wait for the
checkpointer to create space in the sync request queue, we'd enter a
sleep/retry loop.  Instead, create a condition variable so the
checkpointer can wake us up whenever there is a transition from 'full'
to 'not full'.

Discussion:  https://postgr.es/m/20220226213942.nb7uvb2pamyu26dj%40alap3.anarazel.de
---
 doc/src/sgml/monitoring.sgml            | 10 +++---
 src/backend/postmaster/checkpointer.c   | 38 +++++++++++++++++++---
 src/backend/storage/sync/sync.c         | 42 ++++++++++---------------
 src/backend/utils/activity/wait_event.c |  6 ++--
 src/include/postmaster/bgwriter.h       |  4 ++-
 src/include/utils/wait_event.h          |  2 +-
 6 files changed, 62 insertions(+), 40 deletions(-)

diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 71559442f0..5e11548075 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -1626,6 +1626,11 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
       <entry>Waiting for activity from a child process while
        executing a <literal>Gather</literal> plan node.</entry>
      </row>
+     <row>
+      <entry><literal>ForwardSyncRequest</literal></entry>
+      <entry>Waiting while sending synchronization requests to the
+       checkpointer, because the request queue is full.</entry>
+     </row>
      <row>
       <entry><literal>HashBatchAllocate</literal></entry>
       <entry>Waiting for an elected Parallel Hash participant to allocate a hash
@@ -2254,11 +2259,6 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
       <entry>Waiting during recovery when WAL data is not available from any
        source (<filename>pg_wal</filename>, archive or stream).</entry>
      </row>
-     <row>
-      <entry><literal>RegisterSyncRequest</literal></entry>
-      <entry>Waiting while sending synchronization requests to the
-       checkpointer, because the request queue is full.</entry>
-     </row>
      <row>
       <entry><literal>VacuumDelay</literal></entry>
       <entry>Waiting in a cost-based vacuum delay point.</entry>
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index a59c3cf020..16cf472f90 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -128,6 +128,7 @@ typedef struct
 	uint32		num_backend_writes; /* counts user backend buffer writes */
 	uint32		num_backend_fsync;	/* counts user backend fsync calls */
 
+	ConditionVariable requests_not_full_cv;	/* signaled when space available */
 	int			num_requests;	/* current # of requests */
 	int			max_requests;	/* allocated array size */
 	CheckpointerRequest requests[FLEXIBLE_ARRAY_MEMBER];
@@ -903,6 +904,7 @@ CheckpointerShmemInit(void)
 		CheckpointerShmem->max_requests = NBuffers;
 		ConditionVariableInit(&CheckpointerShmem->start_cv);
 		ConditionVariableInit(&CheckpointerShmem->done_cv);
+		ConditionVariableInit(&CheckpointerShmem->requests_not_full_cv);
 	}
 }
 
@@ -1076,10 +1078,11 @@ RequestCheckpoint(int flags)
  * to perform its own fsync, which is far more expensive in practice.  It
  * is theoretically possible a backend fsync might still be necessary, if
  * the queue is full and contains no duplicate entries.  In that case, we
- * let the backend know by returning false.
+ * let the backend know by returning false, or if 'wait' is true, then we
+ * wait for space to become available.
  */
 bool
-ForwardSyncRequest(const FileTag *ftag, SyncRequestType type)
+ForwardSyncRequest(const FileTag *ftag, SyncRequestType type, bool wait)
 {
 	CheckpointerRequest *request;
 	bool		too_full;
@@ -1101,9 +1104,9 @@ ForwardSyncRequest(const FileTag *ftag, SyncRequestType type)
 	 * backend will have to perform its own fsync request.  But before forcing
 	 * that to happen, we can try to compact the request queue.
 	 */
-	if (CheckpointerShmem->checkpointer_pid == 0 ||
-		(CheckpointerShmem->num_requests >= CheckpointerShmem->max_requests &&
-		 !CompactCheckpointerRequestQueue()))
+	if (CheckpointerShmem->num_requests >= CheckpointerShmem->max_requests &&
+		!CompactCheckpointerRequestQueue() &&
+		!wait)
 	{
 		/*
 		 * Count the subset of writes where backends have to do their own
@@ -1115,6 +1118,24 @@ ForwardSyncRequest(const FileTag *ftag, SyncRequestType type)
 		return false;
 	}
 
+	/*
+	 * If we still don't have enough space and the caller asked us to wait,
+	 * wait for the checkpointer to advertise that there is space.
+	 */
+	if (CheckpointerShmem->num_requests >= CheckpointerShmem->max_requests)
+	{
+		ConditionVariablePrepareToSleep(&CheckpointerShmem->requests_not_full_cv);
+		while (CheckpointerShmem->num_requests >=
+			   CheckpointerShmem->max_requests)
+		{
+			LWLockRelease(CheckpointerCommLock);
+			ConditionVariableSleep(&CheckpointerShmem->requests_not_full_cv,
+								   WAIT_EVENT_FORWARD_SYNC_REQUEST);
+			LWLockAcquire(CheckpointerCommLock, LW_EXCLUSIVE);
+		}
+		ConditionVariableCancelSleep();
+	}
+
 	/* OK, insert request */
 	request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++];
 	request->ftag = *ftag;
@@ -1261,6 +1282,7 @@ AbsorbSyncRequests(void)
 	CheckpointerRequest *requests = NULL;
 	CheckpointerRequest *request;
 	int			n;
+	bool		was_full;
 
 	if (!AmCheckpointerProcess())
 		return;
@@ -1293,6 +1315,8 @@ AbsorbSyncRequests(void)
 		memcpy(requests, CheckpointerShmem->requests, n * sizeof(CheckpointerRequest));
 	}
 
+	was_full = n >= CheckpointerShmem->max_requests;
+
 	START_CRIT_SECTION();
 
 	CheckpointerShmem->num_requests = 0;
@@ -1304,6 +1328,10 @@ AbsorbSyncRequests(void)
 
 	END_CRIT_SECTION();
 
+	/* Wake anyone waiting for space in ForwardSyncRequest(). */
+	if (was_full)
+		ConditionVariableBroadcast(&CheckpointerShmem->requests_not_full_cv);
+
 	if (requests)
 		pfree(requests);
 }
diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c
index 0c4d9ce687..d0fca02ee6 100644
--- a/src/backend/storage/sync/sync.c
+++ b/src/backend/storage/sync/sync.c
@@ -585,31 +585,23 @@ RegisterSyncRequest(const FileTag *ftag, SyncRequestType type,
 		return true;
 	}
 
-	for (;;)
-	{
-		/*
-		 * Notify the checkpointer about it.  If we fail to queue a message in
-		 * retryOnError mode, we have to sleep and try again ... ugly, but
-		 * hopefully won't happen often.
-		 *
-		 * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
-		 * error in the case of SYNC_UNLINK_REQUEST would leave the
-		 * no-longer-used file still present on disk, which would be bad, so
-		 * I'm inclined to assume that the checkpointer will always empty the
-		 * queue soon.
-		 */
-		ret = ForwardSyncRequest(ftag, type);
-
-		/*
-		 * If we are successful in queueing the request, or we failed and were
-		 * instructed not to retry on error, break.
-		 */
-		if (ret || (!ret && !retryOnError))
-			break;
-
-		WaitLatch(NULL, WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, 10,
-				  WAIT_EVENT_REGISTER_SYNC_REQUEST);
-	}
+	/*
+	 * Notify the checkpointer about it.  If we fail to queue a message in
+	 * retryOnError mode, we wait until space is available ... ugly, but
+	 * hopefully won't happen often.
+	 *
+	 * Don't allow interrupts while waiting.  Escaping with an error in the
+	 * case of SYNC_UNLINK_REQUEST would leave the no-longer-used file still
+	 * present on disk, which would be bad, so I'm inclined to assume that the
+	 * checkpointer will always empty the queue soon.
+	 *
+	 * XXX This concern would go away along with SYNC_UNLINK_REQUEST if we
+	 * figure out how to get rid of 'tombstone files'.  That would be a good
+	 * idea because holding interrupts is bad for ProcSignalBarrier.
+	 */
+	HOLD_INTERRUPTS();
+	ret = ForwardSyncRequest(ftag, type, retryOnError /* wait */);
+	RESUME_INTERRUPTS();
 
 	return ret;
 }
diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c
index ff46a0e3c7..15c35691b6 100644
--- a/src/backend/utils/activity/wait_event.c
+++ b/src/backend/utils/activity/wait_event.c
@@ -346,6 +346,9 @@ pgstat_get_wait_ipc(WaitEventIPC w)
 		case WAIT_EVENT_EXECUTE_GATHER:
 			event_name = "ExecuteGather";
 			break;
+		case WAIT_EVENT_FORWARD_SYNC_REQUEST:
+			event_name = "ForwardSyncRequest";
+			break;
 		case WAIT_EVENT_HASH_BATCH_ALLOCATE:
 			event_name = "HashBatchAllocate";
 			break;
@@ -497,9 +500,6 @@ pgstat_get_wait_timeout(WaitEventTimeout w)
 		case WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL:
 			event_name = "RecoveryRetrieveRetryInterval";
 			break;
-		case WAIT_EVENT_REGISTER_SYNC_REQUEST:
-			event_name = "RegisterSyncRequest";
-			break;
 		case WAIT_EVENT_VACUUM_DELAY:
 			event_name = "VacuumDelay";
 			break;
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index 2882efd67b..fa82007fc0 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -33,7 +33,9 @@ extern void CheckpointerMain(void) pg_attribute_noreturn();
 extern void RequestCheckpoint(int flags);
 extern void CheckpointWriteDelay(int flags, double progress);
 
-extern bool ForwardSyncRequest(const FileTag *ftag, SyncRequestType type);
+extern bool ForwardSyncRequest(const FileTag *ftag,
+							   SyncRequestType type,
+							   bool wait);
 
 extern void AbsorbSyncRequests(void);
 
diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h
index 1c39ce031a..3b07c3b878 100644
--- a/src/include/utils/wait_event.h
+++ b/src/include/utils/wait_event.h
@@ -91,6 +91,7 @@ typedef enum
 	WAIT_EVENT_CHECKPOINT_DONE,
 	WAIT_EVENT_CHECKPOINT_START,
 	WAIT_EVENT_EXECUTE_GATHER,
+	WAIT_EVENT_FORWARD_SYNC_REQUEST,
 	WAIT_EVENT_HASH_BATCH_ALLOCATE,
 	WAIT_EVENT_HASH_BATCH_ELECT,
 	WAIT_EVENT_HASH_BATCH_LOAD,
@@ -145,7 +146,6 @@ typedef enum
 	WAIT_EVENT_PG_SLEEP,
 	WAIT_EVENT_RECOVERY_APPLY_DELAY,
 	WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL,
-	WAIT_EVENT_REGISTER_SYNC_REQUEST,
 	WAIT_EVENT_VACUUM_DELAY,
 	WAIT_EVENT_VACUUM_TRUNCATE
 } WaitEventTimeout;
-- 
2.30.2

