Reducing WaitEventSet syscall churn

Started by Thomas Munro, almost 6 years ago. 15 messages.
#1 Thomas Munro
thomas.munro@gmail.com
3 attachment(s)

Hi,

Here are some patches to get rid of frequent system calls.

0001 changes all qualifying WaitLatch() calls to use a new function
WaitMyLatch() that reuses a common WaitEventSet. That's for callers
that only want to wait for their own latch or an optional timeout,
with automatic exit-on-postmaster-death.

0002 changes condition_variable.c to use WaitMyLatch(), instead of
maintaining its own local reusable WaitEventSet for the same purpose.
Perhaps that makes up for the extra fd consumed by 0001.

0003 changes pgstat.c to use its own local reusable WaitEventSet.

To see what I'm talking about, try tracing a whole cluster with eg
strace/truss/dtruss -f postgres -D pgdata. This applies to Linux
systems, or BSD/macOS systems with the nearby kqueue patch applied.
On systems that fall back to poll(), there aren't any setup/teardown
syscalls.
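
For example, on an epoll build each individual WaitLatch() call today
boils down to roughly the following sequence (illustrative C only;
selfpipe_fd and postmaster_fd stand in for latch.c's self-pipe and
postmaster-death pipe):

#include <sys/epoll.h>
#include <unistd.h>

static void
one_wait(int selfpipe_fd, int postmaster_fd, int timeout_ms)
{
	struct epoll_event ev = {.events = EPOLLIN};
	struct epoll_event occurred;
	int			epfd;

	epfd = epoll_create1(EPOLL_CLOEXEC);	/* CreateWaitEventSet() */
	epoll_ctl(epfd, EPOLL_CTL_ADD, selfpipe_fd, &ev);	/* WL_LATCH_SET */
	epoll_ctl(epfd, EPOLL_CTL_ADD, postmaster_fd, &ev); /* WL_EXIT_ON_PM_DEATH */
	epoll_wait(epfd, &occurred, 1, timeout_ms); /* WaitEventSetWait() */
	close(epfd);				/* FreeWaitEventSet() */
}

With a reused WaitEventSet, only the epoll_wait() call remains in the
steady state.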

Attachments:

0001-Add-WaitMyLatch-to-replace-many-WaitLatch-calls.patch (application/octet-stream)
From 1de79f8154baedfde3175319b11ca6a79921fec9 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 21 Jan 2020 12:17:09 +1300
Subject: [PATCH 1/3] Add WaitMyLatch() to replace many WaitLatch() calls.

WaitLatch() sets up and tears down a WaitEventSet every time.  A
common case is waiting for your own latch, optionally with a timeout,
and automatic exit on postmaster death.  Provide a function that
does that, with a reused WaitEventSet.  This avoids many system calls
on some builds.
---
 src/backend/access/transam/parallel.c       |  7 ++---
 src/backend/executor/nodeGather.c           |  3 +-
 src/backend/postmaster/autovacuum.c         |  6 ++--
 src/backend/postmaster/bgwriter.c           | 10 ++-----
 src/backend/postmaster/checkpointer.c       |  6 ++--
 src/backend/postmaster/pgarch.c             |  5 +---
 src/backend/postmaster/walwriter.c          |  5 +---
 src/backend/replication/basebackup.c        |  6 ++--
 src/backend/replication/logical/launcher.c  | 17 +++--------
 src/backend/replication/logical/tablesync.c |  8 ++----
 src/backend/storage/ipc/latch.c             | 32 +++++++++++++++++++++
 src/backend/storage/ipc/procsignal.c        |  5 +---
 src/backend/storage/lmgr/proc.c             |  7 ++---
 src/backend/utils/adt/misc.c                |  5 +---
 src/include/storage/latch.h                 |  1 +
 15 files changed, 58 insertions(+), 65 deletions(-)

diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c
index df06e7d174..e951ae75b9 100644
--- a/src/backend/access/transam/parallel.c
+++ b/src/backend/access/transam/parallel.c
@@ -705,9 +705,7 @@ WaitForParallelWorkersToAttach(ParallelContext *pcxt)
 				 * might also get set for some other reason, but if so we'll
 				 * just end up waiting for the same worker again.
 				 */
-				rc = WaitLatch(MyLatch,
-							   WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
-							   -1, WAIT_EVENT_BGWORKER_STARTUP);
+				rc = WaitMyLatch(-1, WAIT_EVENT_BGWORKER_STARTUP);
 
 				if (rc & WL_LATCH_SET)
 					ResetLatch(MyLatch);
@@ -825,8 +823,7 @@ WaitForParallelWorkersToFinish(ParallelContext *pcxt)
 			}
 		}
 
-		(void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, -1,
-						 WAIT_EVENT_PARALLEL_FINISH);
+		(void) WaitMyLatch(-1, WAIT_EVENT_PARALLEL_FINISH);
 		ResetLatch(MyLatch);
 	}
 
diff --git a/src/backend/executor/nodeGather.c b/src/backend/executor/nodeGather.c
index 6b8ed867d5..57d82eae8f 100644
--- a/src/backend/executor/nodeGather.c
+++ b/src/backend/executor/nodeGather.c
@@ -383,8 +383,7 @@ gather_readnext(GatherState *gatherstate)
 				return NULL;
 
 			/* Nothing to do except wait for developments. */
-			(void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
-							 WAIT_EVENT_EXECUTE_GATHER);
+			(void) WaitMyLatch(-1, WAIT_EVENT_EXECUTE_GATHER);
 			ResetLatch(MyLatch);
 			nvisited = 0;
 		}
diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c
index 6d1f28c327..b5c3142bed 100644
--- a/src/backend/postmaster/autovacuum.c
+++ b/src/backend/postmaster/autovacuum.c
@@ -641,10 +641,8 @@ AutoVacLauncherMain(int argc, char *argv[])
 		 * Wait until naptime expires or we get some type of signal (all the
 		 * signal handlers will wake us by calling SetLatch).
 		 */
-		(void) WaitLatch(MyLatch,
-						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-						 (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L),
-						 WAIT_EVENT_AUTOVACUUM_MAIN);
+		(void) WaitMyLatch((nap.tv_sec * 1000L) + (nap.tv_usec / 1000L),
+						   WAIT_EVENT_AUTOVACUUM_MAIN);
 
 		ResetLatch(MyLatch);
 
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 069e27e427..85c1c6891c 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -301,9 +301,7 @@ BackgroundWriterMain(void)
 		 * down with latch events that are likely to happen frequently during
 		 * normal operation.
 		 */
-		rc = WaitLatch(MyLatch,
-					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-					   BgWriterDelay /* ms */ , WAIT_EVENT_BGWRITER_MAIN);
+		rc = WaitMyLatch(BgWriterDelay /* ms */ , WAIT_EVENT_BGWRITER_MAIN);
 
 		/*
 		 * If no latch event and BgBufferSync says nothing's happening, extend
@@ -328,10 +326,8 @@ BackgroundWriterMain(void)
 			/* Ask for notification at next buffer allocation */
 			StrategyNotifyBgWriter(MyProc->pgprocno);
 			/* Sleep ... */
-			(void) WaitLatch(MyLatch,
-							 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-							 BgWriterDelay * HIBERNATE_FACTOR,
-							 WAIT_EVENT_BGWRITER_HIBERNATE);
+			(void) WaitMyLatch(BgWriterDelay * HIBERNATE_FACTOR,
+							   WAIT_EVENT_BGWRITER_HIBERNATE);
 			/* Reset the notification request in case we timed out */
 			StrategyNotifyBgWriter(-1);
 		}
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index e354a78725..b87dd03856 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -511,10 +511,8 @@ CheckpointerMain(void)
 			cur_timeout = Min(cur_timeout, XLogArchiveTimeout - elapsed_secs);
 		}
 
-		(void) WaitLatch(MyLatch,
-						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-						 cur_timeout * 1000L /* convert to ms */ ,
-						 WAIT_EVENT_CHECKPOINTER_MAIN);
+		(void) WaitMyLatch(cur_timeout * 1000L /* convert to ms */ ,
+						   WAIT_EVENT_CHECKPOINTER_MAIN);
 	}
 }
 
diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c
index 3ca30badb2..4833c1c343 100644
--- a/src/backend/postmaster/pgarch.c
+++ b/src/backend/postmaster/pgarch.c
@@ -361,10 +361,7 @@ pgarch_MainLoop(void)
 			{
 				int			rc;
 
-				rc = WaitLatch(MyLatch,
-							   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
-							   timeout * 1000L,
-							   WAIT_EVENT_ARCHIVER_MAIN);
+				rc = WaitMyLatch(timeout * 1000L, WAIT_EVENT_ARCHIVER_MAIN);
 				if (rc & WL_TIMEOUT)
 					wakened = true;
 				if (rc & WL_POSTMASTER_DEATH)
diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c
index 45a2757969..c9ac4eb742 100644
--- a/src/backend/postmaster/walwriter.c
+++ b/src/backend/postmaster/walwriter.c
@@ -253,9 +253,6 @@ WalWriterMain(void)
 		else
 			cur_timeout = WalWriterDelay * HIBERNATE_FACTOR;
 
-		(void) WaitLatch(MyLatch,
-						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-						 cur_timeout,
-						 WAIT_EVENT_WAL_WRITER_MAIN);
+		(void) WaitMyLatch(cur_timeout, WAIT_EVENT_WAL_WRITER_MAIN);
 	}
 }
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index dea8aab45e..0f1b54746d 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -1705,10 +1705,8 @@ throttle(size_t increment)
 		 * (TAR_SEND_SIZE / throttling_sample * elapsed_min_unit) should be
 		 * the maximum time to sleep. Thus the cast to long is safe.
 		 */
-		wait_result = WaitLatch(MyLatch,
-								WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-								(long) (sleep / 1000),
-								WAIT_EVENT_BASE_BACKUP_THROTTLE);
+		wait_result = WaitMyLatch((long) (sleep / 1000),
+								  WAIT_EVENT_BASE_BACKUP_THROTTLE);
 
 		if (wait_result & WL_LATCH_SET)
 			CHECK_FOR_INTERRUPTS();
diff --git a/src/backend/replication/logical/launcher.c b/src/backend/replication/logical/launcher.c
index aec885e987..13e811384d 100644
--- a/src/backend/replication/logical/launcher.c
+++ b/src/backend/replication/logical/launcher.c
@@ -211,9 +211,7 @@ WaitForReplicationWorkerAttach(LogicalRepWorker *worker,
 		 * We need timeout because we generally don't get notified via latch
 		 * about the worker attach.  But we don't expect to have to wait long.
 		 */
-		rc = WaitLatch(MyLatch,
-					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-					   10L, WAIT_EVENT_BGWORKER_STARTUP);
+		rc = WaitMyLatch(10L, WAIT_EVENT_BGWORKER_STARTUP);
 
 		if (rc & WL_LATCH_SET)
 		{
@@ -482,9 +480,7 @@ logicalrep_worker_stop(Oid subid, Oid relid)
 		LWLockRelease(LogicalRepWorkerLock);
 
 		/* Wait a bit --- we don't expect to have to wait long. */
-		rc = WaitLatch(MyLatch,
-					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-					   10L, WAIT_EVENT_BGWORKER_STARTUP);
+		rc = WaitMyLatch(10L, WAIT_EVENT_BGWORKER_STARTUP);
 
 		if (rc & WL_LATCH_SET)
 		{
@@ -526,9 +522,7 @@ logicalrep_worker_stop(Oid subid, Oid relid)
 		LWLockRelease(LogicalRepWorkerLock);
 
 		/* Wait a bit --- we don't expect to have to wait long. */
-		rc = WaitLatch(MyLatch,
-					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-					   10L, WAIT_EVENT_BGWORKER_SHUTDOWN);
+		rc = WaitMyLatch(10L, WAIT_EVENT_BGWORKER_SHUTDOWN);
 
 		if (rc & WL_LATCH_SET)
 		{
@@ -1034,10 +1028,7 @@ ApplyLauncherMain(Datum main_arg)
 		}
 
 		/* Wait for more work. */
-		rc = WaitLatch(MyLatch,
-					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-					   wait_time,
-					   WAIT_EVENT_LOGICAL_LAUNCHER_MAIN);
+		rc = WaitMyLatch(wait_time, WAIT_EVENT_LOGICAL_LAUNCHER_MAIN);
 
 		if (rc & WL_LATCH_SET)
 		{
diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c
index f8183cd488..44d8c275cc 100644
--- a/src/backend/replication/logical/tablesync.c
+++ b/src/backend/replication/logical/tablesync.c
@@ -185,9 +185,7 @@ wait_for_relation_state_change(Oid relid, char expected_state)
 		if (!worker)
 			return false;
 
-		(void) WaitLatch(MyLatch,
-						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-						 1000L, WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE);
+		(void) WaitMyLatch(1000L, WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE);
 
 		ResetLatch(MyLatch);
 	}
@@ -238,9 +236,7 @@ wait_for_worker_state_change(char expected_state)
 		 * Wait.  We expect to get a latch signal back from the apply worker,
 		 * but use a timeout in case it dies without sending one.
 		 */
-		rc = WaitLatch(MyLatch,
-					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-					   1000L, WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE);
+		rc = WaitMyLatch(1000L, WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE);
 
 		if (rc & WL_LATCH_SET)
 			ResetLatch(MyLatch);
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index d677ffbda7..27fc3df920 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -52,6 +52,7 @@
 #include "storage/latch.h"
 #include "storage/pmsignal.h"
 #include "storage/shmem.h"
+#include "utils/memutils.h"
 
 /*
  * Select the fd readiness primitive to use. Normally the "most modern"
@@ -420,6 +421,37 @@ WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
 	return ret;
 }
 
+/*
+ * For callers with the common requirement to wait for their own latch or a
+ * timeout (or -1 for no timeout), with automatic exit on postmaster death,
+ * this variant reuses a WaitEventSet.  On builds where that allocates kernel
+ * resources, this approach avoids building them up and tearing them down
+ * every time.
+ */
+int
+WaitMyLatch(long timeout, uint32 wait_event_info)
+{
+	static WaitEventSet *set;
+	WaitEvent	event;
+
+	/* Create on demand. */
+	if (set == NULL)
+	{
+		WaitEventSet *wes;
+
+		wes = CreateWaitEventSet(TopMemoryContext, 2);
+		AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
+		AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, NULL,
+						  NULL);
+		set = wes;
+	}
+
+	if (WaitEventSetWait(set, timeout, &event, 1, wait_event_info) == 0)
+		return WL_TIMEOUT;
+	else
+		return event.events;
+}
+
 /*
  * Sets a latch and wakes up anyone waiting on it.
  *
diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c
index 65d3946386..024cbc9cf9 100644
--- a/src/backend/storage/ipc/procsignal.c
+++ b/src/backend/storage/ipc/procsignal.c
@@ -395,10 +395,7 @@ WaitForProcSignalBarrier(uint64 generation)
 
 			CHECK_FOR_INTERRUPTS();
 
-			events =
-				WaitLatch(MyLatch,
-						  WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-						  timeout, WAIT_EVENT_PROC_SIGNAL_BARRIER);
+			events = WaitMyLatch(timeout, WAIT_EVENT_PROC_SIGNAL_BARRIER);
 			ResetLatch(MyLatch);
 
 			oldval = pg_atomic_read_u64(&slot->pss_barrierGeneration);
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 32df8c85a1..0789c30b9b 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -1283,8 +1283,8 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 		}
 		else
 		{
-			(void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
-							 PG_WAIT_LOCK | locallock->tag.lock.locktag_type);
+			(void) WaitMyLatch(-1,
+							   PG_WAIT_LOCK | locallock->tag.lock.locktag_type);
 			ResetLatch(MyLatch);
 			/* check for deadlocks first, as that's probably log-worthy */
 			if (got_deadlock_timeout)
@@ -1793,8 +1793,7 @@ CheckDeadLockAlert(void)
 void
 ProcWaitForSignal(uint32 wait_event_info)
 {
-	(void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
-					 wait_event_info);
+	(void) WaitMyLatch(-1, wait_event_info);
 	ResetLatch(MyLatch);
 	CHECK_FOR_INTERRUPTS();
 }
diff --git a/src/backend/utils/adt/misc.c b/src/backend/utils/adt/misc.c
index 323e36b81c..6937c143b7 100644
--- a/src/backend/utils/adt/misc.c
+++ b/src/backend/utils/adt/misc.c
@@ -378,10 +378,7 @@ pg_sleep(PG_FUNCTION_ARGS)
 		else
 			break;
 
-		(void) WaitLatch(MyLatch,
-						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-						 delay_ms,
-						 WAIT_EVENT_PG_SLEEP);
+		(void) WaitMyLatch(delay_ms, WAIT_EVENT_PG_SLEEP);
 		ResetLatch(MyLatch);
 	}
 
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index 46ae56cae3..d6fd154dc6 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -176,6 +176,7 @@ extern int	WaitLatch(Latch *latch, int wakeEvents, long timeout,
 					  uint32 wait_event_info);
 extern int	WaitLatchOrSocket(Latch *latch, int wakeEvents,
 							  pgsocket sock, long timeout, uint32 wait_event_info);
+extern int	WaitMyLatch(long timeout, uint32 wait_event_info);
 
 /*
  * Unix implementation uses SIGUSR1 for inter-process signaling.
-- 
2.20.1

0002-Use-WaitMyLatch-for-condition-variables.patch (application/octet-stream)
From 871be852a983c58d5efe9a1001711b4bbfe97ed8 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 21 Jan 2020 12:22:14 +1300
Subject: [PATCH 2/3] Use WaitMyLatch() for condition variables.

Previously, condition_variable.c maintained its own reusable
WaitEventSet.  Let's use the common one, shared by other simple
latch waiting code.
---
 src/backend/storage/lmgr/condition_variable.c | 24 +------------------
 1 file changed, 1 insertion(+), 23 deletions(-)

diff --git a/src/backend/storage/lmgr/condition_variable.c b/src/backend/storage/lmgr/condition_variable.c
index 0e7f07961e..1fdcbd45eb 100644
--- a/src/backend/storage/lmgr/condition_variable.c
+++ b/src/backend/storage/lmgr/condition_variable.c
@@ -30,9 +30,6 @@
 /* Initially, we are not prepared to sleep on any condition variable. */
 static ConditionVariable *cv_sleep_target = NULL;
 
-/* Reusable WaitEventSet. */
-static WaitEventSet *cv_wait_event_set = NULL;
-
 /*
  * Initialize a condition variable.
  */
@@ -62,23 +59,6 @@ ConditionVariablePrepareToSleep(ConditionVariable *cv)
 {
 	int			pgprocno = MyProc->pgprocno;
 
-	/*
-	 * If first time through in this process, create a WaitEventSet, which
-	 * we'll reuse for all condition variable sleeps.
-	 */
-	if (cv_wait_event_set == NULL)
-	{
-		WaitEventSet *new_event_set;
-
-		new_event_set = CreateWaitEventSet(TopMemoryContext, 2);
-		AddWaitEventToSet(new_event_set, WL_LATCH_SET, PGINVALID_SOCKET,
-						  MyLatch, NULL);
-		AddWaitEventToSet(new_event_set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
-						  NULL, NULL);
-		/* Don't set cv_wait_event_set until we have a correct WES. */
-		cv_wait_event_set = new_event_set;
-	}
-
 	/*
 	 * If some other sleep is already prepared, cancel it; this is necessary
 	 * because we have just one static variable tracking the prepared sleep,
@@ -176,15 +156,13 @@ ConditionVariableTimedSleep(ConditionVariable *cv, long timeout,
 
 	while (true)
 	{
-		WaitEvent	event;
 		bool		done = false;
 
 		/*
 		 * Wait for latch to be set.  (If we're awakened for some other
 		 * reason, the code below will cope anyway.)
 		 */
-		(void) WaitEventSetWait(cv_wait_event_set, cur_timeout, &event, 1,
-								wait_event_info);
+		(void) WaitMyLatch(cur_timeout, wait_event_info);
 
 		/* Reset latch before examining the state of the wait list. */
 		ResetLatch(MyLatch);
-- 
2.20.1

0003-Introduce-a-reusable-WaitEventSet-for-the-stats-coll.patch (application/octet-stream)
From 2fa09052df944df58fb15bfd461533d35041d2cd Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 21 Jan 2020 12:54:11 +1300
Subject: [PATCH 3/3] Introduce a reusable WaitEventSet for the stats
 collector.

Avoid setup and teardown for every wait.
---
 src/backend/postmaster/pgstat.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 51c486bebd..3a8120377a 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -4426,6 +4426,8 @@ PgstatCollectorMain(int argc, char *argv[])
 	int			len;
 	PgStat_Msg	msg;
 	int			wr;
+	WaitEvent	event;
+	WaitEventSet *wes;
 
 	/*
 	 * Ignore all signals usually bound to some action in the postmaster,
@@ -4455,6 +4457,12 @@ PgstatCollectorMain(int argc, char *argv[])
 	pgStatRunningInCollector = true;
 	pgStatDBHash = pgstat_read_statsfiles(InvalidOid, true, true);
 
+	/* Prepare to wait for our latch or data in our socket. */
+	wes = CreateWaitEventSet(CurrentMemoryContext, 3);
+	AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
+	AddWaitEventToSet(wes, WL_POSTMASTER_DEATH, PGINVALID_SOCKET, NULL, NULL);
+	AddWaitEventToSet(wes, WL_SOCKET_READABLE, pgStatSock, NULL, NULL);
+
 	/*
 	 * Loop to process messages until we get SIGQUIT or detect ungraceful
 	 * death of our parent postmaster.
@@ -4637,10 +4645,7 @@ PgstatCollectorMain(int argc, char *argv[])
 
 		/* Sleep until there's something to do */
 #ifndef WIN32
-		wr = WaitLatchOrSocket(MyLatch,
-							   WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE,
-							   pgStatSock, -1L,
-							   WAIT_EVENT_PGSTAT_MAIN);
+		wr = WaitEventSetWait(wes, -1L, &event, 1, WAIT_EVENT_PGSTAT_MAIN);
 #else
 
 		/*
@@ -4653,18 +4658,15 @@ PgstatCollectorMain(int argc, char *argv[])
 		 * to not provoke "using stale statistics" complaints from
 		 * backend_read_statsfile.
 		 */
-		wr = WaitLatchOrSocket(MyLatch,
-							   WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE | WL_TIMEOUT,
-							   pgStatSock,
-							   2 * 1000L /* msec */ ,
-							   WAIT_EVENT_PGSTAT_MAIN);
+		wr = WaitEventSetWait(wes, 2 * 1000L /* msec */, &event, 1,
+							  WAIT_EVENT_PGSTAT_MAIN);
 #endif
 
 		/*
 		 * Emergency bailout if postmaster has died.  This is to avoid the
 		 * necessity for manual cleanup of all postmaster children.
 		 */
-		if (wr & WL_POSTMASTER_DEATH)
+		if (wr == 1 && event.events == WL_POSTMASTER_DEATH)
 			break;
 	}							/* end of outer loop */
 
@@ -4673,6 +4675,8 @@ PgstatCollectorMain(int argc, char *argv[])
 	 */
 	pgstat_write_statsfiles(true, true);
 
+	FreeWaitEventSet(wes);
+
 	exit(0);
 }
 
-- 
2.20.1

#2 Thomas Munro
thomas.munro@gmail.com
In reply to: Thomas Munro (#1)
5 attachment(s)
Re: Reducing WaitEventSet syscall churn

On Tue, Jan 21, 2020 at 1:45 PM Thomas Munro <thomas.munro@gmail.com> wrote:

Here are some patches to get rid of frequent system calls.

Here is one more case that I was sitting on because I wasn't sure how
to do it: walreceiver.c. To make that work, libpq needs to be able to
tell you when the socket has changed, which I did with a counter that
is exposed to client code in patch 0004. The walreceiver change in
0005 works (trace the system calls on walreceiver to see the
difference), but perhaps we can come up with a better way to code it
so that eg logical/worker.c doesn't finish up duplicating the logic.
Thoughts?

Attachments:

0001-Add-WaitMyLatch-to-replace-many-WaitLatch-calls.patch (application/octet-stream)
From 6df09b3ed57369b47781ab941d02c51f8bb66869 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 21 Jan 2020 12:17:09 +1300
Subject: [PATCH 1/5] Add WaitMyLatch() to replace many WaitLatch() calls.

WaitLatch() sets up and tears down a WaitEventSet every time.  A
common case is waiting for your own latch, optionally with a timeout,
and automatic exit on postmaster death.  Provide a function that
does that, with a reused WaitEventSet.  This avoids frequent
epoll/kqueue setup and teardown system calls.
---
 src/backend/access/transam/parallel.c       |  7 ++---
 src/backend/executor/nodeGather.c           |  3 +-
 src/backend/postmaster/autovacuum.c         |  6 ++--
 src/backend/postmaster/bgwriter.c           | 10 ++-----
 src/backend/postmaster/checkpointer.c       |  6 ++--
 src/backend/postmaster/pgarch.c             |  5 +---
 src/backend/postmaster/walwriter.c          |  5 +---
 src/backend/replication/basebackup.c        |  6 ++--
 src/backend/replication/logical/launcher.c  | 17 +++--------
 src/backend/replication/logical/tablesync.c |  8 ++----
 src/backend/storage/ipc/latch.c             | 32 +++++++++++++++++++++
 src/backend/storage/ipc/procsignal.c        |  5 +---
 src/backend/storage/lmgr/proc.c             |  7 ++---
 src/backend/utils/adt/misc.c                |  5 +---
 src/include/storage/latch.h                 |  1 +
 15 files changed, 58 insertions(+), 65 deletions(-)

diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c
index df06e7d174..e951ae75b9 100644
--- a/src/backend/access/transam/parallel.c
+++ b/src/backend/access/transam/parallel.c
@@ -705,9 +705,7 @@ WaitForParallelWorkersToAttach(ParallelContext *pcxt)
 				 * might also get set for some other reason, but if so we'll
 				 * just end up waiting for the same worker again.
 				 */
-				rc = WaitLatch(MyLatch,
-							   WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
-							   -1, WAIT_EVENT_BGWORKER_STARTUP);
+				rc = WaitMyLatch(-1, WAIT_EVENT_BGWORKER_STARTUP);
 
 				if (rc & WL_LATCH_SET)
 					ResetLatch(MyLatch);
@@ -825,8 +823,7 @@ WaitForParallelWorkersToFinish(ParallelContext *pcxt)
 			}
 		}
 
-		(void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, -1,
-						 WAIT_EVENT_PARALLEL_FINISH);
+		(void) WaitMyLatch(-1, WAIT_EVENT_PARALLEL_FINISH);
 		ResetLatch(MyLatch);
 	}
 
diff --git a/src/backend/executor/nodeGather.c b/src/backend/executor/nodeGather.c
index 6b8ed867d5..57d82eae8f 100644
--- a/src/backend/executor/nodeGather.c
+++ b/src/backend/executor/nodeGather.c
@@ -383,8 +383,7 @@ gather_readnext(GatherState *gatherstate)
 				return NULL;
 
 			/* Nothing to do except wait for developments. */
-			(void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
-							 WAIT_EVENT_EXECUTE_GATHER);
+			(void) WaitMyLatch(-1, WAIT_EVENT_EXECUTE_GATHER);
 			ResetLatch(MyLatch);
 			nvisited = 0;
 		}
diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c
index 6d1f28c327..b5c3142bed 100644
--- a/src/backend/postmaster/autovacuum.c
+++ b/src/backend/postmaster/autovacuum.c
@@ -641,10 +641,8 @@ AutoVacLauncherMain(int argc, char *argv[])
 		 * Wait until naptime expires or we get some type of signal (all the
 		 * signal handlers will wake us by calling SetLatch).
 		 */
-		(void) WaitLatch(MyLatch,
-						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-						 (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L),
-						 WAIT_EVENT_AUTOVACUUM_MAIN);
+		(void) WaitMyLatch((nap.tv_sec * 1000L) + (nap.tv_usec / 1000L),
+						   WAIT_EVENT_AUTOVACUUM_MAIN);
 
 		ResetLatch(MyLatch);
 
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 069e27e427..85c1c6891c 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -301,9 +301,7 @@ BackgroundWriterMain(void)
 		 * down with latch events that are likely to happen frequently during
 		 * normal operation.
 		 */
-		rc = WaitLatch(MyLatch,
-					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-					   BgWriterDelay /* ms */ , WAIT_EVENT_BGWRITER_MAIN);
+		rc = WaitMyLatch(BgWriterDelay /* ms */ , WAIT_EVENT_BGWRITER_MAIN);
 
 		/*
 		 * If no latch event and BgBufferSync says nothing's happening, extend
@@ -328,10 +326,8 @@ BackgroundWriterMain(void)
 			/* Ask for notification at next buffer allocation */
 			StrategyNotifyBgWriter(MyProc->pgprocno);
 			/* Sleep ... */
-			(void) WaitLatch(MyLatch,
-							 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-							 BgWriterDelay * HIBERNATE_FACTOR,
-							 WAIT_EVENT_BGWRITER_HIBERNATE);
+			(void) WaitMyLatch(BgWriterDelay * HIBERNATE_FACTOR,
+							   WAIT_EVENT_BGWRITER_HIBERNATE);
 			/* Reset the notification request in case we timed out */
 			StrategyNotifyBgWriter(-1);
 		}
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index e354a78725..b87dd03856 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -511,10 +511,8 @@ CheckpointerMain(void)
 			cur_timeout = Min(cur_timeout, XLogArchiveTimeout - elapsed_secs);
 		}
 
-		(void) WaitLatch(MyLatch,
-						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-						 cur_timeout * 1000L /* convert to ms */ ,
-						 WAIT_EVENT_CHECKPOINTER_MAIN);
+		(void) WaitMyLatch(cur_timeout * 1000L /* convert to ms */ ,
+						   WAIT_EVENT_CHECKPOINTER_MAIN);
 	}
 }
 
diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c
index 3ca30badb2..4833c1c343 100644
--- a/src/backend/postmaster/pgarch.c
+++ b/src/backend/postmaster/pgarch.c
@@ -361,10 +361,7 @@ pgarch_MainLoop(void)
 			{
 				int			rc;
 
-				rc = WaitLatch(MyLatch,
-							   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
-							   timeout * 1000L,
-							   WAIT_EVENT_ARCHIVER_MAIN);
+				rc = WaitMyLatch(timeout * 1000L, WAIT_EVENT_ARCHIVER_MAIN);
 				if (rc & WL_TIMEOUT)
 					wakened = true;
 				if (rc & WL_POSTMASTER_DEATH)
diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c
index 45a2757969..c9ac4eb742 100644
--- a/src/backend/postmaster/walwriter.c
+++ b/src/backend/postmaster/walwriter.c
@@ -253,9 +253,6 @@ WalWriterMain(void)
 		else
 			cur_timeout = WalWriterDelay * HIBERNATE_FACTOR;
 
-		(void) WaitLatch(MyLatch,
-						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-						 cur_timeout,
-						 WAIT_EVENT_WAL_WRITER_MAIN);
+		(void) WaitMyLatch(cur_timeout, WAIT_EVENT_WAL_WRITER_MAIN);
 	}
 }
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index dea8aab45e..0f1b54746d 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -1705,10 +1705,8 @@ throttle(size_t increment)
 		 * (TAR_SEND_SIZE / throttling_sample * elapsed_min_unit) should be
 		 * the maximum time to sleep. Thus the cast to long is safe.
 		 */
-		wait_result = WaitLatch(MyLatch,
-								WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-								(long) (sleep / 1000),
-								WAIT_EVENT_BASE_BACKUP_THROTTLE);
+		wait_result = WaitMyLatch((long) (sleep / 1000),
+								  WAIT_EVENT_BASE_BACKUP_THROTTLE);
 
 		if (wait_result & WL_LATCH_SET)
 			CHECK_FOR_INTERRUPTS();
diff --git a/src/backend/replication/logical/launcher.c b/src/backend/replication/logical/launcher.c
index aec885e987..13e811384d 100644
--- a/src/backend/replication/logical/launcher.c
+++ b/src/backend/replication/logical/launcher.c
@@ -211,9 +211,7 @@ WaitForReplicationWorkerAttach(LogicalRepWorker *worker,
 		 * We need timeout because we generally don't get notified via latch
 		 * about the worker attach.  But we don't expect to have to wait long.
 		 */
-		rc = WaitLatch(MyLatch,
-					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-					   10L, WAIT_EVENT_BGWORKER_STARTUP);
+		rc = WaitMyLatch(10L, WAIT_EVENT_BGWORKER_STARTUP);
 
 		if (rc & WL_LATCH_SET)
 		{
@@ -482,9 +480,7 @@ logicalrep_worker_stop(Oid subid, Oid relid)
 		LWLockRelease(LogicalRepWorkerLock);
 
 		/* Wait a bit --- we don't expect to have to wait long. */
-		rc = WaitLatch(MyLatch,
-					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-					   10L, WAIT_EVENT_BGWORKER_STARTUP);
+		rc = WaitMyLatch(10L, WAIT_EVENT_BGWORKER_STARTUP);
 
 		if (rc & WL_LATCH_SET)
 		{
@@ -526,9 +522,7 @@ logicalrep_worker_stop(Oid subid, Oid relid)
 		LWLockRelease(LogicalRepWorkerLock);
 
 		/* Wait a bit --- we don't expect to have to wait long. */
-		rc = WaitLatch(MyLatch,
-					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-					   10L, WAIT_EVENT_BGWORKER_SHUTDOWN);
+		rc = WaitMyLatch(10L, WAIT_EVENT_BGWORKER_SHUTDOWN);
 
 		if (rc & WL_LATCH_SET)
 		{
@@ -1034,10 +1028,7 @@ ApplyLauncherMain(Datum main_arg)
 		}
 
 		/* Wait for more work. */
-		rc = WaitLatch(MyLatch,
-					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-					   wait_time,
-					   WAIT_EVENT_LOGICAL_LAUNCHER_MAIN);
+		rc = WaitMyLatch(wait_time, WAIT_EVENT_LOGICAL_LAUNCHER_MAIN);
 
 		if (rc & WL_LATCH_SET)
 		{
diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c
index f8183cd488..44d8c275cc 100644
--- a/src/backend/replication/logical/tablesync.c
+++ b/src/backend/replication/logical/tablesync.c
@@ -185,9 +185,7 @@ wait_for_relation_state_change(Oid relid, char expected_state)
 		if (!worker)
 			return false;
 
-		(void) WaitLatch(MyLatch,
-						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-						 1000L, WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE);
+		(void) WaitMyLatch(1000L, WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE);
 
 		ResetLatch(MyLatch);
 	}
@@ -238,9 +236,7 @@ wait_for_worker_state_change(char expected_state)
 		 * Wait.  We expect to get a latch signal back from the apply worker,
 		 * but use a timeout in case it dies without sending one.
 		 */
-		rc = WaitLatch(MyLatch,
-					   WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-					   1000L, WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE);
+		rc = WaitMyLatch(1000L, WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE);
 
 		if (rc & WL_LATCH_SET)
 			ResetLatch(MyLatch);
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index cbd495225c..a052e59987 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -55,6 +55,7 @@
 #include "storage/latch.h"
 #include "storage/pmsignal.h"
 #include "storage/shmem.h"
+#include "utils/memutils.h"
 
 /*
  * Select the fd readiness primitive to use. Normally the "most modern"
@@ -432,6 +433,37 @@ WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
 	return ret;
 }
 
+/*
+ * For callers with the common requirement to wait for their own latch or a
+ * timeout (or -1 for no timeout), with automatic exit on postmaster death,
+ * this variant reuses a WaitEventSet.  On builds where that allocates kernel
+ * resources, this approach avoids building them up and tearing them down
+ * every time.
+ */
+int
+WaitMyLatch(long timeout, uint32 wait_event_info)
+{
+	static WaitEventSet *set;
+	WaitEvent	event;
+
+	/* Create on demand. */
+	if (set == NULL)
+	{
+		WaitEventSet *wes;
+
+		wes = CreateWaitEventSet(TopMemoryContext, 2);
+		AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
+		AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, NULL,
+						  NULL);
+		set = wes;
+	}
+
+	if (WaitEventSetWait(set, timeout, &event, 1, wait_event_info) == 0)
+		return WL_TIMEOUT;
+	else
+		return event.events;
+}
+
 /*
  * Sets a latch and wakes up anyone waiting on it.
  *
diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c
index 65d3946386..024cbc9cf9 100644
--- a/src/backend/storage/ipc/procsignal.c
+++ b/src/backend/storage/ipc/procsignal.c
@@ -395,10 +395,7 @@ WaitForProcSignalBarrier(uint64 generation)
 
 			CHECK_FOR_INTERRUPTS();
 
-			events =
-				WaitLatch(MyLatch,
-						  WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-						  timeout, WAIT_EVENT_PROC_SIGNAL_BARRIER);
+			events = WaitMyLatch(timeout, WAIT_EVENT_PROC_SIGNAL_BARRIER);
 			ResetLatch(MyLatch);
 
 			oldval = pg_atomic_read_u64(&slot->pss_barrierGeneration);
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index eb321f72ea..665363e47e 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -1283,8 +1283,8 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
 		}
 		else
 		{
-			(void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
-							 PG_WAIT_LOCK | locallock->tag.lock.locktag_type);
+			(void) WaitMyLatch(-1,
+							   PG_WAIT_LOCK | locallock->tag.lock.locktag_type);
 			ResetLatch(MyLatch);
 			/* check for deadlocks first, as that's probably log-worthy */
 			if (got_deadlock_timeout)
@@ -1793,8 +1793,7 @@ CheckDeadLockAlert(void)
 void
 ProcWaitForSignal(uint32 wait_event_info)
 {
-	(void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
-					 wait_event_info);
+	(void) WaitMyLatch(-1, wait_event_info);
 	ResetLatch(MyLatch);
 	CHECK_FOR_INTERRUPTS();
 }
diff --git a/src/backend/utils/adt/misc.c b/src/backend/utils/adt/misc.c
index 323e36b81c..6937c143b7 100644
--- a/src/backend/utils/adt/misc.c
+++ b/src/backend/utils/adt/misc.c
@@ -378,10 +378,7 @@ pg_sleep(PG_FUNCTION_ARGS)
 		else
 			break;
 
-		(void) WaitLatch(MyLatch,
-						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-						 delay_ms,
-						 WAIT_EVENT_PG_SLEEP);
+		(void) WaitMyLatch(delay_ms, WAIT_EVENT_PG_SLEEP);
 		ResetLatch(MyLatch);
 	}
 
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index 46ae56cae3..d6fd154dc6 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -176,6 +176,7 @@ extern int	WaitLatch(Latch *latch, int wakeEvents, long timeout,
 					  uint32 wait_event_info);
 extern int	WaitLatchOrSocket(Latch *latch, int wakeEvents,
 							  pgsocket sock, long timeout, uint32 wait_event_info);
+extern int	WaitMyLatch(long timeout, uint32 wait_event_info);
 
 /*
  * Unix implementation uses SIGUSR1 for inter-process signaling.
-- 
2.23.0

0002-Use-WaitMyLatch-for-condition-variables.patch (application/octet-stream)
From 412500f1f2a4c2f33beddeba089f37acfe1ad99f Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 21 Jan 2020 12:22:14 +1300
Subject: [PATCH 2/5] Use WaitMyLatch() for condition variables.

Previously, condition_variable.c maintained its own reusable
WaitEventSet.  Let's use the new common one, shared by other simple
latch waiting code.
---
 src/backend/storage/lmgr/condition_variable.c | 24 +------------------
 1 file changed, 1 insertion(+), 23 deletions(-)

diff --git a/src/backend/storage/lmgr/condition_variable.c b/src/backend/storage/lmgr/condition_variable.c
index 37b6a4eecd..8ca294b8dd 100644
--- a/src/backend/storage/lmgr/condition_variable.c
+++ b/src/backend/storage/lmgr/condition_variable.c
@@ -30,9 +30,6 @@
 /* Initially, we are not prepared to sleep on any condition variable. */
 static ConditionVariable *cv_sleep_target = NULL;
 
-/* Reusable WaitEventSet. */
-static WaitEventSet *cv_wait_event_set = NULL;
-
 /*
  * Initialize a condition variable.
  */
@@ -62,23 +59,6 @@ ConditionVariablePrepareToSleep(ConditionVariable *cv)
 {
 	int			pgprocno = MyProc->pgprocno;
 
-	/*
-	 * If first time through in this process, create a WaitEventSet, which
-	 * we'll reuse for all condition variable sleeps.
-	 */
-	if (cv_wait_event_set == NULL)
-	{
-		WaitEventSet *new_event_set;
-
-		new_event_set = CreateWaitEventSet(TopMemoryContext, 2);
-		AddWaitEventToSet(new_event_set, WL_LATCH_SET, PGINVALID_SOCKET,
-						  MyLatch, NULL);
-		AddWaitEventToSet(new_event_set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
-						  NULL, NULL);
-		/* Don't set cv_wait_event_set until we have a correct WES. */
-		cv_wait_event_set = new_event_set;
-	}
-
 	/*
 	 * If some other sleep is already prepared, cancel it; this is necessary
 	 * because we have just one static variable tracking the prepared sleep,
@@ -170,15 +150,13 @@ ConditionVariableTimedSleep(ConditionVariable *cv, long timeout,
 
 	while (true)
 	{
-		WaitEvent	event;
 		bool		done = false;
 
 		/*
 		 * Wait for latch to be set.  (If we're awakened for some other
 		 * reason, the code below will cope anyway.)
 		 */
-		(void) WaitEventSetWait(cv_wait_event_set, cur_timeout, &event, 1,
-								wait_event_info);
+		(void) WaitMyLatch(cur_timeout, wait_event_info);
 
 		/* Reset latch before examining the state of the wait list. */
 		ResetLatch(MyLatch);
-- 
2.23.0

0003-Introduce-a-reusable-WaitEventSet-for-the-stats-coll.patch (application/octet-stream)
From e77dbecee4330272964b7ea2838a4d9501d804b4 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 21 Jan 2020 12:54:11 +1300
Subject: [PATCH 3/5] Introduce a reusable WaitEventSet for the stats
 collector.

Avoid setup and teardown for every wait.
---
 src/backend/postmaster/pgstat.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 7169509a79..953bc9b6ae 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -4426,6 +4426,8 @@ PgstatCollectorMain(int argc, char *argv[])
 	int			len;
 	PgStat_Msg	msg;
 	int			wr;
+	WaitEvent	event;
+	WaitEventSet *wes;
 
 	/*
 	 * Ignore all signals usually bound to some action in the postmaster,
@@ -4455,6 +4457,12 @@ PgstatCollectorMain(int argc, char *argv[])
 	pgStatRunningInCollector = true;
 	pgStatDBHash = pgstat_read_statsfiles(InvalidOid, true, true);
 
+	/* Prepare to wait for our latch or data in our socket. */
+	wes = CreateWaitEventSet(CurrentMemoryContext, 3);
+	AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
+	AddWaitEventToSet(wes, WL_POSTMASTER_DEATH, PGINVALID_SOCKET, NULL, NULL);
+	AddWaitEventToSet(wes, WL_SOCKET_READABLE, pgStatSock, NULL, NULL);
+
 	/*
 	 * Loop to process messages until we get SIGQUIT or detect ungraceful
 	 * death of our parent postmaster.
@@ -4633,10 +4641,7 @@ PgstatCollectorMain(int argc, char *argv[])
 
 		/* Sleep until there's something to do */
 #ifndef WIN32
-		wr = WaitLatchOrSocket(MyLatch,
-							   WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE,
-							   pgStatSock, -1L,
-							   WAIT_EVENT_PGSTAT_MAIN);
+		wr = WaitEventSetWait(wes, -1L, &event, 1, WAIT_EVENT_PGSTAT_MAIN);
 #else
 
 		/*
@@ -4649,18 +4654,15 @@ PgstatCollectorMain(int argc, char *argv[])
 		 * to not provoke "using stale statistics" complaints from
 		 * backend_read_statsfile.
 		 */
-		wr = WaitLatchOrSocket(MyLatch,
-							   WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE | WL_TIMEOUT,
-							   pgStatSock,
-							   2 * 1000L /* msec */ ,
-							   WAIT_EVENT_PGSTAT_MAIN);
+		wr = WaitEventSetWait(wes, 2 * 1000L /* msec */, &event, 1,
+							  WAIT_EVENT_PGSTAT_MAIN);
 #endif
 
 		/*
 		 * Emergency bailout if postmaster has died.  This is to avoid the
 		 * necessity for manual cleanup of all postmaster children.
 		 */
-		if (wr & WL_POSTMASTER_DEATH)
+		if (wr == 1 && event.events == WL_POSTMASTER_DEATH)
 			break;
 	}							/* end of outer loop */
 
@@ -4669,6 +4671,8 @@ PgstatCollectorMain(int argc, char *argv[])
 	 */
 	pgstat_write_statsfiles(true, true);
 
+	FreeWaitEventSet(wes);
+
 	exit(0);
 }
 
-- 
2.23.0

0004-libpq-Add-PQsocketChangeCount-to-advertise-socket-ch.patch (application/octet-stream)
From a30a93319d5ada7928603553a1d82bb1f8409647 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Sat, 8 Feb 2020 09:01:19 +1300
Subject: [PATCH 4/5] libpq: Add PQsocketChangeCount to advertise socket
 changes.

Users of PQsocket() sometimes wish they could know when the
socket changes between calls, to allow for reuse of wait
event sets.  Provide a new function PQsocketChangeCount()
that advances whenever libpq opens or closes a socket.
---
 doc/src/sgml/libpq.sgml           | 17 +++++++++++++++++
 src/interfaces/libpq/exports.txt  |  1 +
 src/interfaces/libpq/fe-connect.c | 10 ++++++++++
 src/interfaces/libpq/libpq-fe.h   |  1 +
 src/interfaces/libpq/libpq-int.h  |  1 +
 5 files changed, 30 insertions(+)

diff --git a/doc/src/sgml/libpq.sgml b/doc/src/sgml/libpq.sgml
index 9a24c19ccb..14f0d998b6 100644
--- a/doc/src/sgml/libpq.sgml
+++ b/doc/src/sgml/libpq.sgml
@@ -2280,6 +2280,23 @@ int PQsocket(const PGconn *conn);
      </listitem>
     </varlistentry>
 
+    <varlistentry id="libpq-PQsocketChangeCount">
+     <term><function>PQsocketChangeCount</function><indexterm><primary>PQsocketChangeCount</primary></indexterm></term>
+     <listitem>
+      <para>
+       Returns a counter that increases whenever the socket has changed.
+       This can be used to know when the value returned by
+       <xref linkend="libpq-PQsocket"/> represents a different socket, even
+       if it happens to have the same file descriptor number.
+
+<synopsis>
+pg_int64 PQsocketChangeCount(const PGconn *conn);
+</synopsis>
+
+      </para>
+     </listitem>
+    </varlistentry>
+
     <varlistentry id="libpq-PQbackendPID">
      <term><function>PQbackendPID</function><indexterm><primary>PQbackendPID</primary></indexterm></term>
      <listitem>
diff --git a/src/interfaces/libpq/exports.txt b/src/interfaces/libpq/exports.txt
index 5fc1e5d289..e925ea320f 100644
--- a/src/interfaces/libpq/exports.txt
+++ b/src/interfaces/libpq/exports.txt
@@ -179,3 +179,4 @@ PQgetgssctx               176
 PQsetSSLKeyPassHook		  177
 PQgetSSLKeyPassHook		  178
 PQdefaultSSLKeyPassHook	  179
+PQsocketChangeCount       180
diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c
index 408000af83..dff73f1784 100644
--- a/src/interfaces/libpq/fe-connect.c
+++ b/src/interfaces/libpq/fe-connect.c
@@ -464,6 +464,7 @@ pqDropConnection(PGconn *conn, bool flushInput)
 	if (conn->sock != PGINVALID_SOCKET)
 		closesocket(conn->sock);
 	conn->sock = PGINVALID_SOCKET;
+	conn->sockChangeCount++;
 
 	/* Optionally discard any unread data */
 	if (flushInput)
@@ -2538,6 +2539,7 @@ keep_going:						/* We will come back to here until there is
 					 */
 
 					conn->sock = socket(addr_cur->ai_family, SOCK_STREAM, 0);
+					conn->sockChangeCount++;
 					if (conn->sock == PGINVALID_SOCKET)
 					{
 						/*
@@ -6706,6 +6708,14 @@ PQsocket(const PGconn *conn)
 	return (conn->sock != PGINVALID_SOCKET) ? conn->sock : -1;
 }
 
+pg_int64
+PQsocketChangeCount(const PGconn *conn)
+{
+	if (!conn)
+		return -1;
+	return conn->sockChangeCount;
+}
+
 int
 PQbackendPID(const PGconn *conn)
 {
diff --git a/src/interfaces/libpq/libpq-fe.h b/src/interfaces/libpq/libpq-fe.h
index c9e6ac2b76..8b21373e4d 100644
--- a/src/interfaces/libpq/libpq-fe.h
+++ b/src/interfaces/libpq/libpq-fe.h
@@ -327,6 +327,7 @@ extern int	PQprotocolVersion(const PGconn *conn);
 extern int	PQserverVersion(const PGconn *conn);
 extern char *PQerrorMessage(const PGconn *conn);
 extern int	PQsocket(const PGconn *conn);
+extern pg_int64 PQsocketChangeCount(const PGconn *conn);
 extern int	PQbackendPID(const PGconn *conn);
 extern int	PQconnectionNeedsPassword(const PGconn *conn);
 extern int	PQconnectionUsedPassword(const PGconn *conn);
diff --git a/src/interfaces/libpq/libpq-int.h b/src/interfaces/libpq/libpq-int.h
index 72931e6019..91b190f078 100644
--- a/src/interfaces/libpq/libpq-int.h
+++ b/src/interfaces/libpq/libpq-int.h
@@ -409,6 +409,7 @@ struct pg_conn
 	/* Connection data */
 	pgsocket	sock;			/* FD for socket, PGINVALID_SOCKET if
 								 * unconnected */
+	pg_int64	sockChangeCount;	/* advances when socket is opened/closed */
 	SockAddr	laddr;			/* Local address */
 	SockAddr	raddr;			/* Remote address */
 	ProtocolVersion pversion;	/* FE/BE protocol version in use */
-- 
2.23.0

0005-Reuse-a-WaitEventSet-in-walreceiver.patch (application/octet-stream)
From 92223cbf9f45f4de5449f41ea65195c17fc86f1a Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Sat, 8 Feb 2020 09:14:29 +1300
Subject: [PATCH 5/5] Reuse a WaitEventSet in walreceiver.

To avoid repeatedly setting up and tearing down WaitEventSet objects
and associated kernel objects, reuse a WaitEventSet.  Recreate it
whenever PQsocketChangeCount() indicates that the socket has changed
since the last wait.

XXX This code seems a bit ugly, changes the callback interface
for libpqwalreceiver (which other code cares about that?) and
will have to be duplicated in eg logical/worker.c.  Can we find
a nice design?
---
 .../libpqwalreceiver/libpqwalreceiver.c       | 10 ++--
 src/backend/replication/logical/tablesync.c   |  2 +-
 src/backend/replication/logical/worker.c      |  4 +-
 src/backend/replication/walreceiver.c         | 47 +++++++++++--------
 src/include/replication/walreceiver.h         |  7 +--
 5 files changed, 42 insertions(+), 28 deletions(-)

diff --git a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c
index e4fd1f9bb6..17a6f27a42 100644
--- a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c
+++ b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c
@@ -66,7 +66,7 @@ static bool libpqrcv_startstreaming(WalReceiverConn *conn,
 static void libpqrcv_endstreaming(WalReceiverConn *conn,
 								  TimeLineID *next_tli);
 static int	libpqrcv_receive(WalReceiverConn *conn, char **buffer,
-							 pgsocket *wait_fd);
+							 pgsocket *wait_fd, int64 *wait_fd_change_count);
 static void libpqrcv_send(WalReceiverConn *conn, const char *buffer,
 						  int nbytes);
 static char *libpqrcv_create_slot(WalReceiverConn *conn,
@@ -694,7 +694,9 @@ libpqrcv_disconnect(WalReceiverConn *conn)
  *	 until the next libpqrcv_* call.
  *
  *	 If no data was available immediately, returns 0, and *wait_fd is set to a
- *	 socket descriptor which can be waited on before trying again.
+ *	 socket descriptor which can be waited on before trying again.  If
+ *	 wait_fd_change_count is not NULL, then *wait_fd_change_count is set to
+ *	 a counter that can be used to detect changes in the socket between calls.
  *
  *	 -1 if the server ended the COPY.
  *
@@ -702,7 +704,7 @@ libpqrcv_disconnect(WalReceiverConn *conn)
  */
 static int
 libpqrcv_receive(WalReceiverConn *conn, char **buffer,
-				 pgsocket *wait_fd)
+				 pgsocket *wait_fd, int64 *wait_fd_change_count)
 {
 	int			rawlen;
 
@@ -726,6 +728,8 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer,
 		{
 			/* Tell caller to try again when our socket is ready. */
 			*wait_fd = PQsocket(conn->streamConn);
+			if (wait_fd_change_count)
+				*wait_fd_change_count = PQsocketChangeCount(conn->streamConn);
 			return 0;
 		}
 	}
diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c
index 44d8c275cc..f448e11409 100644
--- a/src/backend/replication/logical/tablesync.c
+++ b/src/backend/replication/logical/tablesync.c
@@ -580,7 +580,7 @@ copy_read_data(void *outbuf, int minread, int maxread)
 		for (;;)
 		{
 			/* Try read the data. */
-			len = walrcv_receive(wrconn, &buf, &fd);
+			len = walrcv_receive(wrconn, &buf, &fd, NULL);
 
 			CHECK_FOR_INTERRUPTS();
 
diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c
index 7a5471f95c..485231ccb3 100644
--- a/src/backend/replication/logical/worker.c
+++ b/src/backend/replication/logical/worker.c
@@ -1149,7 +1149,7 @@ LogicalRepApplyLoop(XLogRecPtr last_received)
 
 		MemoryContextSwitchTo(ApplyMessageContext);
 
-		len = walrcv_receive(wrconn, &buf, &fd);
+		len = walrcv_receive(wrconn, &buf, &fd, NULL);
 
 		if (len != 0)
 		{
@@ -1229,7 +1229,7 @@ LogicalRepApplyLoop(XLogRecPtr last_received)
 					MemoryContextReset(ApplyMessageContext);
 				}
 
-				len = walrcv_receive(wrconn, &buf, &fd);
+				len = walrcv_receive(wrconn, &buf, &fd, NULL);
 			}
 		}
 
diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c
index a5e85d32f3..bff726144d 100644
--- a/src/backend/replication/walreceiver.c
+++ b/src/backend/replication/walreceiver.c
@@ -310,6 +310,8 @@ WalReceiverMain(void)
 		char	   *primary_sysid;
 		char		standby_sysid[32];
 		WalRcvStreamOptions options;
+		WaitEventSet *wes = NULL;
+		int64		wes_change_count = -1;
 
 		/*
 		 * Check that we're connected to a valid server using the
@@ -434,6 +436,8 @@ WalReceiverMain(void)
 				int			len;
 				bool		endofwal = false;
 				pgsocket	wait_fd = PGINVALID_SOCKET;
+				int64		wait_fd_change_count;
+				WaitEvent	event;
 				int			rc;
 
 				/*
@@ -455,7 +459,8 @@ WalReceiverMain(void)
 				}
 
 				/* See if we can read data immediately */
-				len = walrcv_receive(wrconn, &buf, &wait_fd);
+				len = walrcv_receive(wrconn, &buf, &wait_fd,
+									 &wait_fd_change_count);
 				if (len != 0)
 				{
 					/*
@@ -486,7 +491,8 @@ WalReceiverMain(void)
 							endofwal = true;
 							break;
 						}
-						len = walrcv_receive(wrconn, &buf, &wait_fd);
+						len = walrcv_receive(wrconn, &buf, &wait_fd,
+											 &wait_fd_change_count);
 					}
 
 					/* Let the master know that we received some data. */
@@ -505,24 +511,27 @@ WalReceiverMain(void)
 					break;
 
 				/*
-				 * Ideally we would reuse a WaitEventSet object repeatedly
-				 * here to avoid the overheads of WaitLatchOrSocket on epoll
-				 * systems, but we can't be sure that libpq (or any other
-				 * walreceiver implementation) has the same socket (even if
-				 * the fd is the same number, it may have been closed and
-				 * reopened since the last time).  In future, if there is a
-				 * function for removing sockets from WaitEventSet, then we
-				 * could add and remove just the socket each time, potentially
-				 * avoiding some system calls.
+				 * If we don't have a WaitEventSet yet, or the socket has
+				 * changed, we need to create one.  Otherwise it's safe
+				 * to reuse the one we have, to cut down on system calls.
 				 */
 				Assert(wait_fd != PGINVALID_SOCKET);
-				rc = WaitLatchOrSocket(walrcv->latch,
-									   WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE |
-									   WL_TIMEOUT | WL_LATCH_SET,
-									   wait_fd,
-									   NAPTIME_PER_CYCLE,
-									   WAIT_EVENT_WAL_RECEIVER_MAIN);
-				if (rc & WL_LATCH_SET)
+				if (wes == NULL || wes_change_count != wait_fd_change_count)
+				{
+					if (wes != NULL)
+						FreeWaitEventSet(wes);
+					wes = CreateWaitEventSet(CurrentMemoryContext, 3);
+					AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET,
+									  walrcv->latch, NULL);
+					AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH,
+									  PGINVALID_SOCKET, NULL, NULL);
+					AddWaitEventToSet(wes, WL_SOCKET_READABLE, wait_fd, NULL,
+									  NULL);
+					wes_change_count = wait_fd_change_count;
+				}
+				rc = WaitEventSetWait(wes, NAPTIME_PER_CYCLE, &event, 1,
+									  WAIT_EVENT_WAL_RECEIVER_MAIN);
+				if (rc == 1 && event.events == WL_LATCH_SET)
 				{
 					ResetLatch(walrcv->latch);
 					ProcessWalRcvInterrupts();
@@ -540,7 +549,7 @@ WalReceiverMain(void)
 						XLogWalRcvSendReply(true, false);
 					}
 				}
-				if (rc & WL_TIMEOUT)
+				if (rc == 0)
 				{
 					/*
 					 * We didn't receive anything new. If we haven't heard
diff --git a/src/include/replication/walreceiver.h b/src/include/replication/walreceiver.h
index e08afc6548..f20bfc18cc 100644
--- a/src/include/replication/walreceiver.h
+++ b/src/include/replication/walreceiver.h
@@ -226,7 +226,8 @@ typedef bool (*walrcv_startstreaming_fn) (WalReceiverConn *conn,
 typedef void (*walrcv_endstreaming_fn) (WalReceiverConn *conn,
 										TimeLineID *next_tli);
 typedef int (*walrcv_receive_fn) (WalReceiverConn *conn, char **buffer,
-								  pgsocket *wait_fd);
+								  pgsocket *wait_fd,
+								  int64 *wait_fd_change_count);
 typedef void (*walrcv_send_fn) (WalReceiverConn *conn, const char *buffer,
 								int nbytes);
 typedef char *(*walrcv_create_slot_fn) (WalReceiverConn *conn,
@@ -279,8 +280,8 @@ extern PGDLLIMPORT WalReceiverFunctionsType *WalReceiverFunctions;
 	WalReceiverFunctions->walrcv_startstreaming(conn, options)
 #define walrcv_endstreaming(conn, next_tli) \
 	WalReceiverFunctions->walrcv_endstreaming(conn, next_tli)
-#define walrcv_receive(conn, buffer, wait_fd) \
-	WalReceiverFunctions->walrcv_receive(conn, buffer, wait_fd)
+#define walrcv_receive(conn, buffer, wait_fd, wait_event_change_count) \
+	WalReceiverFunctions->walrcv_receive(conn, buffer, wait_fd, wait_event_change_count)
 #define walrcv_send(conn, buffer, nbytes) \
 	WalReceiverFunctions->walrcv_send(conn, buffer, nbytes)
 #define walrcv_create_slot(conn, slotname, temporary, snapshot_action, lsn) \
-- 
2.23.0

#3 Thomas Munro
thomas.munro@gmail.com
In reply to: Thomas Munro (#2)
Re: Reducing WaitEventSet syscall churn

On Sat, Feb 8, 2020 at 10:00 AM Thomas Munro <thomas.munro@gmail.com> wrote:

On Tue, Jan 21, 2020 at 1:45 PM Thomas Munro <thomas.munro@gmail.com> wrote:

Here are some patches to get rid of frequent system calls.

Here is one more case that I was sitting on because I wasn't sure how
to do it: walreceiver.c. To make that work, libpq needs to be able to
tell you when the socket has changed, which I did with a counter that
is exposed to client code in patch 0004. The walreceiver change in
0005 works (trace the system calls on walreceiver to see the
difference), but perhaps we can come up with a better way to code it
so that eg logical/worker.c doesn't finish up duplicating the logic.
Thoughts?

(To be clear: I know the 0005 patch doesn't clean up after itself in
various cases; it's for discussion only, to see if others have ideas
about how to structure things to suit various potential users of
libpqwalreceiver.so.)

#4 Thomas Munro
thomas.munro@gmail.com
In reply to: Thomas Munro (#3)
10 attachment(s)
Re: Reducing WaitEventSet syscall churn

On Sat, Feb 8, 2020 at 10:15 AM Thomas Munro <thomas.munro@gmail.com> wrote:

Here are some patches to get rid of frequent system calls.

Here's a new version of this patch set. It gets rid of all temporary
WaitEventSets except a couple I mentioned in another thread [1].
WaitLatch() uses CommonWaitSet, and calls to WaitLatchOrSocket() are
replaced by either the existing FeBeWaitSet (walsender, gssapi/openssl
auth are also candidates) or a special purpose long lived WaitEventSet
(replication, postgres_fdw, pgstats). It passes make check-world with
WAIT_USE_POLL, WAIT_USE_KQUEUE, WAIT_USE_EPOLL, all with and without
-DEXEC_BACKEND, and make check with WAIT_USE_WIN32 (Appveyor).

[1] /messages/by-id/CA+hUKGK=m9dLrq42oWQ4XfK9iDjGiZVwpQ1HkHrAPfG7Kh681g@mail.gmail.com

0001: "Don't use EV_CLEAR for kqueue events."

This fixes a problem in the kqueue implementation that only shows up
once you switch to long lived WaitEventSets. It needs to be
level-triggered like the other implementations, for example because
there's a place in the recovery tests where we wait twice in a row
without trying to do I/O in between. (This is a bit like commit
3b790d256f8 that fixed a similar problem on Windows.)
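
To illustrate the semantics (a throwaway sketch, not from the patch
set; assume kq is a kqueue() descriptor and sock_fd is a socket that
has unread data):

    struct kevent kev;
    struct kevent out;

    /* Level-triggered registration, as 0001 now does it. */
    EV_SET(&kev, sock_fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
    kevent(kq, &kev, 1, NULL, 0, NULL);

    /* Both of these report sock_fd readable, because no I/O happened
     * in between.  With EV_ADD | EV_CLEAR, the second call would block
     * indefinitely, which is the problem this patch fixes. */
    kevent(kq, NULL, 0, &out, 1, NULL);
    kevent(kq, NULL, 0, &out, 1, NULL);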

0002: "Use a long lived WaitEventSet for WaitLatch()."

In the last version, I had a new function WaitMyLatch(), but that
didn't help with recoveryWakeupLatch. Switching between latches
doesn't require a syscall, so I didn't want to have a separate WES and
function just for that. So I went back to using plain old
WaitLatch(), and made it "modify" the latch every time before waiting
on CommonWaitSet. An alternative would be to get rid of the concept
of latches other than MyLatch, and change the function to
WaitMyLatch(). A similar thing happens for exit_on_postmaster_death,
for which I didn't want to have a separate WES, so I just set that
flag every time. Thoughts?
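
For reference, a typical sleep loop is unchanged from the caller's
point of view (sketch only; naptime and the wait_event name are made
up), it just no longer pays for a WaitEventSet setup/teardown on every
iteration:

    for (;;)
    {
        (void) WaitLatch(MyLatch,
                         WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
                         naptime,              /* hypothetical timeout */
                         WAIT_EVENT_EXAMPLE);  /* hypothetical name */
        ResetLatch(MyLatch);
        CHECK_FOR_INTERRUPTS();
        /* ... do periodic work ... */
    }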

0003: "Use regular WaitLatch() for condition variables."

That mechanism doesn't need its own WES anymore.

0004: "Introduce RemoveWaitEvent()."

We'll need that to be able to handle connections that are closed and
reopened under the covers by libpq (replication, postgres_fdw). We
also wanted this on a couple of other threads for multiplexing FDWs,
to be able to adjust the wait set dynamically for a proposed async
Append feature.

The implementation is a little naive, and leaves holes in the
"pollfds" and "handles" arrays (for poll() and win32 implementations).
That could be improved with a bit more footwork in a later patch.
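
The intended calling pattern looks like this (sketch; 'set' and 'sock'
stand for whatever the caller has):

    int pos;

    pos = AddWaitEventToSet(set, WL_SOCKET_READABLE, sock, NULL, NULL);
    /* ... wait on the set as usual ... */

    /* Pass fd_closed = true if the descriptor is already gone (for
     * example because PQfinish() closed it), so that we don't ask the
     * kernel to unregister an fd it no longer knows about. */
    RemoveWaitEvent(set, pos, true);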

XXX The Windows version is based on reading documentation. I'd be
very interested to know if check-world passes (especially
contrib/postgres_fdw and src/test/recovery). Unfortunately my
appveyor.yml fu is not yet strong enough.

0005: "libpq: Add PQsocketChangeCount to advertise socket changes."

To support a long lived WES, libpq needs a way to tell us when the socket
changes underneath our feet. This is the simplest thing I could think
of; better ideas welcome.
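
The intended usage is to cache the counter alongside your wait set and
compare it before each wait, along these lines (sketch; change_count
and sock_pos are hypothetical caller state):

    if (change_count != PQsocketChangeCount(conn))
    {
        /* The old socket is gone; register the current one. */
        RemoveWaitEvent(wes, sock_pos, true);
        sock_pos = AddWaitEventToSet(wes, WL_SOCKET_READABLE,
                                     PQsocket(conn), NULL, NULL);
        change_count = PQsocketChangeCount(conn);
    }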

0006: "Reuse a WaitEventSet in libpqwalreceiver.c."

Rather than having all users of libpqwalreceiver.c deal with the
complicated details of wait set management, have libpqwalreceiver
expose a waiting interface that understands socket changes.

Unfortunately, I couldn't figure out how to use CommonWaitSet for this
(ie adding and removing sockets to that as required), due to
complications with the bookkeeping required to provide the fd_closed
flag to RemoveWaitEvent(). So it creates its own internal long lived
WaitEventSet.
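
The effect on callers is that the old WaitLatchOrSocket() boilerplate
collapses to a single call; for example, 0006 reduces the wait in
logical/worker.c to:

    rc = walrcv_wait(wrconn, wait_time, WAIT_EVENT_LOGICAL_APPLY_MAIN);

    if (rc & WL_LATCH_SET)
        ResetLatch(MyLatch);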

0007: "Use a WaitEventSet for postgres_fdw."

Create a single WaitEventSet and use it for all FDW connections. By
having our own dedicated WES, we can do the bookkeeping required to
know when sockets have been closed or need to be removed from kernel wait
sets explicitly (which would be much harder to achieve if
CommonWaitSet were to be used like that; you need to know when sockets
are closed by other code, so you can provide fd_closed to
RemoveWaitEvent()).

Concretely, if you use just one postgres_fdw connection, you'll see
just epoll_wait()/kevent() calls for waits, but whenever you switch
between different connections, you'll see eg EPOLL_DEL/EV_DELETE
followed by EPOLL_ADD/EV_ADD when the set is adjusted (in the kqueue
implementation these could be collapsed into the following wait, but I
haven't done the work for that). An alternative would be to have one
WES per FDW connection, but that seemed wasteful of file descriptors.

0008: "Use WL_EXIT_ON_PM_DEATH in FeBeWaitSet."

The FATAL message you get if you happen to be waiting for IO rather
than waiting somewhere else seems arbitrarily different. By switching
to a standard automatic exit, it opens the possibility of using
FeBeWaitSet in a couple more places that would otherwise need to
create their own WES (see also [1]). Thoughts?

0009: "Use FeBeWaitSet for walsender.c."

Enabled by 0008.

0010: "Introduce a WaitEventSet for the stats collector."

[1]: /messages/by-id/CA+hUKGK=m9dLrq42oWQ4XfK9iDjGiZVwpQ1HkHrAPfG7Kh681g@mail.gmail.com

Attachments:

0001-Don-t-use-EV_CLEAR-for-kqueue-events.patchtext/x-patch; charset=US-ASCII; name=0001-Don-t-use-EV_CLEAR-for-kqueue-events.patchDownload
From 6787ffeb57043b2b19201a49e656d56ab58b2d7e Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 25 Feb 2020 02:53:35 +1300
Subject: [PATCH 01/11] Don't use EV_CLEAR for kqueue events.

For the semantics to match the epoll implementation, we need a
socket to continue to appear readable/writable if you wait
multiple times without doing I/O in between (in Linux
terminology, level-triggered rather than edge-triggered).
Similar to commit 3b790d256f8 for Windows.

Author: Thomas Munro
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/storage/ipc/latch.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 046ca5c6c7..3b6acfb251 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -991,7 +991,7 @@ WaitEventAdjustKqueueAdd(struct kevent *k_ev, int filter, int action,
 {
 	k_ev->ident = event->fd;
 	k_ev->filter = filter;
-	k_ev->flags = action | EV_CLEAR;
+	k_ev->flags = action;
 	k_ev->fflags = 0;
 	k_ev->data = 0;
 	AccessWaitEvent(k_ev) = event;
@@ -1003,7 +1003,7 @@ WaitEventAdjustKqueueAddPostmaster(struct kevent *k_ev, WaitEvent *event)
 	/* For now postmaster death can only be added, not removed. */
 	k_ev->ident = PostmasterPid;
 	k_ev->filter = EVFILT_PROC;
-	k_ev->flags = EV_ADD | EV_CLEAR;
+	k_ev->flags = EV_ADD;
 	k_ev->fflags = NOTE_EXIT;
 	k_ev->data = 0;
 	AccessWaitEvent(k_ev) = event;
-- 
2.20.1

0002-Use-a-long-lived-WaitEventSet-for-WaitLatch.patchtext/x-patch; charset=US-ASCII; name=0002-Use-a-long-lived-WaitEventSet-for-WaitLatch.patchDownload
From 4e528ac083260e10b9fbeb35b4fd14a5c1267e32 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 24 Feb 2020 15:39:24 +1300
Subject: [PATCH 02/11] Use a long lived WaitEventSet for WaitLatch().

Create CommonWaitSet at backend startup time, and use it to
implement WaitLatch().  This avoids a bunch of epoll/kqueue
system calls, and makes sure we don't run into EMFILE later
due to lack of file descriptors.

Reorder SubPostmasterMain() slightly so that we restore the
postmaster pipe and Windows signal before we reach
InitPostmasterChild(), to make this work in EXEC_BACKEND
builds.

Author: Thomas Munro
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/postmaster/postmaster.c | 24 +++++++-------
 src/backend/storage/ipc/latch.c     | 49 +++++++++++++++++++++++++++--
 src/backend/utils/init/miscinit.c   |  2 ++
 src/include/storage/latch.h         |  1 +
 4 files changed, 61 insertions(+), 15 deletions(-)

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index cd61665eea..b870346c75 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -4882,9 +4882,6 @@ SubPostmasterMain(int argc, char *argv[])
 	IsPostmasterEnvironment = true;
 	whereToSendOutput = DestNone;
 
-	/* Setup as postmaster child */
-	InitPostmasterChild();
-
 	/* Setup essential subsystems (to ensure elog() behaves sanely) */
 	InitializeGUCOptions();
 
@@ -4899,6 +4896,18 @@ SubPostmasterMain(int argc, char *argv[])
 	/* Close the postmaster's sockets (as soon as we know them) */
 	ClosePostmasterPorts(strcmp(argv[1], "--forklog") == 0);
 
+	/*
+	 * Start our win32 signal implementation. This has to be done after we
+	 * read the backend variables, because we need to pick up the signal pipe
+	 * from the parent process.
+	 */
+#ifdef WIN32
+	pgwin32_signal_initialize();
+#endif
+
+	/* Setup as postmaster child */
+	InitPostmasterChild();
+
 	/*
 	 * Set reference point for stack-depth checking
 	 */
@@ -4947,15 +4956,6 @@ SubPostmasterMain(int argc, char *argv[])
 	if (strcmp(argv[1], "--forkavworker") == 0)
 		AutovacuumWorkerIAm();
 
-	/*
-	 * Start our win32 signal implementation. This has to be done after we
-	 * read the backend variables, because we need to pick up the signal pipe
-	 * from the parent process.
-	 */
-#ifdef WIN32
-	pgwin32_signal_initialize();
-#endif
-
 	/* In EXEC_BACKEND case we will not have inherited these settings */
 	pqinitmask();
 	PG_SETMASK(&BlockSig);
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 3b6acfb251..30e461e965 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -56,6 +56,7 @@
 #include "storage/latch.h"
 #include "storage/pmsignal.h"
 #include "storage/shmem.h"
+#include "utils/memutils.h"
 
 /*
  * Select the fd readiness primitive to use. Normally the "most modern"
@@ -129,6 +130,9 @@ struct WaitEventSet
 #endif
 };
 
+/* A common WaitEventSet used to implement WaitLatch() */
+static WaitEventSet *CommonWaitSet;
+
 #ifndef WIN32
 /* Are we currently in WaitLatch? The signal handler would like to know. */
 static volatile sig_atomic_t waiting = false;
@@ -242,6 +246,20 @@ InitializeLatchSupport(void)
 #endif
 }
 
+void
+InitializeCommonWaitSet(void)
+{
+	Assert(CommonWaitSet == NULL);
+
+	/* Set up the WaitEventSet used by WaitLatch(). */
+	CommonWaitSet = CreateWaitEventSet(TopMemoryContext, 2);
+	AddWaitEventToSet(CommonWaitSet, WL_LATCH_SET, PGINVALID_SOCKET,
+					  MyLatch, NULL);
+	if (IsUnderPostmaster)
+		AddWaitEventToSet(CommonWaitSet, WL_EXIT_ON_PM_DEATH,
+						  PGINVALID_SOCKET, NULL, NULL);
+}
+
 /*
  * Initialize a process-local latch.
  */
@@ -365,8 +383,29 @@ int
 WaitLatch(Latch *latch, int wakeEvents, long timeout,
 		  uint32 wait_event_info)
 {
-	return WaitLatchOrSocket(latch, wakeEvents, PGINVALID_SOCKET, timeout,
-							 wait_event_info);
+	WaitEvent	event;
+
+	/* Postmaster-managed callers must handle postmaster death somehow. */
+	Assert(!IsUnderPostmaster ||
+		   (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
+		   (wakeEvents & WL_POSTMASTER_DEATH));
+
+	/*
+	 * Some callers may have a latch other than MyLatch, or want to handle
+	 * postmaster death differently.  It's cheap to assign those, so just do it
+	 * every time.
+	 */
+	ModifyWaitEvent(CommonWaitSet, 0, WL_LATCH_SET, latch);
+	CommonWaitSet->exit_on_postmaster_death =
+		((wakeEvents & WL_EXIT_ON_PM_DEATH) != 0);
+
+	if (WaitEventSetWait(CommonWaitSet,
+						 (wakeEvents & WL_TIMEOUT) ? timeout : -1,
+						 &event, 1,
+						 wait_event_info) == 0)
+		return WL_TIMEOUT;
+	else
+		return event.events;
 }
 
 /*
@@ -700,7 +739,10 @@ FreeWaitEventSet(WaitEventSet *set)
 	ReleaseExternalFD();
 #elif defined(WAIT_USE_KQUEUE)
-	close(set->kqueue_fd);
-	ReleaseExternalFD();
+	if (set->kqueue_fd >= 0)
+	{
+		close(set->kqueue_fd);
+		ReleaseExternalFD();
+	}
 #elif defined(WAIT_USE_WIN32)
 	WaitEvent  *cur_event;
 
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index b02bad4420..256a77fd32 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -291,6 +291,7 @@ InitPostmasterChild(void)
 	InitializeLatchSupport();
 	MyLatch = &LocalLatchData;
 	InitLatch(MyLatch);
+	InitializeCommonWaitSet();
 
 	/*
 	 * If possible, make this process a group leader, so that the postmaster
@@ -323,6 +324,7 @@ InitStandaloneProcess(const char *argv0)
 	InitializeLatchSupport();
 	MyLatch = &LocalLatchData;
 	InitLatch(MyLatch);
+	InitializeCommonWaitSet();
 
 	/* Compute paths, no postmaster to inherit from */
 	if (my_exec_path[0] == '\0')
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index 46ae56cae3..ec1865a8fd 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -176,6 +176,7 @@ extern int	WaitLatch(Latch *latch, int wakeEvents, long timeout,
 					  uint32 wait_event_info);
 extern int	WaitLatchOrSocket(Latch *latch, int wakeEvents,
 							  pgsocket sock, long timeout, uint32 wait_event_info);
+extern void InitializeCommonWaitSet(void);
 
 /*
  * Unix implementation uses SIGUSR1 for inter-process signaling.
-- 
2.20.1

0003-Use-WaitLatch-for-condition-variables.patchtext/x-patch; charset=US-ASCII; name=0003-Use-WaitLatch-for-condition-variables.patchDownload
From e15e7991f5cfa744e3816f109f9c7400213f8322 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 24 Feb 2020 15:49:00 +1300
Subject: [PATCH 03/11] Use WaitLatch() for condition variables.

Previously, condition_variable.c created its own long lived
WaitEventSet to avoid extra system calls.  WaitLatch() now
does that, so there is no point in wasting an extra kernel
descriptor.

Author: Thomas Munro
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/storage/lmgr/condition_variable.c | 28 ++++---------------
 1 file changed, 5 insertions(+), 23 deletions(-)

diff --git a/src/backend/storage/lmgr/condition_variable.c b/src/backend/storage/lmgr/condition_variable.c
index 37b6a4eecd..2ec00397b4 100644
--- a/src/backend/storage/lmgr/condition_variable.c
+++ b/src/backend/storage/lmgr/condition_variable.c
@@ -30,9 +30,6 @@
 /* Initially, we are not prepared to sleep on any condition variable. */
 static ConditionVariable *cv_sleep_target = NULL;
 
-/* Reusable WaitEventSet. */
-static WaitEventSet *cv_wait_event_set = NULL;
-
 /*
  * Initialize a condition variable.
  */
@@ -62,23 +59,6 @@ ConditionVariablePrepareToSleep(ConditionVariable *cv)
 {
 	int			pgprocno = MyProc->pgprocno;
 
-	/*
-	 * If first time through in this process, create a WaitEventSet, which
-	 * we'll reuse for all condition variable sleeps.
-	 */
-	if (cv_wait_event_set == NULL)
-	{
-		WaitEventSet *new_event_set;
-
-		new_event_set = CreateWaitEventSet(TopMemoryContext, 2);
-		AddWaitEventToSet(new_event_set, WL_LATCH_SET, PGINVALID_SOCKET,
-						  MyLatch, NULL);
-		AddWaitEventToSet(new_event_set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
-						  NULL, NULL);
-		/* Don't set cv_wait_event_set until we have a correct WES. */
-		cv_wait_event_set = new_event_set;
-	}
-
 	/*
 	 * If some other sleep is already prepared, cancel it; this is necessary
 	 * because we have just one static variable tracking the prepared sleep,
@@ -135,6 +115,7 @@ ConditionVariableTimedSleep(ConditionVariable *cv, long timeout,
 	long		cur_timeout = -1;
 	instr_time	start_time;
 	instr_time	cur_time;
+	int			wait_events;
 
 	/*
 	 * If the caller didn't prepare to sleep explicitly, then do so now and
@@ -166,19 +147,20 @@ ConditionVariableTimedSleep(ConditionVariable *cv, long timeout,
 		INSTR_TIME_SET_CURRENT(start_time);
 		Assert(timeout >= 0 && timeout <= INT_MAX);
 		cur_timeout = timeout;
+		wait_events = WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH;
 	}
+	else
+		wait_events = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH;
 
 	while (true)
 	{
-		WaitEvent	event;
 		bool		done = false;
 
 		/*
 		 * Wait for latch to be set.  (If we're awakened for some other
 		 * reason, the code below will cope anyway.)
 		 */
-		(void) WaitEventSetWait(cv_wait_event_set, cur_timeout, &event, 1,
-								wait_event_info);
+		(void) WaitLatch(MyLatch, wait_events, cur_timeout, wait_event_info);
 
 		/* Reset latch before examining the state of the wait list. */
 		ResetLatch(MyLatch);
-- 
2.20.1

0004-Introduce-RemoveWaitEvent.patchtext/x-patch; charset=US-ASCII; name=0004-Introduce-RemoveWaitEvent.patchDownload
From 587c0e3eb954f8da8eb77f101ac714ddb7c876bf Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 24 Feb 2020 19:05:00 +1300
Subject: [PATCH 04/11] Introduce RemoveWaitEvent().

This will allow WaitEventSet objects to be used in more
long lived scenarios, where sockets are added and removed.

Author: Thomas Munro
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/storage/ipc/latch.c | 131 ++++++++++++++++++++++++++++----
 src/include/storage/latch.h     |   3 +
 2 files changed, 120 insertions(+), 14 deletions(-)

diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 30e461e965..025545fc89 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -84,6 +84,7 @@ struct WaitEventSet
 {
 	int			nevents;		/* number of registered events */
 	int			nevents_space;	/* maximum number of events in this set */
+	int			free_list;		/* position of first free event */
 
 	/*
 	 * Array, of nevents_space length, storing the definition of events this
@@ -119,6 +120,8 @@ struct WaitEventSet
 #elif defined(WAIT_USE_POLL)
 	/* poll expects events to be waited on every poll() call, prepare once */
 	struct pollfd *pollfds;
+	/* track the populated range of pollfds */
+	int			npollfds;
 #elif defined(WAIT_USE_WIN32)
 
 	/*
@@ -127,6 +130,8 @@ struct WaitEventSet
 	 * event->pos + 1).
 	 */
 	HANDLE	   *handles;
+	/* track the populated range of handles */
+	int			nhandles;
 #endif
 };
 
@@ -642,13 +647,16 @@ CreateWaitEventSet(MemoryContext context, int nevents)
 #elif defined(WAIT_USE_POLL)
 	set->pollfds = (struct pollfd *) data;
 	data += MAXALIGN(sizeof(struct pollfd) * nevents);
+	set->npollfds = 0;
 #elif defined(WAIT_USE_WIN32)
 	set->handles = (HANDLE) data;
 	data += MAXALIGN(sizeof(HANDLE) * nevents);
+	set->nhandles = 0;
 #endif
 
 	set->latch = NULL;
 	set->nevents_space = nevents;
+	set->nevents = 0;
 	set->exit_on_postmaster_death = false;
 
 #if defined(WAIT_USE_EPOLL)
@@ -714,11 +722,25 @@ CreateWaitEventSet(MemoryContext context, int nevents)
 	 * Note: pgwin32_signal_event should be first to ensure that it will be
 	 * reported when multiple events are set.  We want to guarantee that
 	 * pending signals are serviced.
+	 *
+	 * We set unused handles to INVALID_HANDLE_VALUE, because
+	 * WaitForMultipleObjects() considers that to mean "this process" which is
+	 * not signaled until the process exits, so it's a way of leaving a hole in the
+	 * middle of the wait set if you remove something (just like -1 in the poll
+	 * implementation).  An alternative would be to fill in holes and create a
+	 * non 1-to-1 mapping between 'events' and 'handles'.
 	 */
 	set->handles[0] = pgwin32_signal_event;
-	StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
+	for (int i = 0; i < nevents; ++i)
+		set->handles[i + 1] = INVALID_HANDLE_VALUE;
 #endif
 
+	/* Set up the free list. */
+	for (int i = 0; i < nevents; ++i)
+		set->events[i].next_free = i + 1;
+	set->events[nevents - 1].next_free = -1;
+	set->free_list = 0;
+
 	return set;
 }
 
@@ -727,7 +749,6 @@ CreateWaitEventSet(MemoryContext context, int nevents)
  *
  * Note: preferably, this shouldn't have to free any resources that could be
  * inherited across an exec().  If it did, we'd likely leak those resources in
- * many scenarios.  For the epoll case, we ensure that by setting FD_CLOEXEC
  * when the FD is created.  For the Windows case, we assume that the handles
  * involved are non-inheritable.
  */
@@ -748,9 +769,12 @@ FreeWaitEventSet(WaitEventSet *set)
 	WaitEvent  *cur_event;
 
 	for (cur_event = set->events;
-		 cur_event < (set->events + set->nevents);
+		 cur_event < (set->events + set->nhandles);
 		 cur_event++)
 	{
+		if (set->handles[cur_event->pos + 1] == INVALID_HANDLE_VALUE)
+			continue;
+
 		if (cur_event->events & WL_LATCH_SET)
 		{
 			/* uses the latch's HANDLE */
@@ -805,9 +829,6 @@ AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
 {
 	WaitEvent  *event;
 
-	/* not enough space */
-	Assert(set->nevents < set->nevents_space);
-
 	if (events == WL_EXIT_ON_PM_DEATH)
 	{
 		events = WL_POSTMASTER_DEATH;
@@ -833,8 +854,12 @@ AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
 	if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK))
 		elog(ERROR, "cannot wait on socket event without a socket");
 
-	event = &set->events[set->nevents];
-	event->pos = set->nevents++;
+	/* Do we have any free slots? */
+	if (set->free_list == -1)
+		elog(ERROR, "WaitEventSet is full");
+
+	event = &set->events[set->free_list];
+	event->pos = set->free_list;
 	event->fd = fd;
 	event->events = events;
 	event->user_data = user_data;
@@ -868,6 +893,11 @@ AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
 	WaitEventAdjustWin32(set, event);
 #endif
 
+	/* Remove it from the free list. */
+	set->free_list = event->next_free;
+	event->next_free = -1;
+	set->nevents++;
+
 	return event->pos;
 }
 
@@ -885,7 +915,7 @@ ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
 	int			old_events;
 #endif
 
-	Assert(pos < set->nevents);
+	Assert(pos < set->nevents_space);
 
 	event = &set->events[pos];
 #if defined(WAIT_USE_KQUEUE)
@@ -933,6 +963,63 @@ ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
 #endif
 }
 
+/*
+ * If the descriptor has already been closed, the kernel should already have
+ * removed it from the wait set (except in WAIT_USE_POLL).  Pass in true for
+ * fd_closed in that case, so we don't try to remove it ourselves.
+ */
+void
+RemoveWaitEvent(WaitEventSet *set, int pos, bool fd_closed)
+{
+	WaitEvent  *event;
+
+	Assert(pos >= 0);
+	Assert(pos < set->nevents_space);
+	event = &set->events[pos];
+
+	/* For now only sockets can be removed */
+	if ((event->events & WL_SOCKET_MASK) == 0)
+		elog(ERROR, "event type cannot be removed");
+
+#if defined(WAIT_USE_EPOLL)
+	if (!fd_closed)
+		WaitEventAdjustEpoll(set, event, EPOLL_CTL_DEL);
+#elif defined(WAIT_USE_KQUEUE)
+	if (!fd_closed)
+	{
+		int old_events = event->events;
+
+		event->events = 0;
+		WaitEventAdjustKqueue(set, event, old_events);
+	}
+#elif defined(WAIT_USE_POLL)
+	/* no kernel state to remove, just blank out the fd */
+	set->pollfds[event->pos].fd = -1;
+	/* see if we can shrink the range of active fds */
+	while (set->npollfds > 0 &&
+		   set->pollfds[set->npollfds - 1].fd == -1)
+		set->npollfds -= 1;
+#elif defined(WAIT_USE_WIN32)
+	if (!fd_closed)
+		WSAEventSelect(event->fd, NULL, 0);
+	if (set->handles[event->pos + 1] != INVALID_HANDLE_VALUE)
+	{
+		WSACloseEvent(set->handles[event->pos + 1]);
+		set->handles[event->pos + 1] = INVALID_HANDLE_VALUE;
+	}
+	/* see if we can shrink the range of active handles */
+	while (set->nhandles > 0 &&
+		   set->handles[set->nhandles] == INVALID_HANDLE_VALUE)
+		set->nhandles -= 1;
+#endif
+
+	/* This position is now free. */
+	memset(event, 0, sizeof(*event));
+	event->next_free = set->free_list;
+	set->free_list = pos;
+	set->nevents--;
+}
+
 #if defined(WAIT_USE_EPOLL)
 /*
  * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
@@ -994,6 +1081,9 @@ WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
 	pollfd->revents = 0;
 	pollfd->fd = event->fd;
 
+	/* track the known range of populated slots */
+	set->npollfds = Max(event->pos + 1, set->npollfds);
+
 	/* prepare pollfd entry once */
 	if (event->events == WL_LATCH_SET)
 	{
@@ -1072,7 +1162,9 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
 	Assert(event->events != WL_LATCH_SET || set->latch != NULL);
 	Assert(event->events == WL_LATCH_SET ||
 		   event->events == WL_POSTMASTER_DEATH ||
-		   (event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)));
+		   (event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) ||
+		   (event->events == 0 &&
+			(old_events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))));
 
 	if (event->events == WL_POSTMASTER_DEATH)
 	{
@@ -1149,6 +1241,9 @@ WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
 {
 	HANDLE	   *handle = &set->handles[event->pos + 1];
 
+	/* track the known range of populated slots */
+	set->nhandles = Max(event->pos + 1, set->nhandles);
+
 	if (event->events == WL_LATCH_SET)
 	{
 		Assert(set->latch != NULL);
@@ -1169,12 +1264,15 @@ WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
 		if (event->events & WL_SOCKET_CONNECTED)
 			flags |= FD_CONNECT;
 
-		if (*handle == WSA_INVALID_EVENT)
+		if (*handle == INVALID_HANDLE_VALUE)
 		{
 			*handle = WSACreateEvent();
 			if (*handle == WSA_INVALID_EVENT)
+			{
+				*handle = INVALID_HANDLE_VALUE;
 				elog(ERROR, "failed to create event for socket: error code %u",
 					 WSAGetLastError());
+			}
 		}
 		if (WSAEventSelect(event->fd, *handle, flags) != 0)
 			elog(ERROR, "failed to set up event for socket: error code %u",
@@ -1304,6 +1402,11 @@ WaitEventSetWait(WaitEventSet *set, long timeout,
 	return returned_events;
 }
 
+int
+WaitEventSetSize(WaitEventSet *set)
+{
+	return set->nevents;
+}
 
 #if defined(WAIT_USE_EPOLL)
 
@@ -1589,7 +1692,7 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
 	struct pollfd *cur_pollfd;
 
 	/* Sleep */
-	rc = poll(set->pollfds, set->nevents, (int) cur_timeout);
+	rc = poll(set->pollfds, set->npollfds, (int) cur_timeout);
 
 	/* Check return code */
 	if (rc < 0)
@@ -1761,9 +1864,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
 	/*
 	 * Sleep.
 	 *
-	 * Need to wait for ->nevents + 1, because signal handle is in [0].
+	 * Need to wait for ->nhandles + 1, because signal handle is in [0].
 	 */
-	rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
+	rc = WaitForMultipleObjects(set->nhandles + 1, set->handles, FALSE,
 								cur_timeout);
 
 	/* Check return code */
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index ec1865a8fd..210f37659e 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -144,6 +144,7 @@ typedef struct WaitEvent
 	uint32		events;			/* triggered events */
 	pgsocket	fd;				/* socket fd associated with event */
 	void	   *user_data;		/* pointer provided in AddWaitEventToSet */
+	int			next_free;		/* free list for internal use */
 #ifdef WIN32
 	bool		reset;			/* Is reset of the event required? */
 #endif
@@ -168,6 +169,8 @@ extern void FreeWaitEventSet(WaitEventSet *set);
 extern int	AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd,
 							  Latch *latch, void *user_data);
 extern void ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch);
+extern void RemoveWaitEvent(WaitEventSet *set, int pos, bool fd_closed);
+extern int WaitEventSetSize(WaitEventSet *set);
 
 extern int	WaitEventSetWait(WaitEventSet *set, long timeout,
 							 WaitEvent *occurred_events, int nevents,
-- 
2.20.1

0005-libpq-Add-PQsocketChangeCount-to-report-socket-chang.patchtext/x-patch; charset=US-ASCII; name=0005-libpq-Add-PQsocketChangeCount-to-report-socket-chang.patchDownload
From aa042507c74a2449c2b70fdd1eb062334182a14a Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Sat, 8 Feb 2020 09:01:19 +1300
Subject: [PATCH 05/11] libpq: Add PQsocketChangeCount() to report socket
 changes.

Users of PQsocket() sometimes wish they could know when the
socket changes between calls, to allow for reuse of wait
event sets.  Provide a new function PQsocketChangeCount()
that advances whenever libpq opens or closes a socket.

Author: Thomas Munro
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 doc/src/sgml/libpq.sgml           | 17 +++++++++++++++++
 src/interfaces/libpq/exports.txt  |  1 +
 src/interfaces/libpq/fe-connect.c | 10 ++++++++++
 src/interfaces/libpq/libpq-fe.h   |  1 +
 src/interfaces/libpq/libpq-int.h  |  1 +
 5 files changed, 30 insertions(+)

diff --git a/doc/src/sgml/libpq.sgml b/doc/src/sgml/libpq.sgml
index 9a24c19ccb..14f0d998b6 100644
--- a/doc/src/sgml/libpq.sgml
+++ b/doc/src/sgml/libpq.sgml
@@ -2280,6 +2280,23 @@ int PQsocket(const PGconn *conn);
      </listitem>
     </varlistentry>
 
+    <varlistentry id="libpq-PQsocketChangeCount">
+     <term><function>PQsocketChangeCount</function><indexterm><primary>PQsocketChangeCount</primary></indexterm></term>
+     <listitem>
+      <para>
+       Returns a counter that increases whenever the socket has changed.
+       This can be used to know when the value returned by
+       <xref linkend="libpq-PQsocket"/> represents a different socket, even
+       if it happens to have the same file descriptor number.
+
+<synopsis>
+pg_int64 PQsocketChangeCount(const PGconn *conn);
+</synopsis>
+
+      </para>
+     </listitem>
+    </varlistentry>
+
     <varlistentry id="libpq-PQbackendPID">
      <term><function>PQbackendPID</function><indexterm><primary>PQbackendPID</primary></indexterm></term>
      <listitem>
diff --git a/src/interfaces/libpq/exports.txt b/src/interfaces/libpq/exports.txt
index 5fc1e5d289..e925ea320f 100644
--- a/src/interfaces/libpq/exports.txt
+++ b/src/interfaces/libpq/exports.txt
@@ -179,3 +179,4 @@ PQgetgssctx               176
 PQsetSSLKeyPassHook		  177
 PQgetSSLKeyPassHook		  178
 PQdefaultSSLKeyPassHook	  179
+PQsocketChangeCount       180
diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c
index 408000af83..dff73f1784 100644
--- a/src/interfaces/libpq/fe-connect.c
+++ b/src/interfaces/libpq/fe-connect.c
@@ -464,6 +464,7 @@ pqDropConnection(PGconn *conn, bool flushInput)
 	if (conn->sock != PGINVALID_SOCKET)
 		closesocket(conn->sock);
 	conn->sock = PGINVALID_SOCKET;
+	conn->sockChangeCount++;
 
 	/* Optionally discard any unread data */
 	if (flushInput)
@@ -2538,6 +2539,7 @@ keep_going:						/* We will come back to here until there is
 					 */
 
 					conn->sock = socket(addr_cur->ai_family, SOCK_STREAM, 0);
+					conn->sockChangeCount++;
 					if (conn->sock == PGINVALID_SOCKET)
 					{
 						/*
@@ -6706,6 +6708,14 @@ PQsocket(const PGconn *conn)
 	return (conn->sock != PGINVALID_SOCKET) ? conn->sock : -1;
 }
 
+pg_int64
+PQsocketChangeCount(const PGconn *conn)
+{
+	if (!conn)
+		return -1;
+	return conn->sockChangeCount;
+}
+
 int
 PQbackendPID(const PGconn *conn)
 {
diff --git a/src/interfaces/libpq/libpq-fe.h b/src/interfaces/libpq/libpq-fe.h
index c9e6ac2b76..8b21373e4d 100644
--- a/src/interfaces/libpq/libpq-fe.h
+++ b/src/interfaces/libpq/libpq-fe.h
@@ -327,6 +327,7 @@ extern int	PQprotocolVersion(const PGconn *conn);
 extern int	PQserverVersion(const PGconn *conn);
 extern char *PQerrorMessage(const PGconn *conn);
 extern int	PQsocket(const PGconn *conn);
+extern pg_int64 PQsocketChangeCount(const PGconn *conn);
 extern int	PQbackendPID(const PGconn *conn);
 extern int	PQconnectionNeedsPassword(const PGconn *conn);
 extern int	PQconnectionUsedPassword(const PGconn *conn);
diff --git a/src/interfaces/libpq/libpq-int.h b/src/interfaces/libpq/libpq-int.h
index 72931e6019..91b190f078 100644
--- a/src/interfaces/libpq/libpq-int.h
+++ b/src/interfaces/libpq/libpq-int.h
@@ -409,6 +409,7 @@ struct pg_conn
 	/* Connection data */
 	pgsocket	sock;			/* FD for socket, PGINVALID_SOCKET if
 								 * unconnected */
+	pg_int64	sockChangeCount;	/* advances when socket is opened/closed */
 	SockAddr	laddr;			/* Local address */
 	SockAddr	raddr;			/* Remote address */
 	ProtocolVersion pversion;	/* FE/BE protocol version in use */
-- 
2.20.1

0006-Reuse-a-WaitEventSet-in-libpqwalreceiver.c.patchtext/x-patch; charset=US-ASCII; name=0006-Reuse-a-WaitEventSet-in-libpqwalreceiver.c.patchDownload
From 2abdbcb71484af0f0c6e62f8fc9ce2be7d99ffaa Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Sat, 8 Feb 2020 09:14:29 +1300
Subject: [PATCH 06/11] Reuse a WaitEventSet in libpqwalreceiver.c.

To avoid repeatedly setting up and tearing down WaitEventSet objects
and associated kernel objects, reuse a WaitEventSet.  Export a wait
function that is smart enough to handle socket changes under the
covers, and then use that for physical and logical replication.

Author: Thomas Munro
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 .../libpqwalreceiver/libpqwalreceiver.c       | 126 +++++++++++++-----
 src/backend/replication/logical/tablesync.c   |   6 +-
 src/backend/replication/logical/worker.c      |   6 +-
 src/backend/replication/walreceiver.c         |  21 +--
 src/include/replication/walreceiver.h         |   5 +
 5 files changed, 101 insertions(+), 63 deletions(-)

diff --git a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c
index e4fd1f9bb6..e54581f37f 100644
--- a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c
+++ b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c
@@ -41,6 +41,11 @@ struct WalReceiverConn
 {
 	/* Current connection to the primary, if any */
 	PGconn	   *streamConn;
+	/* Wait event set used to wait for I/O */
+	WaitEventSet *wes;
+	/* Used to handle changes in the underlying socket */
+	int64		wes_fd_change_count;
+	int			wes_socket_position;
 	/* Used to remember if the connection is logical or physical */
 	bool		logical;
 	/* Buffer for currently read records */
@@ -80,6 +85,7 @@ static WalRcvExecResult *libpqrcv_exec(WalReceiverConn *conn,
 									   const int nRetTypes,
 									   const Oid *retTypes);
 static void libpqrcv_disconnect(WalReceiverConn *conn);
+static int libpqrcv_wait(WalReceiverConn *conn, long timeout, int wait_event);
 
 static WalReceiverFunctionsType PQWalReceiverFunctions = {
 	libpqrcv_connect,
@@ -96,13 +102,16 @@ static WalReceiverFunctionsType PQWalReceiverFunctions = {
 	libpqrcv_create_slot,
 	libpqrcv_get_backend_pid,
 	libpqrcv_exec,
-	libpqrcv_disconnect
+	libpqrcv_disconnect,
+	libpqrcv_wait
 };
 
 /* Prototypes for private functions */
-static PGresult *libpqrcv_PQexec(PGconn *streamConn, const char *query);
-static PGresult *libpqrcv_PQgetResult(PGconn *streamConn);
+static PGresult *libpqrcv_PQexec(WalReceiverConn *conn, const char *query);
+static PGresult *libpqrcv_PQgetResult(WalReceiverConn *conn);
 static char *stringlist_to_identifierstr(PGconn *conn, List *strings);
+static void libpqrcv_prepare_to_wait_for_socket(WalReceiverConn *conn,
+												int io_flags);
 
 /*
  * Module initialization function
@@ -168,6 +177,15 @@ libpqrcv_connect(const char *conninfo, bool logical, const char *appname,
 		return NULL;
 	}
 
+	/* Create a WaitEventSet that will last as long as the connection. */
+	conn->wes = CreateWaitEventSet(TopMemoryContext, 3);
+	conn->wes_socket_position =
+		AddWaitEventToSet(conn->wes, WL_SOCKET_READABLE,
+						  PQsocket(conn->streamConn), NULL, NULL);
+	conn->wes_fd_change_count = PQsocketChangeCount(conn->streamConn);
+	AddWaitEventToSet(conn->wes, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
+	AddWaitEventToSet(conn->wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, NULL, NULL);
+
 	/*
 	 * Poll connection until we have OK or FAILED status.
 	 *
@@ -177,7 +195,7 @@ libpqrcv_connect(const char *conninfo, bool logical, const char *appname,
 	do
 	{
 		int			io_flag;
-		int			rc;
+		WaitEvent	event;
 
 		if (status == PGRES_POLLING_READING)
 			io_flag = WL_SOCKET_READABLE;
@@ -189,21 +207,19 @@ libpqrcv_connect(const char *conninfo, bool logical, const char *appname,
 		else
 			io_flag = WL_SOCKET_WRITEABLE;
 
-		rc = WaitLatchOrSocket(MyLatch,
-							   WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | io_flag,
-							   PQsocket(conn->streamConn),
-							   0,
-							   WAIT_EVENT_LIBPQWALRECEIVER_CONNECT);
+		libpqrcv_prepare_to_wait_for_socket(conn, io_flag);
+		(void) WaitEventSetWait(conn->wes, -1, &event, 1,
+								WAIT_EVENT_LIBPQWALRECEIVER_CONNECT);
 
 		/* Interrupted? */
-		if (rc & WL_LATCH_SET)
+		if (event.events == WL_LATCH_SET)
 		{
 			ResetLatch(MyLatch);
 			ProcessWalRcvInterrupts();
 		}
 
 		/* If socket is ready, advance the libpq state machine */
-		if (rc & io_flag)
+		if (event.events == io_flag)
 			status = PQconnectPoll(conn->streamConn);
 	} while (status != PGRES_POLLING_OK && status != PGRES_POLLING_FAILED);
 
@@ -322,7 +338,7 @@ libpqrcv_identify_system(WalReceiverConn *conn, TimeLineID *primary_tli)
 	 * Get the system identifier and timeline ID as a DataRow message from the
 	 * primary server.
 	 */
-	res = libpqrcv_PQexec(conn->streamConn, "IDENTIFY_SYSTEM");
+	res = libpqrcv_PQexec(conn, "IDENTIFY_SYSTEM");
 	if (PQresultStatus(res) != PGRES_TUPLES_OK)
 	{
 		PQclear(res);
@@ -431,7 +447,7 @@ libpqrcv_startstreaming(WalReceiverConn *conn,
 						 options->proto.physical.startpointTLI);
 
 	/* Start streaming. */
-	res = libpqrcv_PQexec(conn->streamConn, cmd.data);
+	res = libpqrcv_PQexec(conn, cmd.data);
 	pfree(cmd.data);
 
 	if (PQresultStatus(res) == PGRES_COMMAND_OK)
@@ -479,7 +495,7 @@ libpqrcv_endstreaming(WalReceiverConn *conn, TimeLineID *next_tli)
 	 * If we had not yet received CopyDone from the backend, PGRES_COPY_OUT is
 	 * also possible in case we aborted the copy in mid-stream.
 	 */
-	res = libpqrcv_PQgetResult(conn->streamConn);
+	res = libpqrcv_PQgetResult(conn);
 	if (PQresultStatus(res) == PGRES_TUPLES_OK)
 	{
 		/*
@@ -493,7 +509,7 @@ libpqrcv_endstreaming(WalReceiverConn *conn, TimeLineID *next_tli)
 		PQclear(res);
 
 		/* the result set should be followed by CommandComplete */
-		res = libpqrcv_PQgetResult(conn->streamConn);
+		res = libpqrcv_PQgetResult(conn);
 	}
 	else if (PQresultStatus(res) == PGRES_COPY_OUT)
 	{
@@ -506,7 +522,7 @@ libpqrcv_endstreaming(WalReceiverConn *conn, TimeLineID *next_tli)
 							pchomp(PQerrorMessage(conn->streamConn)))));
 
 		/* CommandComplete should follow */
-		res = libpqrcv_PQgetResult(conn->streamConn);
+		res = libpqrcv_PQgetResult(conn);
 	}
 
 	if (PQresultStatus(res) != PGRES_COMMAND_OK)
@@ -516,7 +532,7 @@ libpqrcv_endstreaming(WalReceiverConn *conn, TimeLineID *next_tli)
 	PQclear(res);
 
 	/* Verify that there are no more results */
-	res = libpqrcv_PQgetResult(conn->streamConn);
+	res = libpqrcv_PQgetResult(conn);
 	if (res != NULL)
 		ereport(ERROR,
 				(errmsg("unexpected result after CommandComplete: %s",
@@ -540,7 +556,7 @@ libpqrcv_readtimelinehistoryfile(WalReceiverConn *conn,
 	 * Request the primary to send over the history file for given timeline.
 	 */
 	snprintf(cmd, sizeof(cmd), "TIMELINE_HISTORY %u", tli);
-	res = libpqrcv_PQexec(conn->streamConn, cmd);
+	res = libpqrcv_PQexec(conn, cmd);
 	if (PQresultStatus(res) != PGRES_TUPLES_OK)
 	{
 		PQclear(res);
@@ -582,8 +598,9 @@ libpqrcv_readtimelinehistoryfile(WalReceiverConn *conn,
  * May return NULL, rather than an error result, on failure.
  */
 static PGresult *
-libpqrcv_PQexec(PGconn *streamConn, const char *query)
+libpqrcv_PQexec(WalReceiverConn *conn, const char *query)
 {
+	PGconn	   *streamConn = conn->streamConn;
 	PGresult   *lastResult = NULL;
 
 	/*
@@ -606,7 +623,7 @@ libpqrcv_PQexec(PGconn *streamConn, const char *query)
 		/* Wait for, and collect, the next PGresult. */
 		PGresult   *result;
 
-		result = libpqrcv_PQgetResult(streamConn);
+		result = libpqrcv_PQgetResult(conn);
 		if (result == NULL)
 			break;				/* query is complete, or failure */
 
@@ -631,30 +648,29 @@ libpqrcv_PQexec(PGconn *streamConn, const char *query)
  * Perform the equivalent of PQgetResult(), but watch for interrupts.
  */
 static PGresult *
-libpqrcv_PQgetResult(PGconn *streamConn)
+libpqrcv_PQgetResult(WalReceiverConn *conn)
 {
+	PGconn	   *streamConn = conn->streamConn;
+
 	/*
 	 * Collect data until PQgetResult is ready to get the result without
 	 * blocking.
 	 */
 	while (PQisBusy(streamConn))
 	{
-		int			rc;
+		WaitEvent	event;
 
 		/*
 		 * We don't need to break down the sleep into smaller increments,
 		 * since we'll get interrupted by signals and can handle any
 		 * interrupts here.
 		 */
-		rc = WaitLatchOrSocket(MyLatch,
-							   WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE |
-							   WL_LATCH_SET,
-							   PQsocket(streamConn),
-							   0,
-							   WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE);
+		libpqrcv_prepare_to_wait_for_socket(conn, WL_SOCKET_READABLE);
+		(void) WaitEventSetWait(conn->wes, -1, &event, 1,
+								WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE);
 
 		/* Interrupted? */
-		if (rc & WL_LATCH_SET)
+		if (event.events == WL_LATCH_SET)
 		{
 			ResetLatch(MyLatch);
 			ProcessWalRcvInterrupts();
@@ -681,9 +697,27 @@ libpqrcv_disconnect(WalReceiverConn *conn)
 	PQfinish(conn->streamConn);
 	if (conn->recvBuf != NULL)
 		PQfreemem(conn->recvBuf);
+	FreeWaitEventSet(conn->wes);
 	pfree(conn);
 }
 
+/*
+ * Wait for new data to arrive, or a timeout.
+ */
+static int
+libpqrcv_wait(WalReceiverConn *conn, long timeout, int wait_event)
+{
+	WaitEvent	event;
+	int			rc;
+
+	libpqrcv_prepare_to_wait_for_socket(conn, WL_SOCKET_READABLE);
+	rc = WaitEventSetWait(conn->wes, timeout, &event, 1, wait_event);
+	if (rc == 0)
+		return WL_TIMEOUT;
+
+	return event.events;
+}
+
 /*
  * Receive a message available from XLOG stream.
  *
@@ -701,8 +735,7 @@ libpqrcv_disconnect(WalReceiverConn *conn)
  * ereports on error.
  */
 static int
-libpqrcv_receive(WalReceiverConn *conn, char **buffer,
-				 pgsocket *wait_fd)
+libpqrcv_receive(WalReceiverConn *conn, char **buffer, pgsocket *wait_fd)
 {
 	int			rawlen;
 
@@ -733,13 +766,13 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer,
 	{
 		PGresult   *res;
 
-		res = libpqrcv_PQgetResult(conn->streamConn);
+		res = libpqrcv_PQgetResult(conn);
 		if (PQresultStatus(res) == PGRES_COMMAND_OK)
 		{
 			PQclear(res);
 
 			/* Verify that there are no more results. */
-			res = libpqrcv_PQgetResult(conn->streamConn);
+			res = libpqrcv_PQgetResult(conn);
 			if (res != NULL)
 			{
 				PQclear(res);
@@ -839,7 +872,7 @@ libpqrcv_create_slot(WalReceiverConn *conn, const char *slotname,
 		appendStringInfoString(&cmd, " PHYSICAL RESERVE_WAL");
 	}
 
-	res = libpqrcv_PQexec(conn->streamConn, cmd.data);
+	res = libpqrcv_PQexec(conn, cmd.data);
 	pfree(cmd.data);
 
 	if (PQresultStatus(res) != PGRES_TUPLES_OK)
@@ -963,7 +996,7 @@ libpqrcv_exec(WalReceiverConn *conn, const char *query,
 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 				 errmsg("the query interface requires a database connection")));
 
-	pgres = libpqrcv_PQexec(conn->streamConn, query);
+	pgres = libpqrcv_PQexec(conn, query);
 
 	switch (PQresultStatus(pgres))
 	{
@@ -1047,3 +1080,26 @@ stringlist_to_identifierstr(PGconn *conn, List *strings)
 
 	return res.data;
 }
+
+/*
+ * Update our WaitEventSet so that we can wait for 'io_flags' on our socket,
+ * considering that the socket might have changed.
+ */
+static void
+libpqrcv_prepare_to_wait_for_socket(WalReceiverConn *conn, int io_flags)
+{
+	if (conn->wes_fd_change_count != PQsocketChangeCount(conn->streamConn))
+	{
+		/* The previous socket has been closed.  Replace it with the new one. */
+		RemoveWaitEvent(conn->wes, conn->wes_socket_position, true);
+		conn->wes_socket_position =
+			AddWaitEventToSet(conn->wes, io_flags,
+							  PQsocket(conn->streamConn), NULL, NULL);
+		conn->wes_fd_change_count = PQsocketChangeCount(conn->streamConn);
+	}
+	else
+	{
+		/* No socket change needed, we just need to wait for the right event. */
+		ModifyWaitEvent(conn->wes, conn->wes_socket_position, io_flags, NULL);
+	}
+}
diff --git a/src/backend/replication/logical/tablesync.c b/src/backend/replication/logical/tablesync.c
index f8183cd488..6500d7275f 100644
--- a/src/backend/replication/logical/tablesync.c
+++ b/src/backend/replication/logical/tablesync.c
@@ -616,11 +616,7 @@ copy_read_data(void *outbuf, int minread, int maxread)
 		/*
 		 * Wait for more data or latch.
 		 */
-		(void) WaitLatchOrSocket(MyLatch,
-								 WL_SOCKET_READABLE | WL_LATCH_SET |
-								 WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-								 fd, 1000L, WAIT_EVENT_LOGICAL_SYNC_DATA);
-
+		(void) walrcv_wait(wrconn, 1000L, WAIT_EVENT_LOGICAL_SYNC_DATA);
 		ResetLatch(MyLatch);
 	}
 
diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c
index ad4a732fd2..f12763f54a 100644
--- a/src/backend/replication/logical/worker.c
+++ b/src/backend/replication/logical/worker.c
@@ -1278,11 +1278,7 @@ LogicalRepApplyLoop(XLogRecPtr last_received)
 		else
 			wait_time = NAPTIME_PER_CYCLE;
 
-		rc = WaitLatchOrSocket(MyLatch,
-							   WL_SOCKET_READABLE | WL_LATCH_SET |
-							   WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-							   fd, wait_time,
-							   WAIT_EVENT_LOGICAL_APPLY_MAIN);
+		rc = walrcv_wait(wrconn, wait_time, WAIT_EVENT_LOGICAL_APPLY_MAIN);
 
 		if (rc & WL_LATCH_SET)
 		{
diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c
index 2ab15c3cbb..e5d054d1a1 100644
--- a/src/backend/replication/walreceiver.c
+++ b/src/backend/replication/walreceiver.c
@@ -503,24 +503,9 @@ WalReceiverMain(void)
 				if (endofwal)
 					break;
 
-				/*
-				 * Ideally we would reuse a WaitEventSet object repeatedly
-				 * here to avoid the overheads of WaitLatchOrSocket on epoll
-				 * systems, but we can't be sure that libpq (or any other
-				 * walreceiver implementation) has the same socket (even if
-				 * the fd is the same number, it may have been closed and
-				 * reopened since the last time).  In future, if there is a
-				 * function for removing sockets from WaitEventSet, then we
-				 * could add and remove just the socket each time, potentially
-				 * avoiding some system calls.
-				 */
-				Assert(wait_fd != PGINVALID_SOCKET);
-				rc = WaitLatchOrSocket(walrcv->latch,
-									   WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE |
-									   WL_TIMEOUT | WL_LATCH_SET,
-									   wait_fd,
-									   NAPTIME_PER_CYCLE,
-									   WAIT_EVENT_WAL_RECEIVER_MAIN);
+				/* Wait for latch or data. */
+				rc = walrcv_wait(wrconn, NAPTIME_PER_CYCLE,
+								 WAIT_EVENT_WAL_RECEIVER_MAIN);
 				if (rc & WL_LATCH_SET)
 				{
 					ResetLatch(walrcv->latch);
diff --git a/src/include/replication/walreceiver.h b/src/include/replication/walreceiver.h
index e08afc6548..115d6acf18 100644
--- a/src/include/replication/walreceiver.h
+++ b/src/include/replication/walreceiver.h
@@ -239,6 +239,8 @@ typedef WalRcvExecResult *(*walrcv_exec_fn) (WalReceiverConn *conn,
 											 const int nRetTypes,
 											 const Oid *retTypes);
 typedef void (*walrcv_disconnect_fn) (WalReceiverConn *conn);
+typedef int (*walrcv_wait_fn) (WalReceiverConn *conn, long timeout,
+							   int wait_event);
 
 typedef struct WalReceiverFunctionsType
 {
@@ -257,6 +259,7 @@ typedef struct WalReceiverFunctionsType
 	walrcv_get_backend_pid_fn walrcv_get_backend_pid;
 	walrcv_exec_fn walrcv_exec;
 	walrcv_disconnect_fn walrcv_disconnect;
+	walrcv_wait_fn walrcv_wait;
 } WalReceiverFunctionsType;
 
 extern PGDLLIMPORT WalReceiverFunctionsType *WalReceiverFunctions;
@@ -291,6 +294,8 @@ extern PGDLLIMPORT WalReceiverFunctionsType *WalReceiverFunctions;
 	WalReceiverFunctions->walrcv_exec(conn, exec, nRetTypes, retTypes)
 #define walrcv_disconnect(conn) \
 	WalReceiverFunctions->walrcv_disconnect(conn)
+#define walrcv_wait(conn, timeout, wait_event) \
+	WalReceiverFunctions->walrcv_wait(conn, timeout, wait_event)
 
 static inline void
 walrcv_clear_result(WalRcvExecResult *walres)
-- 
2.20.1

0007-Use-a-WaitEventSet-for-postgres_fdw.patchtext/x-patch; charset=US-ASCII; name=0007-Use-a-WaitEventSet-for-postgres_fdw.patchDownload
From 8ba696292d0c853236acbfd88a03e7bf040105e9 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 25 Feb 2020 15:34:12 +1300
Subject: [PATCH 07/11] Use a WaitEventSet for postgres_fdw.

The same WaitEventSet object will be reused for the life of the
backend.

Author: Thomas Munro
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 contrib/postgres_fdw/connection.c | 98 +++++++++++++++++++++++++++----
 1 file changed, 88 insertions(+), 10 deletions(-)

diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c
index e45647f3ea..0397a16702 100644
--- a/contrib/postgres_fdw/connection.c
+++ b/contrib/postgres_fdw/connection.c
@@ -65,6 +65,12 @@ typedef struct ConnCacheEntry
  */
 static HTAB *ConnectionHash = NULL;
 
+/* Reusable WaitEventSet. */
+static WaitEventSet *ConnectionWaitSet = NULL;
+static int64 ConnectionWaitSetSocketChangeCount = -1;
+static PGconn *ConnectionWaitSetConn = NULL;
+static int ConnectionWaitSetPosition = -1;
+
 /* for assigning cursor numbers and prepared statement numbers */
 static unsigned int cursor_number = 0;
 static unsigned int prep_stmt_number = 0;
@@ -92,6 +98,7 @@ static bool pgfdw_exec_cleanup_query(PGconn *conn, const char *query,
 static bool pgfdw_get_cleanup_result(PGconn *conn, TimestampTz endtime,
 									 PGresult **result);
 static bool UserMappingPasswordRequired(UserMapping *user);
+static int pgfdw_wait_for_socket(PGconn *conn, long timeout);
 
 /*
  * Get a PGconn which can be used to execute queries on the remote PostgreSQL
@@ -115,6 +122,17 @@ GetConnection(UserMapping *user, bool will_prep_stmt)
 	{
 		HASHCTL		ctl;
 
+		/*
+		 * We'll use a single WaitEventSet for the lifetime of this backend,
+		 * and add and remove sockets as appropriate.  Only one socket will
+		 * be in it at a time.
+		 */
+		Assert(ConnectionWaitSet == NULL);
+		ConnectionWaitSet = CreateWaitEventSet(TopMemoryContext, 3);
+		AddWaitEventToSet(ConnectionWaitSet, WL_LATCH_SET, -1, MyLatch, NULL);
+		AddWaitEventToSet(ConnectionWaitSet, WL_EXIT_ON_PM_DEATH, -1, NULL,
+						  NULL);
+
 		MemSet(&ctl, 0, sizeof(ctl));
 		ctl.keysize = sizeof(ConnCacheKey);
 		ctl.entrysize = sizeof(ConnCacheEntry);
@@ -344,6 +362,14 @@ disconnect_pg_server(ConnCacheEntry *entry)
 	if (entry->conn != NULL)
 	{
 		PQfinish(entry->conn);
+		if (ConnectionWaitSetConn == entry->conn)
+		{
+			/* We do this after PQfinish, so we know the socket is closed. */
+			RemoveWaitEvent(ConnectionWaitSet,
+							ConnectionWaitSetPosition,
+							true);
+			ConnectionWaitSetConn = NULL;
+		}
 		entry->conn = NULL;
 		ReleaseExternalFD();
 	}
@@ -603,11 +629,7 @@ pgfdw_get_result(PGconn *conn, const char *query)
 				int			wc;
 
 				/* Sleep until there's something to do */
-				wc = WaitLatchOrSocket(MyLatch,
-									   WL_LATCH_SET | WL_SOCKET_READABLE |
-									   WL_EXIT_ON_PM_DEATH,
-									   PQsocket(conn),
-									   -1L, PG_WAIT_EXTENSION);
+				wc = pgfdw_wait_for_socket(conn, -1);
 				ResetLatch(MyLatch);
 
 				CHECK_FOR_INTERRUPTS();
@@ -1207,11 +1229,7 @@ pgfdw_get_cleanup_result(PGconn *conn, TimestampTz endtime, PGresult **result)
 				cur_timeout = Min(60000, secs * USECS_PER_SEC + microsecs);
 
 				/* Sleep until there's something to do */
-				wc = WaitLatchOrSocket(MyLatch,
-									   WL_LATCH_SET | WL_SOCKET_READABLE |
-									   WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-									   PQsocket(conn),
-									   cur_timeout, PG_WAIT_EXTENSION);
+				wc = pgfdw_wait_for_socket(conn, cur_timeout);
 				ResetLatch(MyLatch);
 
 				CHECK_FOR_INTERRUPTS();
@@ -1250,3 +1268,63 @@ exit:	;
 		*result = last_res;
 	return timed_out;
 }
+
+static int
+pgfdw_wait_for_socket(PGconn *conn, long timeout)
+{
+	WaitEvent	event;
+	int			rc;
+
+	/* If a different conn is in the set, or the socket changed, remove. */
+	if (ConnectionWaitSetConn)
+	{
+		bool socket_changed =
+			(ConnectionWaitSetSocketChangeCount !=
+			PQsocketChangeCount(ConnectionWaitSetConn));
+
+		if (ConnectionWaitSetConn == conn)
+		{
+			/*
+			 * This connection is already in there, but the socket might have
+			 * changed.  If so, remove it.
+			 */
+			if (socket_changed)
+			{
+				RemoveWaitEvent(ConnectionWaitSet,
+								ConnectionWaitSetPosition,
+								true);
+				ConnectionWaitSetConn = NULL;
+			}
+		}
+		else
+		{
+			/*
+			 * A different connection is in there.  Remove it, being careful
+			 * to report whether the socket was already closed (this affects
+			 * whether we unregister the fd with the kernel).
+			 */
+			RemoveWaitEvent(ConnectionWaitSet,
+							ConnectionWaitSetPosition,
+							socket_changed);
+			ConnectionWaitSetConn = NULL;
+		}
+	}
+
+	/* Do we need to add our connection? */
+	if (ConnectionWaitSetConn == NULL)
+	{
+		ConnectionWaitSetPosition =
+			AddWaitEventToSet(ConnectionWaitSet, WL_SOCKET_READABLE,
+							  PQsocket(conn), NULL, NULL);
+		ConnectionWaitSetConn = conn;
+		ConnectionWaitSetSocketChangeCount = PQsocketChangeCount(conn);
+	}
+
+	/* Finally, we can wait. */
+	rc = WaitEventSetWait(ConnectionWaitSet, timeout, &event, 1,
+						  PG_WAIT_EXTENSION);
+	if (rc == 0)
+		return WL_TIMEOUT;
+
+	return event.events;
+}
-- 
2.20.1

0008-Use-WL_EXIT_ON_PM_DEATH-in-FeBeWaitSet.patchtext/x-patch; charset=US-ASCII; name=0008-Use-WL_EXIT_ON_PM_DEATH-in-FeBeWaitSet.patchDownload
From 9e457207c655b11ae5ed3fe97d481a8b044b0674 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 24 Feb 2020 23:51:09 +1300
Subject: [PATCH 08/11] Use WL_EXIT_ON_PM_DEATH in FeBeWaitSet.

Previously, we'd give either a FATAL message or a silent exit() when
we detected postmaster death, depending on which wait point we were at.
Make the exit more uniform, by using the standard exit facility.

Author: Thomas Munro
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/libpq/be-secure.c | 28 ----------------------------
 src/backend/libpq/pqcomm.c    |  2 +-
 2 files changed, 1 insertion(+), 29 deletions(-)

diff --git a/src/backend/libpq/be-secure.c b/src/backend/libpq/be-secure.c
index 2ae507a902..aec0926d93 100644
--- a/src/backend/libpq/be-secure.c
+++ b/src/backend/libpq/be-secure.c
@@ -184,28 +184,6 @@ retry:
 		WaitEventSetWait(FeBeWaitSet, -1 /* no timeout */ , &event, 1,
 						 WAIT_EVENT_CLIENT_READ);
 
-		/*
-		 * If the postmaster has died, it's not safe to continue running,
-		 * because it is the postmaster's job to kill us if some other backend
-		 * exits uncleanly.  Moreover, we won't run very well in this state;
-		 * helper processes like walwriter and the bgwriter will exit, so
-		 * performance may be poor.  Finally, if we don't exit, pg_ctl will be
-		 * unable to restart the postmaster without manual intervention, so no
-		 * new connections can be accepted.  Exiting clears the deck for a
-		 * postmaster restart.
-		 *
-		 * (Note that we only make this check when we would otherwise sleep on
-		 * our latch.  We might still continue running for a while if the
-		 * postmaster is killed in mid-query, or even through multiple queries
-		 * if we never have to wait for read.  We don't want to burn too many
-		 * cycles checking for this very rare condition, and this should cause
-		 * us to exit quickly in most cases.)
-		 */
-		if (event.events & WL_POSTMASTER_DEATH)
-			ereport(FATAL,
-					(errcode(ERRCODE_ADMIN_SHUTDOWN),
-					 errmsg("terminating connection due to unexpected postmaster exit")));
-
 		/* Handle interrupt. */
 		if (event.events & WL_LATCH_SET)
 		{
@@ -296,12 +274,6 @@ retry:
 		WaitEventSetWait(FeBeWaitSet, -1 /* no timeout */ , &event, 1,
 						 WAIT_EVENT_CLIENT_WRITE);
 
-		/* See comments in secure_read. */
-		if (event.events & WL_POSTMASTER_DEATH)
-			ereport(FATAL,
-					(errcode(ERRCODE_ADMIN_SHUTDOWN),
-					 errmsg("terminating connection due to unexpected postmaster exit")));
-
 		/* Handle interrupt. */
 		if (event.events & WL_LATCH_SET)
 		{
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 7717bb2719..5422175185 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -222,7 +222,7 @@ pq_init(void)
 	AddWaitEventToSet(FeBeWaitSet, WL_SOCKET_WRITEABLE, MyProcPort->sock,
 					  NULL, NULL);
 	AddWaitEventToSet(FeBeWaitSet, WL_LATCH_SET, -1, MyLatch, NULL);
-	AddWaitEventToSet(FeBeWaitSet, WL_POSTMASTER_DEATH, -1, NULL, NULL);
+	AddWaitEventToSet(FeBeWaitSet, WL_EXIT_ON_PM_DEATH, -1, NULL, NULL);
 }
 
 /* --------------------------------
-- 
2.20.1

0009-Use-FeBeWaitSet-for-walsender.c.patchtext/x-patch; charset=US-ASCII; name=0009-Use-FeBeWaitSet-for-walsender.c.patchDownload
From 3ba8b6165cbd975ede1cf06c03032fb65fc49278 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 24 Feb 2020 23:48:29 +1300
Subject: [PATCH 09/11] Use FeBeWaitSet for walsender.c.

Author: Thomas Munro
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/replication/walsender.c | 32 ++++++++++++++---------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index abb533b9d0..580187d636 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -1235,7 +1235,7 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
 	/* If we have pending write here, go to slow path */
 	for (;;)
 	{
-		int			wakeEvents;
+		WaitEvent	event;
 		long		sleeptime;
 
 		/* Check for input from the client */
@@ -1252,13 +1252,11 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
 
 		sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp());
 
-		wakeEvents = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH |
-			WL_SOCKET_WRITEABLE | WL_SOCKET_READABLE | WL_TIMEOUT;
-
 		/* Sleep until something happens or we time out */
-		(void) WaitLatchOrSocket(MyLatch, wakeEvents,
-								 MyProcPort->sock, sleeptime,
-								 WAIT_EVENT_WAL_SENDER_WRITE_DATA);
+		ModifyWaitEvent(FeBeWaitSet, 0,
+						WL_SOCKET_WRITEABLE | WL_SOCKET_READABLE, NULL);
+		(void) WaitEventSetWait(FeBeWaitSet, sleeptime, &event, 1,
+								WAIT_EVENT_WAL_SENDER_WRITE_DATA);
 
 		/* Clear any already-pending wakeups */
 		ResetLatch(MyLatch);
@@ -1342,6 +1340,7 @@ WalSndWaitForWal(XLogRecPtr loc)
 
 	for (;;)
 	{
+		WaitEvent	event;
 		long		sleeptime;
 
 		/* Clear any already-pending wakeups */
@@ -1435,15 +1434,14 @@ WalSndWaitForWal(XLogRecPtr loc)
 		 */
 		sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp());
 
-		wakeEvents = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH |
-			WL_SOCKET_READABLE | WL_TIMEOUT;
+		wakeEvents = WL_SOCKET_READABLE;
 
 		if (pq_is_send_pending())
 			wakeEvents |= WL_SOCKET_WRITEABLE;
 
-		(void) WaitLatchOrSocket(MyLatch, wakeEvents,
-								 MyProcPort->sock, sleeptime,
-								 WAIT_EVENT_WAL_SENDER_WAIT_WAL);
+		ModifyWaitEvent(FeBeWaitSet, 0, wakeEvents, NULL);
+		(void) WaitEventSetWait(FeBeWaitSet, sleeptime, &event, 1,
+								WAIT_EVENT_WAL_SENDER_WAIT_WAL);
 	}
 
 	/* reactivate latch so WalSndLoop knows to continue */
@@ -2282,9 +2280,9 @@ WalSndLoop(WalSndSendDataCallback send_data)
 		{
 			long		sleeptime;
 			int			wakeEvents;
+			WaitEvent	event;
 
-			wakeEvents = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT |
-				WL_SOCKET_READABLE;
+			wakeEvents = WL_SOCKET_READABLE;
 
 			/*
 			 * Use fresh timestamp, not last_processing, to reduce the chance
@@ -2296,9 +2294,9 @@ WalSndLoop(WalSndSendDataCallback send_data)
 				wakeEvents |= WL_SOCKET_WRITEABLE;
 
 			/* Sleep until something happens or we time out */
-			(void) WaitLatchOrSocket(MyLatch, wakeEvents,
-									 MyProcPort->sock, sleeptime,
-									 WAIT_EVENT_WAL_SENDER_MAIN);
+			ModifyWaitEvent(FeBeWaitSet, 0, wakeEvents, NULL);
+			(void) WaitEventSetWait(FeBeWaitSet, sleeptime, &event, 1,
+									WAIT_EVENT_WAL_SENDER_MAIN);
 		}
 	}
 }
-- 
2.20.1

0010-Introduce-a-WaitEventSet-for-the-stats-collector.patch (text/x-patch)
From c383e9cd06fb89b7fc020b330a3d9cd003d2993a Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 21 Jan 2020 12:54:11 +1300
Subject: [PATCH 10/11] Introduce a WaitEventSet for the stats collector.

Author: Thomas Munro
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/postmaster/pgstat.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 462b4d7e06..2e16f6cc66 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -4429,6 +4429,8 @@ PgstatCollectorMain(int argc, char *argv[])
 	int			len;
 	PgStat_Msg	msg;
 	int			wr;
+	WaitEvent	event;
+	WaitEventSet *wes;
 
 	/*
 	 * Ignore all signals usually bound to some action in the postmaster,
@@ -4458,6 +4460,12 @@ PgstatCollectorMain(int argc, char *argv[])
 	pgStatRunningInCollector = true;
 	pgStatDBHash = pgstat_read_statsfiles(InvalidOid, true, true);
 
+	/* Prepare to wait for our latch or data in our socket. */
+	wes = CreateWaitEventSet(CurrentMemoryContext, 3);
+	AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
+	AddWaitEventToSet(wes, WL_POSTMASTER_DEATH, PGINVALID_SOCKET, NULL, NULL);
+	AddWaitEventToSet(wes, WL_SOCKET_READABLE, pgStatSock, NULL, NULL);
+
 	/*
 	 * Loop to process messages until we get SIGQUIT or detect ungraceful
 	 * death of our parent postmaster.
@@ -4636,10 +4644,7 @@ PgstatCollectorMain(int argc, char *argv[])
 
 		/* Sleep until there's something to do */
 #ifndef WIN32
-		wr = WaitLatchOrSocket(MyLatch,
-							   WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE,
-							   pgStatSock, -1L,
-							   WAIT_EVENT_PGSTAT_MAIN);
+		wr = WaitEventSetWait(wes, -1L, &event, 1, WAIT_EVENT_PGSTAT_MAIN);
 #else
 
 		/*
@@ -4652,18 +4657,15 @@ PgstatCollectorMain(int argc, char *argv[])
 		 * to not provoke "using stale statistics" complaints from
 		 * backend_read_statsfile.
 		 */
-		wr = WaitLatchOrSocket(MyLatch,
-							   WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE | WL_TIMEOUT,
-							   pgStatSock,
-							   2 * 1000L /* msec */ ,
-							   WAIT_EVENT_PGSTAT_MAIN);
+		wr = WaitEventSetWait(wes, 2 * 1000L /* msec */, &event, 1,
+							  WAIT_EVENT_PGSTAT_MAIN);
 #endif
 
 		/*
 		 * Emergency bailout if postmaster has died.  This is to avoid the
 		 * necessity for manual cleanup of all postmaster children.
 		 */
-		if (wr & WL_POSTMASTER_DEATH)
+		if (wr == 1 && event.events == WL_POSTMASTER_DEATH)
 			break;
 	}							/* end of outer loop */
 
@@ -4672,6 +4674,8 @@ PgstatCollectorMain(int argc, char *argv[])
 	 */
 	pgstat_write_statsfiles(true, true);
 
+	FreeWaitEventSet(wes);
+
 	exit(0);
 }
 
-- 
2.20.1

#5Kyotaro Horiguchi
horikyota.ntt@gmail.com
In reply to: Thomas Munro (#4)
Re: Reducing WaitEventSet syscall churn

Hello.

I took a look at this.

At Thu, 27 Feb 2020 12:17:45 +1300, Thomas Munro <thomas.munro@gmail.com> wrote in

On Sat, Feb 8, 2020 at 10:15 AM Thomas Munro <thomas.munro@gmail.com> wrote:

Here are some patches to get rid of frequent system calls.

Here's a new version of this patch set. It gets rid of all temporary
WaitEventSets except a couple I mentioned in another thread[1].
WaitLatch() uses CommonWaitSet, and calls to WaitLatchOrSocket() are
replaced by either the existing FeBeWaitSet (walsender, gssapi/openssl
auth are also candidates) or a special purpose long lived WaitEventSet
(replication, postgres_fdw, pgstats). It passes make check-world with
WAIT_USE_POLL, WAIT_USE_KQUEUE, WAIT_USE_EPOLL, all with and without
-DEXEC_BACKEND, and make check with WAIT_USE_WIN32 (Appveyor).

0001: "Don't use EV_CLEAR for kqueue events."

This fixes a problem in the kqueue implementation that only shows up
once you switch to long lived WaitEventSets. It needs to be
level-triggered like the other implementations, for example because
there's a place in the recovery tests where we wait twice in a row
without trying to do I/O in between. (This is a bit like commit
3b790d256f8 that fixed a similar problem on Windows.)

It looks fine in light of the kqueue documentation.
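
As a side note for readers: EV_CLEAR makes a kqueue filter
edge-triggered, reporting readiness only once per state change. A
standalone sketch (my own illustration, not code from the patch) of
the level-triggered behaviour the patch restores:

    /* Without EV_CLEAR the filter is level-triggered: waiting twice in
     * a row without doing any I/O reports the fd as readable both
     * times.  With EV_ADD | EV_CLEAR the second wait would find
     * nothing, which is the wait-twice case the recovery tests hit. */
    #include <sys/types.h>
    #include <sys/event.h>
    #include <sys/time.h>
    #include <unistd.h>
    #include <stdio.h>

    int
    main(void)
    {
        int kq = kqueue();
        int fds[2];
        struct kevent kev, out;
        struct timespec zero = {0, 0};

        pipe(fds);
        write(fds[1], "x", 1);          /* make the read end readable */

        EV_SET(&kev, fds[0], EVFILT_READ, EV_ADD, 0, 0, NULL);
        kevent(kq, &kev, 1, NULL, 0, NULL);

        printf("%d\n", kevent(kq, NULL, 0, &out, 1, &zero));  /* 1 */
        printf("%d\n", kevent(kq, NULL, 0, &out, 1, &zero));  /* 1 again */
        return 0;
    }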

0002: "Use a long lived WaitEventSet for WaitLatch()."

In the last version, I had a new function WaitMyLatch(), but that
didn't help with recoveryWakeupLatch. Switching between latches
doesn't require a syscall, so I didn't want to have a separate WES and
function just for that. So I went back to using plain old
WaitLatch(), and made it "modify" the latch every time before waiting
on CommonWaitSet. An alternative would be to get rid of the concept
of latches other than MyLatch, and change the function to
WaitMyLatch(). A similar thing happens for exit_on_postmaster_death,
for which I didn't want to have a separate WES, so I just set that
flag every time. Thoughts?

It is surely an improvement over creating a full-fledged WES every
time. The name CommonWaitSet gives the impression that it is used for
a variety of wait event sets, but it is used only by WaitLatch(), so I
would name it LatchWaitSet. With that name I won't be surprised that
the function refers to the WL_LATCH_SET entry by index 0, without any
explanation, when calling ModifyWaitEvent().
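
A named constant would make that slot layout self-documenting; a
minimal sketch (the constant name is hypothetical, not something in
the posted patches):

    /* Hypothetical: name the fixed position of the WL_LATCH_SET event,
     * which InitializeCommonWaitSet() always adds first. */
    #define LATCH_WAIT_SET_LATCH_POS 0

    ModifyWaitEvent(CommonWaitSet, LATCH_WAIT_SET_LATCH_POS,
                    WL_LATCH_SET, latch);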

@@ -700,7 +739,11 @@ FreeWaitEventSet(WaitEventSet *set)
 	ReleaseExternalFD();
 #elif defined(WAIT_USE_KQUEUE)
 	close(set->kqueue_fd);
-	ReleaseExternalFD();
+	if (set->kqueue_fd >= 0)
+	{
+		close(set->kqueue_fd);
+		ReleaseExternalFD();
+	}

Did you forget to remove the close() outside the if block?
And don't we need the same amendment for epoll_fd as for kqueue_fd?
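
Presumably the amendment would look something like this (a sketch of
what I mean, not tested):

    #if defined(WAIT_USE_EPOLL)
        if (set->epoll_fd >= 0)
        {
            close(set->epoll_fd);
            ReleaseExternalFD();
        }
    #elif defined(WAIT_USE_KQUEUE)
        if (set->kqueue_fd >= 0)
        {
            close(set->kqueue_fd);
            ReleaseExternalFD();
        }
    #endif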

WaitLatch() is documented as "If the latch is already set (and
WL_LATCH_SET is given), the function returns immediately." But now the
function reacts to the latch even when WL_LATCH_SET is not given. In
practice it is always given, so I think we need to update the Assert
and the function comment to match the change.
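
Concretely, I mean something like this in WaitLatch() (a sketch):

    /* Callers must always ask for WL_LATCH_SET now that the common
     * wait set unconditionally contains the latch. */
    Assert(wakeEvents & WL_LATCH_SET);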

0003: "Use regular WaitLatch() for condition variables."

That mechanism doesn't need its own WES anymore.

Looks fine.

0004: "Introduce RemoveWaitEvent()."

We'll need that to be able to handle connections that are closed and
reopened under the covers by libpq (replication, postgres_fdw). We
also wanted this on a couple of other threads for multiplexing FDWs,
to be able to adjust the wait set dynamically for a proposed async
Append feature.

The implementation is a little naive, and leaves holes in the
"pollfds" and "handles" arrays (for poll() and win32 implementations).
That could be improved with a bit more footwork in a later patch.
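
(For the poll() side, the hole works because POSIX says an entry with
a negative fd is ignored and gets revents = 0. A tiny sketch, where
sock_fd stands for any live descriptor:)

    #include <poll.h>

    int sock_fd = 0;                         /* placeholder descriptor */
    struct pollfd pfds[2] = {
        { .fd = sock_fd, .events = POLLIN }, /* live entry */
        { .fd = -1 }                 /* hole left by RemoveWaitEvent() */
    };

    poll(pfds, 2, 0);                /* the fd = -1 slot is skipped */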

XXX The Windows version is based on reading documentation. I'd be
very interested to know if check-world passes (especially
contrib/postgres_fdw and src/test/recovery). Unfortunately my
appveyor.yml fu is not yet strong enough.

I didn't find any documentation about INVALID_HANDLE_VALUE in
lpHandles. Could you give me the URL for that?

I didn't run recoverycheck because I couldn't install IPC::Run for
ActivePerl, but contribcheck succeeded.

+	for (int i = 0; i < nevents; ++i)
+		set->handles[i + 1] = INVALID_HANDLE_VALUE;

It accesses set->handles[nevents], which overruns the array.
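
If that reading is right, one possible fix (hypothetical) is to size
the array for the extra signal-event slot, since handles[0] is
reserved for pgwin32_signal_event and the loop writes
handles[1..nevents]:

    /* Sketch: reserve nevents + 1 slots so handles[nevents] is valid. */
    sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1));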

+	/* Set up the free list. */
+	for (int i = 0; i < nevents; ++i)
+		set->events[i].next_free = i + 1;
+	set->events[nevents - 1].next_free = -1;

This sets the last element twice (harmless, but useless).
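
The redundant store is easy to avoid (a sketch):

    /* Link all but the last element, then terminate the list once. */
    for (int i = 0; i < nevents - 1; ++i)
        set->events[i].next_free = i + 1;
    set->events[nevents - 1].next_free = -1;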

set->handles = (HANDLE) data;
data += MAXALIGN(sizeof(HANDLE) * nevents);

It is not an issue with this patch, but doesn't handles need
nevents + 1 elements?

Also, WaitEventSetSize() does not check its parameter's range.

I'll continue reviewing in a later mail.

0005: "libpq: Add PQsocketChangeCount to advertise socket changes."

....

regards.

--
Kyotaro Horiguchi
NTT Open Source Software Center

#6Kyotaro Horiguchi
horikyota.ntt@gmail.com
In reply to: Thomas Munro (#4)
11 attachment(s)
Re: Reducing WaitEventSet syscall churn

Hello.

At Tue, 10 Mar 2020 08:19:24 +0900 (JST), Kyotaro Horiguchi <horikyota.ntt@gmail.com> wrote in
me> I'll continue reviewing in a later mail.
me>
me> > 0005: "libpq: Add PQsocketChangeCount to advertise socket changes."
me> ....

At Thu, 27 Feb 2020 12:17:45 +1300, Thomas Munro <thomas.munro@gmail.com> wrote in

On Sat, Feb 8, 2020 at 10:15 AM Thomas Munro <thomas.munro@gmail.com> wrote:
0005: "libpq: Add PQsocketChangeCount to advertise socket changes."

To support a long-lived WES, libpq needs a way to tell us when the socket
changes underneath our feet. This is the simplest thing I could think
of; better ideas welcome.

I think Windows, at least, is the reason we can't just detect a change
in the value of the fd. Instead of counting disconnections, we could
use a libpq event.

PQregisterEventProc returns false for all of bad parameter, already
registered, out of memory, and proc rejection, so the caller cannot
tell why it failed. I don't think that is a usable interface, so the
attached 0005 patch fixes it. (I found it is not strictly needed after
making 0007, but I include it as a proposal separate from this patch
set. It does not include the corresponding doc fix.)
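
For context, this is roughly how the released interface gets used,
and why a bare false is unhelpful (a sketch; my_event_proc is made
up):

    #include <stdio.h>
    #include "libpq-fe.h"
    #include "libpq-events.h"

    static int
    my_event_proc(PGEventId evtId, void *evtInfo, void *passThrough)
    {
        if (evtId == PGEVT_CONNRESET)
            fprintf(stderr, "connection was reset\n");
        return 1;                       /* nonzero means success */
    }

    /* assume: PGconn *conn = PQconnectdb(conninfo), status checked */
    if (!PQregisterEventProc(conn, my_event_proc, "my_event_proc", NULL))
        fprintf(stderr, "failed: bad argument? already registered? "
                "out of memory? the caller cannot tell\n");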

0006: "Reuse a WaitEventSet in libpqwalreceiver.c."

Rather than having all users of libpqwalreceiver.c deal with the
complicated details of wait set management, have libpqwalreceiver
expose a waiting interface that understands socket changes.

Looks reasonable. The attached 0006 and 0007 are a possible
replacement if we use libpq events.
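
A caller would then look something like this (a sketch against the
walrcv_wait() that 0007 adds; wrconn and the timeout are
placeholders):

    /* All WaitEventSet bookkeeping, including a socket change after an
     * under-the-covers reconnection, is hidden behind the wait call. */
    int rc = walrcv_wait(wrconn, 1000 /* ms */,
                         WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE);

    if (rc & WL_LATCH_SET)
        ResetLatch(MyLatch);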

Unfortunately, I couldn't figure out how to use CommonWaitSet for this
(ie adding and removing sockets to that as required), due to
complications with the bookkeeping required to provide the fd_closed
flag to RemoveWaitEvent(). So it creates its own internal long lived
WaitEventSet.

Agreed, since they are used in different ways. But with the attached
patches, a closed connection is marked with wes_socket_position = -1.

0007: "Use a WaitEventSet for postgres_fdw."

Continues..

The attached patches are:
0001-0004 Not changed
0005 Fix interface of PQregisterEventProc
0006 Add new libpq event for this use.
0007 Another version of "0006 Reuse a WaitEventSet in
libpqwalreceiver.c" based on libpq events.
0008-0011 Not changed (old 0007-0010, blindly appended)

The patch set passed the regression tests (including the TAP recovery
tests) up to here.

regards.

--
Kyotaro Horiguchi
NTT Open Source Software Center

Attachments:

0001-Don-t-use-EV_CLEAR-for-kqueue-events.patch (text/x-patch)
From 1a80abd292ff0d6a47274eb188a5576b2ef3cf6e Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 25 Feb 2020 02:53:35 +1300
Subject: [PATCH 01/11] Don't use EV_CLEAR for kqueue events.

For the semantics to match the epoll implementation, we need a
socket to continue to appear readable/writable if you wait
multiple times without doing I/O in between (in Linux
terminology, level-triggered rather than edge-triggered).
Similar to commit 3b790d256f8 for Windows.

Author: Thomas Munro
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/storage/ipc/latch.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 046ca5c6c7..3b6acfb251 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -991,7 +991,7 @@ WaitEventAdjustKqueueAdd(struct kevent *k_ev, int filter, int action,
 {
 	k_ev->ident = event->fd;
 	k_ev->filter = filter;
-	k_ev->flags = action | EV_CLEAR;
+	k_ev->flags = action;
 	k_ev->fflags = 0;
 	k_ev->data = 0;
 	AccessWaitEvent(k_ev) = event;
@@ -1003,7 +1003,7 @@ WaitEventAdjustKqueueAddPostmaster(struct kevent *k_ev, WaitEvent *event)
 	/* For now postmaster death can only be added, not removed. */
 	k_ev->ident = PostmasterPid;
 	k_ev->filter = EVFILT_PROC;
-	k_ev->flags = EV_ADD | EV_CLEAR;
+	k_ev->flags = EV_ADD;
 	k_ev->fflags = NOTE_EXIT;
 	k_ev->data = 0;
 	AccessWaitEvent(k_ev) = event;
-- 
2.18.2

0002-Use-a-long-lived-WaitEventSet-for-WaitLatch.patch (text/x-patch)
From f93551a42b9e9ad0da05197d48fac5790474249d Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 24 Feb 2020 15:39:24 +1300
Subject: [PATCH 02/11] Use a long lived WaitEventSet for WaitLatch().

Create CommonWaitSet at backend startup time, and use it to
implement WaitLatch().  This avoids a bunch of epoll/kqueue
system calls, and makes sure we don't run into EMFILE later
due to lack of file descriptors.

Reorder SubPostmasterMain() slightly so that we restore the
postmaster pipe and Windows signal before we reach
InitPostmasterChild(), to make this work in EXEC_BACKEND
builds.

Author: Thomas Munro
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/postmaster/postmaster.c | 24 +++++++-------
 src/backend/storage/ipc/latch.c     | 49 +++++++++++++++++++++++++++--
 src/backend/utils/init/miscinit.c   |  2 ++
 src/include/storage/latch.h         |  1 +
 4 files changed, 61 insertions(+), 15 deletions(-)

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 46be78aadb..c472971ce0 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -4878,9 +4878,6 @@ SubPostmasterMain(int argc, char *argv[])
 	IsPostmasterEnvironment = true;
 	whereToSendOutput = DestNone;
 
-	/* Setup as postmaster child */
-	InitPostmasterChild();
-
 	/* Setup essential subsystems (to ensure elog() behaves sanely) */
 	InitializeGUCOptions();
 
@@ -4895,6 +4892,18 @@ SubPostmasterMain(int argc, char *argv[])
 	/* Close the postmaster's sockets (as soon as we know them) */
 	ClosePostmasterPorts(strcmp(argv[1], "--forklog") == 0);
 
+	/*
+	 * Start our win32 signal implementation. This has to be done after we
+	 * read the backend variables, because we need to pick up the signal pipe
+	 * from the parent process.
+	 */
+#ifdef WIN32
+	pgwin32_signal_initialize();
+#endif
+
+	/* Setup as postmaster child */
+	InitPostmasterChild();
+
 	/*
 	 * Set reference point for stack-depth checking
 	 */
@@ -4943,15 +4952,6 @@ SubPostmasterMain(int argc, char *argv[])
 	if (strcmp(argv[1], "--forkavworker") == 0)
 		AutovacuumWorkerIAm();
 
-	/*
-	 * Start our win32 signal implementation. This has to be done after we
-	 * read the backend variables, because we need to pick up the signal pipe
-	 * from the parent process.
-	 */
-#ifdef WIN32
-	pgwin32_signal_initialize();
-#endif
-
 	/* In EXEC_BACKEND case we will not have inherited these settings */
 	pqinitmask();
 	PG_SETMASK(&BlockSig);
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 3b6acfb251..30e461e965 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -56,6 +56,7 @@
 #include "storage/latch.h"
 #include "storage/pmsignal.h"
 #include "storage/shmem.h"
+#include "utils/memutils.h"
 
 /*
  * Select the fd readiness primitive to use. Normally the "most modern"
@@ -129,6 +130,9 @@ struct WaitEventSet
 #endif
 };
 
+/* A common WaitEventSet used to implement WaitLatch() */
+static WaitEventSet *CommonWaitSet;
+
 #ifndef WIN32
 /* Are we currently in WaitLatch? The signal handler would like to know. */
 static volatile sig_atomic_t waiting = false;
@@ -242,6 +246,20 @@ InitializeLatchSupport(void)
 #endif
 }
 
+void
+InitializeCommonWaitSet(void)
+{
+	Assert(CommonWaitSet == NULL);
+
+	/* Set up the WaitEventSet used by WaitLatch(). */
+	CommonWaitSet = CreateWaitEventSet(TopMemoryContext, 2);
+	AddWaitEventToSet(CommonWaitSet, WL_LATCH_SET, PGINVALID_SOCKET,
+					  MyLatch, NULL);
+	if (IsUnderPostmaster)
+		AddWaitEventToSet(CommonWaitSet, WL_EXIT_ON_PM_DEATH,
+						  PGINVALID_SOCKET, NULL, NULL);
+}
+
 /*
  * Initialize a process-local latch.
  */
@@ -365,8 +383,29 @@ int
 WaitLatch(Latch *latch, int wakeEvents, long timeout,
 		  uint32 wait_event_info)
 {
-	return WaitLatchOrSocket(latch, wakeEvents, PGINVALID_SOCKET, timeout,
-							 wait_event_info);
+	WaitEvent	event;
+
+	/* Postmaster-managed callers must handle postmaster death somehow. */
+	Assert(!IsUnderPostmaster ||
+		   (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
+		   (wakeEvents & WL_POSTMASTER_DEATH));
+
+	/*
+	 * Some callers may have a latch other than MyLatch, or want to handle
+	 * postmaster death differently.  It's cheap to assign those, so just do it
+	 * every time.
+	 */
+	ModifyWaitEvent(CommonWaitSet, 0, WL_LATCH_SET, latch);
+	CommonWaitSet->exit_on_postmaster_death =
+		((wakeEvents & WL_EXIT_ON_PM_DEATH) != 0);
+
+	if (WaitEventSetWait(CommonWaitSet,
+						 (wakeEvents & WL_TIMEOUT) ? timeout : -1,
+						 &event, 1,
+						 wait_event_info) == 0)
+		return WL_TIMEOUT;
+	else
+		return event.events;
 }
 
 /*
@@ -700,7 +739,11 @@ FreeWaitEventSet(WaitEventSet *set)
 	ReleaseExternalFD();
 #elif defined(WAIT_USE_KQUEUE)
 	close(set->kqueue_fd);
-	ReleaseExternalFD();
+	if (set->kqueue_fd >= 0)
+	{
+		close(set->kqueue_fd);
+		ReleaseExternalFD();
+	}
 #elif defined(WAIT_USE_WIN32)
 	WaitEvent  *cur_event;
 
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index cd099b0c70..1cace9735d 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -109,6 +109,7 @@ InitPostmasterChild(void)
 	InitializeLatchSupport();
 	MyLatch = &LocalLatchData;
 	InitLatch(MyLatch);
+	InitializeCommonWaitSet();
 
 	/*
 	 * If possible, make this process a group leader, so that the postmaster
@@ -141,6 +142,7 @@ InitStandaloneProcess(const char *argv0)
 	InitializeLatchSupport();
 	MyLatch = &LocalLatchData;
 	InitLatch(MyLatch);
+	InitializeCommonWaitSet();
 
 	/* Compute paths, no postmaster to inherit from */
 	if (my_exec_path[0] == '\0')
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index 46ae56cae3..ec1865a8fd 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -176,6 +176,7 @@ extern int	WaitLatch(Latch *latch, int wakeEvents, long timeout,
 					  uint32 wait_event_info);
 extern int	WaitLatchOrSocket(Latch *latch, int wakeEvents,
 							  pgsocket sock, long timeout, uint32 wait_event_info);
+extern void InitializeCommonWaitSet(void);
 
 /*
  * Unix implementation uses SIGUSR1 for inter-process signaling.
-- 
2.18.2

0003-Use-WaitLatch-for-condition-variables.patch (text/x-patch)
From 297764ce9da84bdb7c66faac61ec51ed49fbab0b Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 24 Feb 2020 15:49:00 +1300
Subject: [PATCH 03/11] Use WaitLatch() for condition variables.

Previously, condition_variable.c created its own long lived
WaitEventSet to avoid extra system calls.  WaitLatch() now
does that, so there is no point in wasting an extra kernel
descriptor.

Author: Thomas Munro
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/storage/lmgr/condition_variable.c | 28 ++++---------------
 1 file changed, 5 insertions(+), 23 deletions(-)

diff --git a/src/backend/storage/lmgr/condition_variable.c b/src/backend/storage/lmgr/condition_variable.c
index 37b6a4eecd..2ec00397b4 100644
--- a/src/backend/storage/lmgr/condition_variable.c
+++ b/src/backend/storage/lmgr/condition_variable.c
@@ -30,9 +30,6 @@
 /* Initially, we are not prepared to sleep on any condition variable. */
 static ConditionVariable *cv_sleep_target = NULL;
 
-/* Reusable WaitEventSet. */
-static WaitEventSet *cv_wait_event_set = NULL;
-
 /*
  * Initialize a condition variable.
  */
@@ -62,23 +59,6 @@ ConditionVariablePrepareToSleep(ConditionVariable *cv)
 {
 	int			pgprocno = MyProc->pgprocno;
 
-	/*
-	 * If first time through in this process, create a WaitEventSet, which
-	 * we'll reuse for all condition variable sleeps.
-	 */
-	if (cv_wait_event_set == NULL)
-	{
-		WaitEventSet *new_event_set;
-
-		new_event_set = CreateWaitEventSet(TopMemoryContext, 2);
-		AddWaitEventToSet(new_event_set, WL_LATCH_SET, PGINVALID_SOCKET,
-						  MyLatch, NULL);
-		AddWaitEventToSet(new_event_set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
-						  NULL, NULL);
-		/* Don't set cv_wait_event_set until we have a correct WES. */
-		cv_wait_event_set = new_event_set;
-	}
-
 	/*
 	 * If some other sleep is already prepared, cancel it; this is necessary
 	 * because we have just one static variable tracking the prepared sleep,
@@ -135,6 +115,7 @@ ConditionVariableTimedSleep(ConditionVariable *cv, long timeout,
 	long		cur_timeout = -1;
 	instr_time	start_time;
 	instr_time	cur_time;
+	int			wait_events;
 
 	/*
 	 * If the caller didn't prepare to sleep explicitly, then do so now and
@@ -166,19 +147,20 @@ ConditionVariableTimedSleep(ConditionVariable *cv, long timeout,
 		INSTR_TIME_SET_CURRENT(start_time);
 		Assert(timeout >= 0 && timeout <= INT_MAX);
 		cur_timeout = timeout;
+		wait_events = WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH;
 	}
+	else
+		wait_events = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH;
 
 	while (true)
 	{
-		WaitEvent	event;
 		bool		done = false;
 
 		/*
 		 * Wait for latch to be set.  (If we're awakened for some other
 		 * reason, the code below will cope anyway.)
 		 */
-		(void) WaitEventSetWait(cv_wait_event_set, cur_timeout, &event, 1,
-								wait_event_info);
+		(void) WaitLatch(MyLatch, wait_events, cur_timeout, wait_event_info);
 
 		/* Reset latch before examining the state of the wait list. */
 		ResetLatch(MyLatch);
-- 
2.18.2

0004-Introduce-RemoveWaitEvent.patch (text/x-patch)
From d5ba7d38d7618c173eb10f3369e21937fb23aa74 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 24 Feb 2020 19:05:00 +1300
Subject: [PATCH 04/11] Introduce RemoveWaitEvent().

This will allow WaitEventSet objects to be used in more
long lived scenarios, where sockets are added and removed.

Author: Thomas Munro
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/storage/ipc/latch.c | 131 ++++++++++++++++++++++++++++----
 src/include/storage/latch.h     |   3 +
 2 files changed, 120 insertions(+), 14 deletions(-)

diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 30e461e965..025545fc89 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -84,6 +84,7 @@ struct WaitEventSet
 {
 	int			nevents;		/* number of registered events */
 	int			nevents_space;	/* maximum number of events in this set */
+	int			free_list;		/* position of first free event */
 
 	/*
 	 * Array, of nevents_space length, storing the definition of events this
@@ -119,6 +120,8 @@ struct WaitEventSet
 #elif defined(WAIT_USE_POLL)
 	/* poll expects events to be waited on every poll() call, prepare once */
 	struct pollfd *pollfds;
+	/* track the populated range of pollfds */
+	int			npollfds;
 #elif defined(WAIT_USE_WIN32)
 
 	/*
@@ -127,6 +130,8 @@ struct WaitEventSet
 	 * event->pos + 1).
 	 */
 	HANDLE	   *handles;
+	/* track the populated range of handles */
+	int			nhandles;
 #endif
 };
 
@@ -642,13 +647,16 @@ CreateWaitEventSet(MemoryContext context, int nevents)
 #elif defined(WAIT_USE_POLL)
 	set->pollfds = (struct pollfd *) data;
 	data += MAXALIGN(sizeof(struct pollfd) * nevents);
+	set->npollfds = 0;
 #elif defined(WAIT_USE_WIN32)
 	set->handles = (HANDLE) data;
 	data += MAXALIGN(sizeof(HANDLE) * nevents);
+	set->nhandles = 0;
 #endif
 
 	set->latch = NULL;
 	set->nevents_space = nevents;
+	set->nevents = 0;
 	set->exit_on_postmaster_death = false;
 
 #if defined(WAIT_USE_EPOLL)
@@ -714,11 +722,25 @@ CreateWaitEventSet(MemoryContext context, int nevents)
 	 * Note: pgwin32_signal_event should be first to ensure that it will be
 	 * reported when multiple events are set.  We want to guarantee that
 	 * pending signals are serviced.
+	 *
+	 * We set unused handles to INVALID_HANDLE_VALUE, because
+	 * WaitForMultipleObjects() considers that to mean "this process", which is
+	 * not signaled until the process exits, so it's a way of leaving a hole in the
+	 * middle of the wait set if you remove something (just like -1 in the poll
+	 * implementation).  An alternative would be to fill in holes and create a
+	 * non 1-to-1 mapping between 'events' and 'handles'.
 	 */
 	set->handles[0] = pgwin32_signal_event;
-	StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
+	for (int i = 0; i < nevents; ++i)
+		set->handles[i + 1] = INVALID_HANDLE_VALUE;
 #endif
 
+	/* Set up the free list. */
+	for (int i = 0; i < nevents; ++i)
+		set->events[i].next_free = i + 1;
+	set->events[nevents - 1].next_free = -1;
+	set->free_list = 0;
+
 	return set;
 }
 
@@ -727,7 +749,6 @@ CreateWaitEventSet(MemoryContext context, int nevents)
  *
  * Note: preferably, this shouldn't have to free any resources that could be
  * inherited across an exec().  If it did, we'd likely leak those resources in
- * many scenarios.  For the epoll case, we ensure that by setting FD_CLOEXEC
  * when the FD is created.  For the Windows case, we assume that the handles
  * involved are non-inheritable.
  */
@@ -748,9 +769,12 @@ FreeWaitEventSet(WaitEventSet *set)
 	WaitEvent  *cur_event;
 
 	for (cur_event = set->events;
-		 cur_event < (set->events + set->nevents);
+		 cur_event < (set->events + set->nhandles);
 		 cur_event++)
 	{
+		if (set->handles[cur_event->pos + 1] == INVALID_HANDLE_VALUE)
+			continue;
+
 		if (cur_event->events & WL_LATCH_SET)
 		{
 			/* uses the latch's HANDLE */
@@ -805,9 +829,6 @@ AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
 {
 	WaitEvent  *event;
 
-	/* not enough space */
-	Assert(set->nevents < set->nevents_space);
-
 	if (events == WL_EXIT_ON_PM_DEATH)
 	{
 		events = WL_POSTMASTER_DEATH;
@@ -833,8 +854,12 @@ AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
 	if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK))
 		elog(ERROR, "cannot wait on socket event without a socket");
 
-	event = &set->events[set->nevents];
-	event->pos = set->nevents++;
+	/* Do we have any free slots? */
+	if (set->free_list == -1)
+		elog(ERROR, "WaitEventSet is full");
+
+	event = &set->events[set->free_list];
+	event->pos = set->free_list;
 	event->fd = fd;
 	event->events = events;
 	event->user_data = user_data;
@@ -868,6 +893,11 @@ AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
 	WaitEventAdjustWin32(set, event);
 #endif
 
+	/* Remove it from the free list. */
+	set->free_list = event->next_free;
+	event->next_free = -1;
+	set->nevents++;
+
 	return event->pos;
 }
 
@@ -885,7 +915,7 @@ ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
 	int			old_events;
 #endif
 
-	Assert(pos < set->nevents);
+	Assert(pos < set->nevents_space);
 
 	event = &set->events[pos];
 #if defined(WAIT_USE_KQUEUE)
@@ -933,6 +963,63 @@ ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
 #endif
 }
 
+/*
+ * If the descriptor has already been closed, the kernel should already have
+ * removed it from the wait set (except in WAIT_USE_POLL).  Pass in true for
+ * fd_closed in that case, so we don't try to remove it ourselves.
+ */
+void
+RemoveWaitEvent(WaitEventSet *set, int pos, bool fd_closed)
+{
+	WaitEvent  *event;
+
+	Assert(pos >= 0);
+	Assert(pos < set->nevents_space);
+	event = &set->events[pos];
+
+	/* For now only sockets can be removed */
+	if ((event->events & WL_SOCKET_MASK) == 0)
+		elog(ERROR, "event type cannot be removed");
+
+#if defined(WAIT_USE_EPOLL)
+	if (!fd_closed)
+		WaitEventAdjustEpoll(set, event, EPOLL_CTL_DEL);
+#elif defined(WAIT_USE_KQUEUE)
+	if (!fd_closed)
+	{
+		int old_events = event->events;
+
+		event->events = 0;
+		WaitEventAdjustKqueue(set, event, old_events);
+	}
+#elif defined(WAIT_USE_POLL)
+	/* no kernel state to remove, just blank out the fd */
+	set->pollfds[event->pos].fd = -1;
+	/* see if we can shrink the range of active fds */
+	while (set->npollfds > 0 &&
+		   set->pollfds[set->npollfds - 1].fd == -1)
+		set->npollfds -= 1;
+#elif defined(WAIT_USE_WIN32)
+	if (!fd_closed)
+		WSAEventSelect(event->fd, NULL, 0);
+	if (set->handles[event->pos + 1] != INVALID_HANDLE_VALUE)
+	{
+		WSACloseEvent(set->handles[event->pos + 1]);
+		set->handles[event->pos + 1] = INVALID_HANDLE_VALUE;
+	}
+	/* see if we can shrink the range of active handles */
+	while (set->nhandles > 0 &&
+		   set->handles[set->nhandles] == INVALID_HANDLE_VALUE)
+		set->nhandles -= 1;
+#endif
+
+	/* This position is now free. */
+	memset(event, 0, sizeof(*event));
+	event->next_free = set->free_list;
+	set->free_list = pos;
+	set->nevents--;
+}
+
 #if defined(WAIT_USE_EPOLL)
 /*
  * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
@@ -994,6 +1081,9 @@ WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
 	pollfd->revents = 0;
 	pollfd->fd = event->fd;
 
+	/* track the known range of populated slots */
+	set->npollfds = Max(event->pos + 1, set->nevents);
+
 	/* prepare pollfd entry once */
 	if (event->events == WL_LATCH_SET)
 	{
@@ -1072,7 +1162,9 @@ WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
 	Assert(event->events != WL_LATCH_SET || set->latch != NULL);
 	Assert(event->events == WL_LATCH_SET ||
 		   event->events == WL_POSTMASTER_DEATH ||
-		   (event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)));
+		   (event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) ||
+		   (event->events == 0 &&
+			(old_events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))));
 
 	if (event->events == WL_POSTMASTER_DEATH)
 	{
@@ -1149,6 +1241,9 @@ WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
 {
 	HANDLE	   *handle = &set->handles[event->pos + 1];
 
+	/* track the known range of populated slots */
+	set->nhandles = Max(event->pos + 1, set->nhandles);
+
 	if (event->events == WL_LATCH_SET)
 	{
 		Assert(set->latch != NULL);
@@ -1169,12 +1264,15 @@ WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
 		if (event->events & WL_SOCKET_CONNECTED)
 			flags |= FD_CONNECT;
 
-		if (*handle == WSA_INVALID_EVENT)
+		if (*handle == INVALID_HANDLE_VALUE)
 		{
 			*handle = WSACreateEvent();
 			if (*handle == WSA_INVALID_EVENT)
+			{
+				*handle = INVALID_HANDLE_VALUE;
 				elog(ERROR, "failed to create event for socket: error code %u",
 					 WSAGetLastError());
+			}
 		}
 		if (WSAEventSelect(event->fd, *handle, flags) != 0)
 			elog(ERROR, "failed to set up event for socket: error code %u",
@@ -1304,6 +1402,11 @@ WaitEventSetWait(WaitEventSet *set, long timeout,
 	return returned_events;
 }
 
+int
+WaitEventSetSize(WaitEventSet *set)
+{
+	return set->nevents;
+}
 
 #if defined(WAIT_USE_EPOLL)
 
@@ -1589,7 +1692,7 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
 	struct pollfd *cur_pollfd;
 
 	/* Sleep */
-	rc = poll(set->pollfds, set->nevents, (int) cur_timeout);
+	rc = poll(set->pollfds, set->npollfds, (int) cur_timeout);
 
 	/* Check return code */
 	if (rc < 0)
@@ -1761,9 +1864,9 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
 	/*
 	 * Sleep.
 	 *
-	 * Need to wait for ->nevents + 1, because signal handle is in [0].
+	 * Need to wait for ->nhandles + 1, because signal handle is in [0].
 	 */
-	rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
+	rc = WaitForMultipleObjects(set->nhandles + 1, set->handles, FALSE,
 								cur_timeout);
 
 	/* Check return code */
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index ec1865a8fd..210f37659e 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -144,6 +144,7 @@ typedef struct WaitEvent
 	uint32		events;			/* triggered events */
 	pgsocket	fd;				/* socket fd associated with event */
 	void	   *user_data;		/* pointer provided in AddWaitEventToSet */
+	int			next_free;		/* free list for internal use */
 #ifdef WIN32
 	bool		reset;			/* Is reset of the event required? */
 #endif
@@ -168,6 +169,8 @@ extern void FreeWaitEventSet(WaitEventSet *set);
 extern int	AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd,
 							  Latch *latch, void *user_data);
 extern void ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch);
+extern void RemoveWaitEvent(WaitEventSet *set, int pos, bool fd_closed);
+extern int WaitEventSetSize(WaitEventSet *set);
 
 extern int	WaitEventSetWait(WaitEventSet *set, long timeout,
 							 WaitEvent *occurred_events, int nevents,
-- 
2.18.2

0005-Fix-interface-of-PQregisterEventProc.patch (text/x-patch)
From 3fdb3f11d99f4efb8e88b3c8888b4092694f45c7 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horikyoga.ntt@gmail.com>
Date: Fri, 13 Mar 2020 12:00:23 +0900
Subject: [PATCH 05/11] Fix interface of PQregisterEventProc

The function returns false for all kind of failures so the caller
cannot identify the cause of failure. Change the return value from
int(bool) to an enum.
---
 src/interfaces/libpq/libpq-events.c | 34 ++++++++++++++++++++++-------
 src/interfaces/libpq/libpq-events.h | 15 +++++++++++--
 2 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/src/interfaces/libpq/libpq-events.c b/src/interfaces/libpq/libpq-events.c
index d050d7f3f2..6b594d910a 100644
--- a/src/interfaces/libpq/libpq-events.c
+++ b/src/interfaces/libpq/libpq-events.c
@@ -36,20 +36,31 @@
  * The function returns a non-zero if successful.  If the function fails,
  * zero is returned.
  */
-int
+PGRegEventResult
 PQregisterEventProc(PGconn *conn, PGEventProc proc,
 					const char *name, void *passThrough)
 {
 	int			i;
 	PGEventRegister regevt;
 
-	if (!proc || !conn || !name || !*name)
-		return false;			/* bad arguments */
+	if (!conn)
+		return PGEVTREG_BADARG;
+
+	if (!proc || !name || !*name)
+	{
+		printfPQExpBuffer(&conn->errorMessage,
+						  "bad argument in PQregisterEventProc");
+		return PGEVTREG_BADARG;
+	}
 
 	for (i = 0; i < conn->nEvents; i++)
 	{
 		if (conn->events[i].proc == proc)
-			return false;		/* already registered */
+		{
+			printfPQExpBuffer(&conn->errorMessage,
+							  "proc is already registered in PQregisterEventProc");
+			return PGEVTREG_ALREADY_REGISTERED;
+		}
 	}
 
 	if (conn->nEvents >= conn->eventArraySize)
@@ -64,7 +75,10 @@ PQregisterEventProc(PGconn *conn, PGEventProc proc,
 			e = (PGEvent *) malloc(newSize * sizeof(PGEvent));
 
 		if (!e)
-			return false;
+		{
+			printfPQExpBuffer(&conn->errorMessage, "out of memory");
+			return PGEVTREG_OUT_OF_MEMORY;
+		}
 
 		conn->eventArraySize = newSize;
 		conn->events = e;
@@ -73,7 +87,10 @@ PQregisterEventProc(PGconn *conn, PGEventProc proc,
 	conn->events[conn->nEvents].proc = proc;
 	conn->events[conn->nEvents].name = strdup(name);
 	if (!conn->events[conn->nEvents].name)
-		return false;
+	{
+		printfPQExpBuffer(&conn->errorMessage, "out of memory");
+		return PGEVTREG_OUT_OF_MEMORY;
+	}
 	conn->events[conn->nEvents].passThrough = passThrough;
 	conn->events[conn->nEvents].data = NULL;
 	conn->events[conn->nEvents].resultInitialized = false;
@@ -84,10 +101,11 @@ PQregisterEventProc(PGconn *conn, PGEventProc proc,
 	{
 		conn->nEvents--;
 		free(conn->events[conn->nEvents].name);
-		return false;
+		printfPQExpBuffer(&conn->errorMessage, "proc rejected");
+		return PGEVTREG_PROC_REJECTED;
 	}
 
-	return true;
+	return PGEVTREG_SUCCESS;
 }
 
 /*
diff --git a/src/interfaces/libpq/libpq-events.h b/src/interfaces/libpq/libpq-events.h
index 5108a55cf6..7b07537c11 100644
--- a/src/interfaces/libpq/libpq-events.h
+++ b/src/interfaces/libpq/libpq-events.h
@@ -23,6 +23,16 @@ extern "C"
 {
 #endif
 
+/* PQregisterEventProc return value */
+typedef enum
+{
+	PGEVTREG_SUCCESS,
+	PGEVTREG_BADARG,
+	PGEVTREG_ALREADY_REGISTERED,
+	PGEVTREG_OUT_OF_MEMORY,
+	PGEVTREG_PROC_REJECTED
+} PGRegEventResult;
+	
 /* Callback Event Ids */
 typedef enum
 {
@@ -69,8 +79,9 @@ typedef struct
 typedef int (*PGEventProc) (PGEventId evtId, void *evtInfo, void *passThrough);
 
 /* Registers an event proc with the given PGconn. */
-extern int	PQregisterEventProc(PGconn *conn, PGEventProc proc,
-								const char *name, void *passThrough);
+extern PGRegEventResult	PQregisterEventProc(PGconn *conn, PGEventProc proc,
+											const char *name,
+											void *passThrough);
 
 /* Sets the PGconn instance data for the provided proc to data. */
 extern int	PQsetInstanceData(PGconn *conn, PGEventProc proc, void *data);
-- 
2.18.2

0006-Add-new-libpq-event-PGEVT_CONNDISCONNECTION.patch (text/x-patch)
From 15013dfca167ec336ba05cdf4550204ee69cc8f5 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horikyoga.ntt@gmail.com>
Date: Fri, 13 Mar 2020 12:01:56 +0900
Subject: [PATCH 06/11] Add new libpq-event PGEVT_CONNDISCONNECTION

Add an event to detect closure of the underlying socket.
---
 src/interfaces/libpq/fe-connect.c   | 14 ++++++++++++++
 src/interfaces/libpq/libpq-events.h |  6 ++++++
 2 files changed, 20 insertions(+)

diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c
index 0157c619aa..8de5304600 100644
--- a/src/interfaces/libpq/fe-connect.c
+++ b/src/interfaces/libpq/fe-connect.c
@@ -457,6 +457,8 @@ pgthreadlock_t pg_g_threadlock = default_threadlock;
 void
 pqDropConnection(PGconn *conn, bool flushInput)
 {
+	int i;
+
 	/* Drop any SSL state */
 	pqsecure_close(conn);
 
@@ -465,6 +467,18 @@ pqDropConnection(PGconn *conn, bool flushInput)
 		closesocket(conn->sock);
 	conn->sock = PGINVALID_SOCKET;
 
+	/* let any event procs take notice of the disconnection */
+	for (i = 0; i < conn->nEvents; i++)
+	{
+		PGEventConnDisconnection evt;
+
+		evt.conn = conn;
+
+		/* ignoring failure */
+		(void) conn->events[i].proc(PGEVT_CONNDISCONNECTION, &evt,
+									conn->events[i].passThrough);
+	}
+	
 	/* Optionally discard any unread data */
 	if (flushInput)
 		conn->inStart = conn->inCursor = conn->inEnd = 0;
diff --git a/src/interfaces/libpq/libpq-events.h b/src/interfaces/libpq/libpq-events.h
index 7b07537c11..ffcd604b82 100644
--- a/src/interfaces/libpq/libpq-events.h
+++ b/src/interfaces/libpq/libpq-events.h
@@ -38,6 +38,7 @@ typedef enum
 {
 	PGEVT_REGISTER,
 	PGEVT_CONNRESET,
+	PGEVT_CONNDISCONNECTION,
 	PGEVT_CONNDESTROY,
 	PGEVT_RESULTCREATE,
 	PGEVT_RESULTCOPY,
@@ -59,6 +60,11 @@ typedef struct
 	PGconn	   *conn;
 } PGEventConnDestroy;
 
+typedef struct
+{
+	PGconn	   *conn;
+} PGEventConnDisconnection;
+
 typedef struct
 {
 	PGconn	   *conn;
-- 
2.18.2

0007-Reuse-a-WaitEventSet-in-libpqwalreceiver.c.patch (text/x-patch)
From 00eb469bb537dfb134d36d79b93c337e92308557 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horikyoga.ntt@gmail.com>
Date: Fri, 13 Mar 2020 14:18:05 +0900
Subject: [PATCH 07/11] Reuse a WaitEventSet in libpqwalreceiver.c.

To avoid repeatedly setting up and tearing down WaitEventSet objects
and associated kernel objects, reuse a WaitEventSet.  Export a wait
function that is smart enough to handle socket changes under the
covers, and then use that for physical and logical replication.
---
 .../libpqwalreceiver/libpqwalreceiver.c       | 157 ++++++++++++++----
 src/include/replication/walreceiver.h         |   5 +
 2 files changed, 127 insertions(+), 35 deletions(-)

diff --git a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c
index e4fd1f9bb6..319d5b970d 100644
--- a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c
+++ b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c
@@ -23,6 +23,7 @@
 #include "catalog/pg_type.h"
 #include "funcapi.h"
 #include "libpq-fe.h"
+#include "libpq-events.h"
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "pgstat.h"
@@ -41,6 +42,10 @@ struct WalReceiverConn
 {
 	/* Current connection to the primary, if any */
 	PGconn	   *streamConn;
+	/* Wait event set used to wait for I/O */
+	WaitEventSet *wes;
+	/* Used to handle changes in the underlying socket */
+	int			wes_socket_position;
 	/* Used to remember if the connection is logical or physical */
 	bool		logical;
 	/* Buffer for currently read records */
@@ -80,6 +85,7 @@ static WalRcvExecResult *libpqrcv_exec(WalReceiverConn *conn,
 									   const int nRetTypes,
 									   const Oid *retTypes);
 static void libpqrcv_disconnect(WalReceiverConn *conn);
+static int libpqrcv_wait(WalReceiverConn *conn, long timeout, int wait_event);
 
 static WalReceiverFunctionsType PQWalReceiverFunctions = {
 	libpqrcv_connect,
@@ -96,13 +102,20 @@ static WalReceiverFunctionsType PQWalReceiverFunctions = {
 	libpqrcv_create_slot,
 	libpqrcv_get_backend_pid,
 	libpqrcv_exec,
-	libpqrcv_disconnect
+	libpqrcv_disconnect,
+	libpqrcv_wait
 };
 
 /* Prototypes for private functions */
-static PGresult *libpqrcv_PQexec(PGconn *streamConn, const char *query);
-static PGresult *libpqrcv_PQgetResult(PGconn *streamConn);
+static int walrcv_libpqcallback(PGEventId evtId, void *evtInfo,
+								void *passThrough);
+static PGresult *libpqrcv_PQexec(WalReceiverConn *conn, const char *query);
+static PGresult *libpqrcv_PQgetResult(WalReceiverConn *conn);
 static char *stringlist_to_identifierstr(PGconn *conn, List *strings);
+static int libpqrcv_wait_for_socket(WalReceiverConn *conn, int io_flags,
+									long timeout,
+									WaitEvent *occurred_events,
+									WaitEventClient event_type);
 
 /*
  * Module initialization function
@@ -115,6 +128,27 @@ _PG_init(void)
 	WalReceiverFunctions = &PQWalReceiverFunctions;
 }
 
+static int
+walrcv_libpqcallback(PGEventId evtId, void *evtInfo, void *passThrough)
+{
+	PGconn					   *conn PG_USED_FOR_ASSERTS_ONLY;
+	WalReceiverConn			   *walrcvconn = (WalReceiverConn *) passThrough;
+
+	/* return if not interested in the event or nothing to do */
+	if (evtId != PGEVT_CONNDISCONNECTION ||
+		walrcvconn->wes == NULL || walrcvconn->wes_socket_position < 0)
+		return true;
+
+	conn = ((PGEventConnDisconnection *)evtInfo)->conn;
+	Assert(walrcvconn->streamConn == conn);
+
+	/* The socket is already closed. */
+	RemoveWaitEvent(walrcvconn->wes, walrcvconn->wes_socket_position, true);
+	walrcvconn->wes_socket_position = -1;
+
+	return true;
+}
+
 /*
  * Establish the connection to the primary server for XLOG streaming
  *
@@ -129,6 +163,7 @@ libpqrcv_connect(const char *conninfo, bool logical, const char *appname,
 	const char *keys[5];
 	const char *vals[5];
 	int			i = 0;
+	PGRegEventResult regresult;
 
 	/*
 	 * We use the expand_dbname parameter to process the connection string (or
@@ -168,6 +203,23 @@ libpqrcv_connect(const char *conninfo, bool logical, const char *appname,
 		return NULL;
 	}
 
+	/* Create a WaitEventSet that will last as long as the connection. */
+	Assert(conn->streamConn);
+	regresult = PQregisterEventProc(conn->streamConn, walrcv_libpqcallback,
+									"libpqwalrcv disconnect callback", conn);
+	if (regresult != PGEVTREG_SUCCESS)
+	{
+		*err = pchomp(PQerrorMessage(conn->streamConn));
+		return NULL;
+	}
+		
+	conn->wes = CreateWaitEventSet(TopMemoryContext, 3);
+	conn->wes_socket_position =
+		AddWaitEventToSet(conn->wes, WL_SOCKET_READABLE,
+						  PQsocket(conn->streamConn), NULL, NULL);
+	AddWaitEventToSet(conn->wes, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
+	AddWaitEventToSet(conn->wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, NULL, NULL);
+
 	/*
 	 * Poll connection until we have OK or FAILED status.
 	 *
@@ -177,7 +229,7 @@ libpqrcv_connect(const char *conninfo, bool logical, const char *appname,
 	do
 	{
 		int			io_flag;
-		int			rc;
+		WaitEvent	event;
 
 		if (status == PGRES_POLLING_READING)
 			io_flag = WL_SOCKET_READABLE;
@@ -189,21 +241,18 @@ libpqrcv_connect(const char *conninfo, bool logical, const char *appname,
 		else
 			io_flag = WL_SOCKET_WRITEABLE;
 
-		rc = WaitLatchOrSocket(MyLatch,
-							   WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | io_flag,
-							   PQsocket(conn->streamConn),
-							   0,
-							   WAIT_EVENT_LIBPQWALRECEIVER_CONNECT);
+		(void) libpqrcv_wait_for_socket(conn, io_flag, -1, &event,
+										WAIT_EVENT_LIBPQWALRECEIVER_CONNECT);
 
 		/* Interrupted? */
-		if (rc & WL_LATCH_SET)
+		if (event.events == WL_LATCH_SET)
 		{
 			ResetLatch(MyLatch);
 			ProcessWalRcvInterrupts();
 		}
 
 		/* If socket is ready, advance the libpq state machine */
-		if (rc & io_flag)
+		if (event.events == io_flag)
 			status = PQconnectPoll(conn->streamConn);
 	} while (status != PGRES_POLLING_OK && status != PGRES_POLLING_FAILED);
 
@@ -322,7 +371,7 @@ libpqrcv_identify_system(WalReceiverConn *conn, TimeLineID *primary_tli)
 	 * Get the system identifier and timeline ID as a DataRow message from the
 	 * primary server.
 	 */
-	res = libpqrcv_PQexec(conn->streamConn, "IDENTIFY_SYSTEM");
+	res = libpqrcv_PQexec(conn, "IDENTIFY_SYSTEM");
 	if (PQresultStatus(res) != PGRES_TUPLES_OK)
 	{
 		PQclear(res);
@@ -431,7 +480,7 @@ libpqrcv_startstreaming(WalReceiverConn *conn,
 						 options->proto.physical.startpointTLI);
 
 	/* Start streaming. */
-	res = libpqrcv_PQexec(conn->streamConn, cmd.data);
+	res = libpqrcv_PQexec(conn, cmd.data);
 	pfree(cmd.data);
 
 	if (PQresultStatus(res) == PGRES_COMMAND_OK)
@@ -479,7 +528,7 @@ libpqrcv_endstreaming(WalReceiverConn *conn, TimeLineID *next_tli)
 	 * If we had not yet received CopyDone from the backend, PGRES_COPY_OUT is
 	 * also possible in case we aborted the copy in mid-stream.
 	 */
-	res = libpqrcv_PQgetResult(conn->streamConn);
+	res = libpqrcv_PQgetResult(conn);
 	if (PQresultStatus(res) == PGRES_TUPLES_OK)
 	{
 		/*
@@ -493,7 +542,7 @@ libpqrcv_endstreaming(WalReceiverConn *conn, TimeLineID *next_tli)
 		PQclear(res);
 
 		/* the result set should be followed by CommandComplete */
-		res = libpqrcv_PQgetResult(conn->streamConn);
+		res = libpqrcv_PQgetResult(conn);
 	}
 	else if (PQresultStatus(res) == PGRES_COPY_OUT)
 	{
@@ -506,7 +555,7 @@ libpqrcv_endstreaming(WalReceiverConn *conn, TimeLineID *next_tli)
 							pchomp(PQerrorMessage(conn->streamConn)))));
 
 		/* CommandComplete should follow */
-		res = libpqrcv_PQgetResult(conn->streamConn);
+		res = libpqrcv_PQgetResult(conn);
 	}
 
 	if (PQresultStatus(res) != PGRES_COMMAND_OK)
@@ -516,7 +565,7 @@ libpqrcv_endstreaming(WalReceiverConn *conn, TimeLineID *next_tli)
 	PQclear(res);
 
 	/* Verify that there are no more results */
-	res = libpqrcv_PQgetResult(conn->streamConn);
+	res = libpqrcv_PQgetResult(conn);
 	if (res != NULL)
 		ereport(ERROR,
 				(errmsg("unexpected result after CommandComplete: %s",
@@ -540,7 +589,7 @@ libpqrcv_readtimelinehistoryfile(WalReceiverConn *conn,
 	 * Request the primary to send over the history file for given timeline.
 	 */
 	snprintf(cmd, sizeof(cmd), "TIMELINE_HISTORY %u", tli);
-	res = libpqrcv_PQexec(conn->streamConn, cmd);
+	res = libpqrcv_PQexec(conn, cmd);
 	if (PQresultStatus(res) != PGRES_TUPLES_OK)
 	{
 		PQclear(res);
@@ -582,8 +631,9 @@ libpqrcv_readtimelinehistoryfile(WalReceiverConn *conn,
  * May return NULL, rather than an error result, on failure.
  */
 static PGresult *
-libpqrcv_PQexec(PGconn *streamConn, const char *query)
+libpqrcv_PQexec(WalReceiverConn *conn, const char *query)
 {
+	PGconn	   *streamConn = conn->streamConn;
 	PGresult   *lastResult = NULL;
 
 	/*
@@ -606,7 +656,7 @@ libpqrcv_PQexec(PGconn *streamConn, const char *query)
 		/* Wait for, and collect, the next PGresult. */
 		PGresult   *result;
 
-		result = libpqrcv_PQgetResult(streamConn);
+		result = libpqrcv_PQgetResult(conn);
 		if (result == NULL)
 			break;				/* query is complete, or failure */
 
@@ -631,30 +681,28 @@ libpqrcv_PQexec(PGconn *streamConn, const char *query)
  * Perform the equivalent of PQgetResult(), but watch for interrupts.
  */
 static PGresult *
-libpqrcv_PQgetResult(PGconn *streamConn)
+libpqrcv_PQgetResult(WalReceiverConn *conn)
 {
+	PGconn	   *streamConn = conn->streamConn;
+
 	/*
 	 * Collect data until PQgetResult is ready to get the result without
 	 * blocking.
 	 */
 	while (PQisBusy(streamConn))
 	{
-		int			rc;
+		WaitEvent	event;
 
 		/*
 		 * We don't need to break down the sleep into smaller increments,
 		 * since we'll get interrupted by signals and can handle any
 		 * interrupts here.
 		 */
-		rc = WaitLatchOrSocket(MyLatch,
-							   WL_EXIT_ON_PM_DEATH | WL_SOCKET_READABLE |
-							   WL_LATCH_SET,
-							   PQsocket(streamConn),
-							   0,
-							   WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE);
+		(void) libpqrcv_wait_for_socket(conn, WL_SOCKET_READABLE, -1, &event,
+										WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE);
 
 		/* Interrupted? */
-		if (rc & WL_LATCH_SET)
+		if (event.events == WL_LATCH_SET)
 		{
 			ResetLatch(MyLatch);
 			ProcessWalRcvInterrupts();
@@ -681,9 +729,27 @@ libpqrcv_disconnect(WalReceiverConn *conn)
 	PQfinish(conn->streamConn);
 	if (conn->recvBuf != NULL)
 		PQfreemem(conn->recvBuf);
+	FreeWaitEventSet(conn->wes);
 	pfree(conn);
 }
 
+/*
+ * Wait for new data to arrive, or a timeout.
+ */
+static int
+libpqrcv_wait(WalReceiverConn *conn, long timeout, int wait_event)
+{
+	WaitEvent	event;
+	int			rc;
+
+	rc = libpqrcv_wait_for_socket(conn, WL_SOCKET_READABLE,
+								  timeout, &event, wait_event);
+	if (rc == 0)
+		return WL_TIMEOUT;
+
+	return event.events;
+}
+
 /*
  * Receive a message available from XLOG stream.
  *
@@ -701,8 +767,7 @@ libpqrcv_disconnect(WalReceiverConn *conn)
  * ereports on error.
  */
 static int
-libpqrcv_receive(WalReceiverConn *conn, char **buffer,
-				 pgsocket *wait_fd)
+libpqrcv_receive(WalReceiverConn *conn, char **buffer, pgsocket *wait_fd)
 {
 	int			rawlen;
 
@@ -733,13 +798,13 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer,
 	{
 		PGresult   *res;
 
-		res = libpqrcv_PQgetResult(conn->streamConn);
+		res = libpqrcv_PQgetResult(conn);
 		if (PQresultStatus(res) == PGRES_COMMAND_OK)
 		{
 			PQclear(res);
 
 			/* Verify that there are no more results. */
-			res = libpqrcv_PQgetResult(conn->streamConn);
+			res = libpqrcv_PQgetResult(conn);
 			if (res != NULL)
 			{
 				PQclear(res);
@@ -839,7 +904,7 @@ libpqrcv_create_slot(WalReceiverConn *conn, const char *slotname,
 		appendStringInfoString(&cmd, " PHYSICAL RESERVE_WAL");
 	}
 
-	res = libpqrcv_PQexec(conn->streamConn, cmd.data);
+	res = libpqrcv_PQexec(conn, cmd.data);
 	pfree(cmd.data);
 
 	if (PQresultStatus(res) != PGRES_TUPLES_OK)
@@ -963,7 +1028,7 @@ libpqrcv_exec(WalReceiverConn *conn, const char *query,
 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 				 errmsg("the query interface requires a database connection")));
 
-	pgres = libpqrcv_PQexec(conn->streamConn, query);
+	pgres = libpqrcv_PQexec(conn, query);
 
 	switch (PQresultStatus(pgres))
 	{
@@ -1047,3 +1112,25 @@ stringlist_to_identifierstr(PGconn *conn, List *strings)
 
 	return res.data;
 }
+
+/*
+ * Update our WaitEventSet so that we can wait for 'io_flags' on our socket,
+ * considering that the socket might have changed.
+ */
+static int
+libpqrcv_wait_for_socket(WalReceiverConn *conn, int io_flags, long timeout,
+						 WaitEvent *occurred_events,
+						 WaitEventClient event_type)
+{
+	/*  wes is removed on disconnection, add new event if needed */
+	if (conn->wes_socket_position < 0)
+		conn->wes_socket_position =
+			AddWaitEventToSet(conn->wes, io_flags,
+							  PQsocket(conn->streamConn), NULL, NULL);
+	else
+		/* just set to wait for the right event */
+		ModifyWaitEvent(conn->wes, conn->wes_socket_position,
+						io_flags, NULL);
+
+	return WaitEventSetWait(conn->wes, timeout, occurred_events, 1, event_type);
+}
diff --git a/src/include/replication/walreceiver.h b/src/include/replication/walreceiver.h
index e08afc6548..115d6acf18 100644
--- a/src/include/replication/walreceiver.h
+++ b/src/include/replication/walreceiver.h
@@ -239,6 +239,8 @@ typedef WalRcvExecResult *(*walrcv_exec_fn) (WalReceiverConn *conn,
 											 const int nRetTypes,
 											 const Oid *retTypes);
 typedef void (*walrcv_disconnect_fn) (WalReceiverConn *conn);
+typedef int (*walrcv_wait_fn) (WalReceiverConn *conn, long timeout,
+							   int wait_event);
 
 typedef struct WalReceiverFunctionsType
 {
@@ -257,6 +259,7 @@ typedef struct WalReceiverFunctionsType
 	walrcv_get_backend_pid_fn walrcv_get_backend_pid;
 	walrcv_exec_fn walrcv_exec;
 	walrcv_disconnect_fn walrcv_disconnect;
+	walrcv_wait_fn walrcv_wait;
 } WalReceiverFunctionsType;
 
 extern PGDLLIMPORT WalReceiverFunctionsType *WalReceiverFunctions;
@@ -291,6 +294,8 @@ extern PGDLLIMPORT WalReceiverFunctionsType *WalReceiverFunctions;
 	WalReceiverFunctions->walrcv_exec(conn, exec, nRetTypes, retTypes)
 #define walrcv_disconnect(conn) \
 	WalReceiverFunctions->walrcv_disconnect(conn)
+#define walrcv_wait(conn, timeout, wait_event) \
+	WalReceiverFunctions->walrcv_wait(conn, timeout, wait_event)
 
 static inline void
 walrcv_clear_result(WalRcvExecResult *walres)
-- 
2.18.2

0008-Use-a-WaitEventSet-for-postgres_fdw.patch (text/x-patch)
From df330045ef8b7a303f01b38c90e5d1805a71acac Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 25 Feb 2020 15:34:12 +1300
Subject: [PATCH 08/11] Use a WaitEventSet for postgres_fdw.

The same WaitEventSet object will be reused for the life of the
backend.

Author: Thomas Munro
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 contrib/postgres_fdw/connection.c | 98 +++++++++++++++++++++++++++----
 1 file changed, 88 insertions(+), 10 deletions(-)

diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c
index e45647f3ea..0397a16702 100644
--- a/contrib/postgres_fdw/connection.c
+++ b/contrib/postgres_fdw/connection.c
@@ -65,6 +65,12 @@ typedef struct ConnCacheEntry
  */
 static HTAB *ConnectionHash = NULL;
 
+/* Reusable WaitEventSet. */
+static WaitEventSet *ConnectionWaitSet = NULL;
+static int64 ConnectionWaitSetSocketChangeCount = -1;
+static PGconn *ConnectionWaitSetConn = NULL;
+static int ConnectionWaitSetPosition = -1;
+
 /* for assigning cursor numbers and prepared statement numbers */
 static unsigned int cursor_number = 0;
 static unsigned int prep_stmt_number = 0;
@@ -92,6 +98,7 @@ static bool pgfdw_exec_cleanup_query(PGconn *conn, const char *query,
 static bool pgfdw_get_cleanup_result(PGconn *conn, TimestampTz endtime,
 									 PGresult **result);
 static bool UserMappingPasswordRequired(UserMapping *user);
+static int pgfdw_wait_for_socket(PGconn *conn, long timeout);
 
 /*
  * Get a PGconn which can be used to execute queries on the remote PostgreSQL
@@ -115,6 +122,17 @@ GetConnection(UserMapping *user, bool will_prep_stmt)
 	{
 		HASHCTL		ctl;
 
+		/*
+		 * We'll use a single WaitEventSet for the lifetime of this backend,
+		 * and add and remove sockets as appropriate.  Only one socket will
+		 * be in it at a time.
+		 */
+		Assert(ConnectionWaitSet == NULL);
+		ConnectionWaitSet = CreateWaitEventSet(TopMemoryContext, 3);
+		AddWaitEventToSet(ConnectionWaitSet, WL_LATCH_SET, -1, MyLatch, NULL);
+		AddWaitEventToSet(ConnectionWaitSet, WL_EXIT_ON_PM_DEATH, -1, NULL,
+						  NULL);
+
 		MemSet(&ctl, 0, sizeof(ctl));
 		ctl.keysize = sizeof(ConnCacheKey);
 		ctl.entrysize = sizeof(ConnCacheEntry);
@@ -344,6 +362,14 @@ disconnect_pg_server(ConnCacheEntry *entry)
 	if (entry->conn != NULL)
 	{
 		PQfinish(entry->conn);
+		if (ConnectionWaitSetConn == entry->conn)
+		{
+			/* We do this after PQfinish, so we know the socket is closed. */
+			RemoveWaitEvent(ConnectionWaitSet,
+							ConnectionWaitSetPosition,
+							true);
+			ConnectionWaitSetConn = NULL;
+		}
 		entry->conn = NULL;
 		ReleaseExternalFD();
 	}
@@ -603,11 +629,7 @@ pgfdw_get_result(PGconn *conn, const char *query)
 				int			wc;
 
 				/* Sleep until there's something to do */
-				wc = WaitLatchOrSocket(MyLatch,
-									   WL_LATCH_SET | WL_SOCKET_READABLE |
-									   WL_EXIT_ON_PM_DEATH,
-									   PQsocket(conn),
-									   -1L, PG_WAIT_EXTENSION);
+				wc = pgfdw_wait_for_socket(conn, -1);
 				ResetLatch(MyLatch);
 
 				CHECK_FOR_INTERRUPTS();
@@ -1207,11 +1229,7 @@ pgfdw_get_cleanup_result(PGconn *conn, TimestampTz endtime, PGresult **result)
 				cur_timeout = Min(60000, secs * USECS_PER_SEC + microsecs);
 
 				/* Sleep until there's something to do */
-				wc = WaitLatchOrSocket(MyLatch,
-									   WL_LATCH_SET | WL_SOCKET_READABLE |
-									   WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
-									   PQsocket(conn),
-									   cur_timeout, PG_WAIT_EXTENSION);
+				wc = pgfdw_wait_for_socket(conn, cur_timeout);
 				ResetLatch(MyLatch);
 
 				CHECK_FOR_INTERRUPTS();
@@ -1250,3 +1268,63 @@ exit:	;
 		*result = last_res;
 	return timed_out;
 }
+
+static int
+pgfdw_wait_for_socket(PGconn *conn, long timeout)
+{
+	WaitEvent	event;
+	int			rc;
+
+	/* If a different conn is in the set, or the socket changed, remove. */
+	if (ConnectionWaitSetConn)
+	{
+		bool socket_changed =
+			(ConnectionWaitSetSocketChangeCount !=
+			PQsocketChangeCount(ConnectionWaitSetConn));
+
+		if (ConnectionWaitSetConn == conn)
+		{
+			/*
+			 * This connection is already in there, but the socket might have
+			 * changed.  If so, remove it.
+			 */
+			if (socket_changed)
+			{
+				RemoveWaitEvent(ConnectionWaitSet,
+								ConnectionWaitSetPosition,
+								true);
+				ConnectionWaitSetConn = NULL;
+			}
+		}
+		else
+		{
+			/*
+			 * A different connection is in there.  Remove it, being careful
+			 * to report whether the socket was already closed (this affects
+			 * whether we unregister the fd with the kernel).
+			 */
+			RemoveWaitEvent(ConnectionWaitSet,
+							ConnectionWaitSetPosition,
+							socket_changed);
+			ConnectionWaitSetConn = NULL;
+		}
+	}
+
+	/* Do we need to add our connection? */
+	if (ConnectionWaitSetConn == NULL)
+	{
+		ConnectionWaitSetPosition =
+			AddWaitEventToSet(ConnectionWaitSet, WL_SOCKET_READABLE,
+							  PQsocket(conn), NULL, NULL);
+		ConnectionWaitSetConn = conn;
+		ConnectionWaitSetSocketChangeCount = PQsocketChangeCount(conn);
+	}
+
+	/* Finally, we can wait. */
+	rc = WaitEventSetWait(ConnectionWaitSet, timeout, &event, 1,
+						  PG_WAIT_EXTENSION);
+	if (rc == 0)
+		return WL_TIMEOUT;
+
+	return event.events;
+}
-- 
2.18.2

0009-Use-WL_EXIT_ON_PM_DEATH-in-FeBeWaitSet.patchtext/x-patch; charset=us-asciiDownload
From 79d9ba58f686462bebe035e27c5439a96b04a662 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 24 Feb 2020 23:51:09 +1300
Subject: [PATCH 09/11] Use WL_EXIT_ON_PM_DEATH in FeBeWaitSet.

Previously, we'd give either a FATAL message or a silent exit() when
we detected postmaster death, depending on which wait point we were at.
Make the exit more uniform, by using the standard exit facility.

Author: Thomas Munro
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/libpq/be-secure.c | 28 ----------------------------
 src/backend/libpq/pqcomm.c    |  2 +-
 2 files changed, 1 insertion(+), 29 deletions(-)

diff --git a/src/backend/libpq/be-secure.c b/src/backend/libpq/be-secure.c
index 2ae507a902..aec0926d93 100644
--- a/src/backend/libpq/be-secure.c
+++ b/src/backend/libpq/be-secure.c
@@ -184,28 +184,6 @@ retry:
 		WaitEventSetWait(FeBeWaitSet, -1 /* no timeout */ , &event, 1,
 						 WAIT_EVENT_CLIENT_READ);
 
-		/*
-		 * If the postmaster has died, it's not safe to continue running,
-		 * because it is the postmaster's job to kill us if some other backend
-		 * exits uncleanly.  Moreover, we won't run very well in this state;
-		 * helper processes like walwriter and the bgwriter will exit, so
-		 * performance may be poor.  Finally, if we don't exit, pg_ctl will be
-		 * unable to restart the postmaster without manual intervention, so no
-		 * new connections can be accepted.  Exiting clears the deck for a
-		 * postmaster restart.
-		 *
-		 * (Note that we only make this check when we would otherwise sleep on
-		 * our latch.  We might still continue running for a while if the
-		 * postmaster is killed in mid-query, or even through multiple queries
-		 * if we never have to wait for read.  We don't want to burn too many
-		 * cycles checking for this very rare condition, and this should cause
-		 * us to exit quickly in most cases.)
-		 */
-		if (event.events & WL_POSTMASTER_DEATH)
-			ereport(FATAL,
-					(errcode(ERRCODE_ADMIN_SHUTDOWN),
-					 errmsg("terminating connection due to unexpected postmaster exit")));
-
 		/* Handle interrupt. */
 		if (event.events & WL_LATCH_SET)
 		{
@@ -296,12 +274,6 @@ retry:
 		WaitEventSetWait(FeBeWaitSet, -1 /* no timeout */ , &event, 1,
 						 WAIT_EVENT_CLIENT_WRITE);
 
-		/* See comments in secure_read. */
-		if (event.events & WL_POSTMASTER_DEATH)
-			ereport(FATAL,
-					(errcode(ERRCODE_ADMIN_SHUTDOWN),
-					 errmsg("terminating connection due to unexpected postmaster exit")));
-
 		/* Handle interrupt. */
 		if (event.events & WL_LATCH_SET)
 		{
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 7717bb2719..5422175185 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -222,7 +222,7 @@ pq_init(void)
 	AddWaitEventToSet(FeBeWaitSet, WL_SOCKET_WRITEABLE, MyProcPort->sock,
 					  NULL, NULL);
 	AddWaitEventToSet(FeBeWaitSet, WL_LATCH_SET, -1, MyLatch, NULL);
-	AddWaitEventToSet(FeBeWaitSet, WL_POSTMASTER_DEATH, -1, NULL, NULL);
+	AddWaitEventToSet(FeBeWaitSet, WL_EXIT_ON_PM_DEATH, -1, NULL, NULL);
 }
 
 /* --------------------------------
-- 
2.18.2

0010-Use-FeBeWaitSet-for-walsender.c.patchtext/x-patch; charset=us-asciiDownload
From 779a255cd97ceb52f138d7a00034dc6d774881de Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 24 Feb 2020 23:48:29 +1300
Subject: [PATCH 10/11] Use FeBeWaitSet for walsender.c.

Author: Thomas Munro
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/replication/walsender.c | 32 ++++++++++++++---------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index 3f74bc8493..2beedc0f1a 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -1241,7 +1241,7 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
 	/* If we have pending write here, go to slow path */
 	for (;;)
 	{
-		int			wakeEvents;
+		WaitEvent	event;
 		long		sleeptime;
 
 		/* Check for input from the client */
@@ -1258,13 +1258,11 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
 
 		sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp());
 
-		wakeEvents = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH |
-			WL_SOCKET_WRITEABLE | WL_SOCKET_READABLE | WL_TIMEOUT;
-
 		/* Sleep until something happens or we time out */
-		(void) WaitLatchOrSocket(MyLatch, wakeEvents,
-								 MyProcPort->sock, sleeptime,
-								 WAIT_EVENT_WAL_SENDER_WRITE_DATA);
+		ModifyWaitEvent(FeBeWaitSet, 0,
+						WL_SOCKET_WRITEABLE | WL_SOCKET_READABLE, NULL);
+		(void) WaitEventSetWait(FeBeWaitSet, sleeptime, &event, 1,
+								WAIT_EVENT_WAL_SENDER_WRITE_DATA);
 
 		/* Clear any already-pending wakeups */
 		ResetLatch(MyLatch);
@@ -1348,6 +1346,7 @@ WalSndWaitForWal(XLogRecPtr loc)
 
 	for (;;)
 	{
+		WaitEvent	event;
 		long		sleeptime;
 
 		/* Clear any already-pending wakeups */
@@ -1441,15 +1440,14 @@ WalSndWaitForWal(XLogRecPtr loc)
 		 */
 		sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp());
 
-		wakeEvents = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH |
-			WL_SOCKET_READABLE | WL_TIMEOUT;
+		wakeEvents = WL_SOCKET_READABLE;
 
 		if (pq_is_send_pending())
 			wakeEvents |= WL_SOCKET_WRITEABLE;
 
-		(void) WaitLatchOrSocket(MyLatch, wakeEvents,
-								 MyProcPort->sock, sleeptime,
-								 WAIT_EVENT_WAL_SENDER_WAIT_WAL);
+		ModifyWaitEvent(FeBeWaitSet, 0, wakeEvents, NULL);
+		(void) WaitEventSetWait(FeBeWaitSet, sleeptime, &event, 1,
+								WAIT_EVENT_WAL_SENDER_WAIT_WAL);
 	}
 
 	/* reactivate latch so WalSndLoop knows to continue */
@@ -2290,9 +2288,9 @@ WalSndLoop(WalSndSendDataCallback send_data)
 		{
 			long		sleeptime;
 			int			wakeEvents;
+			WaitEvent	event;
 
-			wakeEvents = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT |
-				WL_SOCKET_READABLE;
+			wakeEvents = WL_SOCKET_READABLE;
 
 			/*
 			 * Use fresh timestamp, not last_processing, to reduce the chance
@@ -2304,9 +2302,9 @@ WalSndLoop(WalSndSendDataCallback send_data)
 				wakeEvents |= WL_SOCKET_WRITEABLE;
 
 			/* Sleep until something happens or we time out */
-			(void) WaitLatchOrSocket(MyLatch, wakeEvents,
-									 MyProcPort->sock, sleeptime,
-									 WAIT_EVENT_WAL_SENDER_MAIN);
+			ModifyWaitEvent(FeBeWaitSet, 0, wakeEvents, NULL);
+			(void) WaitEventSetWait(FeBeWaitSet, sleeptime, &event, 1,
+									WAIT_EVENT_WAL_SENDER_MAIN);
 		}
 	}
 }
-- 
2.18.2

0011-Introduce-a-WaitEventSet-for-the-stats-collector.patchtext/x-patch; charset=us-asciiDownload
From 0fd5c662953035a69bc90de796e82dae284284d0 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 21 Jan 2020 12:54:11 +1300
Subject: [PATCH 11/11] Introduce a WaitEventSet for the stats collector.

Author: Thomas Munro
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/postmaster/pgstat.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 107c965336..248ddc0bdd 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -4429,6 +4429,8 @@ PgstatCollectorMain(int argc, char *argv[])
 	int			len;
 	PgStat_Msg	msg;
 	int			wr;
+	WaitEvent	event;
+	WaitEventSet *wes;
 
 	/*
 	 * Ignore all signals usually bound to some action in the postmaster,
@@ -4458,6 +4460,12 @@ PgstatCollectorMain(int argc, char *argv[])
 	pgStatRunningInCollector = true;
 	pgStatDBHash = pgstat_read_statsfiles(InvalidOid, true, true);
 
+	/* Prepare to wait for our latch or data in our socket. */
+	wes = CreateWaitEventSet(CurrentMemoryContext, 3);
+	AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
+	AddWaitEventToSet(wes, WL_POSTMASTER_DEATH, PGINVALID_SOCKET, NULL, NULL);
+	AddWaitEventToSet(wes, WL_SOCKET_READABLE, pgStatSock, NULL, NULL);
+
 	/*
 	 * Loop to process messages until we get SIGQUIT or detect ungraceful
 	 * death of our parent postmaster.
@@ -4636,10 +4644,7 @@ PgstatCollectorMain(int argc, char *argv[])
 
 		/* Sleep until there's something to do */
 #ifndef WIN32
-		wr = WaitLatchOrSocket(MyLatch,
-							   WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE,
-							   pgStatSock, -1L,
-							   WAIT_EVENT_PGSTAT_MAIN);
+		wr = WaitEventSetWait(wes, -1L, &event, 1, WAIT_EVENT_PGSTAT_MAIN);
 #else
 
 		/*
@@ -4652,18 +4657,15 @@ PgstatCollectorMain(int argc, char *argv[])
 		 * to not provoke "using stale statistics" complaints from
 		 * backend_read_statsfile.
 		 */
-		wr = WaitLatchOrSocket(MyLatch,
-							   WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE | WL_TIMEOUT,
-							   pgStatSock,
-							   2 * 1000L /* msec */ ,
-							   WAIT_EVENT_PGSTAT_MAIN);
+		wr = WaitEventSetWait(wes, 2 * 1000L /* msec */, &event, 1,
+							  WAIT_EVENT_PGSTAT_MAIN);
 #endif
 
 		/*
 		 * Emergency bailout if postmaster has died.  This is to avoid the
 		 * necessity for manual cleanup of all postmaster children.
 		 */
-		if (wr & WL_POSTMASTER_DEATH)
+		if (wr == 1 && event.events == WL_POSTMASTER_DEATH)
 			break;
 	}							/* end of outer loop */
 
@@ -4672,6 +4674,8 @@ PgstatCollectorMain(int argc, char *argv[])
 	 */
 	pgstat_write_statsfiles(true, true);
 
+	FreeWaitEventSet(wes);
+
 	exit(0);
 }
 
-- 
2.18.2

#7Kyotaro Horiguchi
horikyota.ntt@gmail.com
In reply to: Thomas Munro (#2)
Re: Reducing WaitEventSet syscall churn

At Fri, 13 Mar 2020 16:21:13 +0900 (JST), Kyotaro Horiguchi <horikyota.ntt@gmail.com> wrote in

0007: "Use a WaitEventSet for postgres_fdw."

Continues..

At Thu, 27 Feb 2020 12:17:45 +1300, Thomas Munro <thomas.munro@gmail.com> wrote in

0007: "Use a WaitEventSet for postgres_fdw."

Create a single WaitEventSet and use it for all FDW connections. By
having our own dedicated WES, we can do the bookkeeping required to
know when sockets have been closed or need to be removed from kernel wait
sets explicitly (which would be much harder to achieve if
CommonWaitSet were to be used like that; you need to know when sockets
are closed by other code, so you can provide fd_closed to
RemoveWaitEvent()).

It is straightforward and looks fine. If we use the libpq-event-based
solution instead of using the changecount, pgfdw_wait_for_socket and
disconnect_pg_server would be a bit simpler.
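
For illustration, a minimal sketch of the kind of event procedure I
mean, using only the existing libpq events; forget_connection_socket()
is a placeholder for the wait-set bookkeeping, not anything in these
patches:

#include "libpq-events.h"

/*
 * Hypothetical event procedure: when libpq resets or destroys a
 * connection, its socket may be gone, so drop it from the long-lived
 * WaitEventSet before we could wait on a stale fd.
 */
static int
pgfdw_connection_event(PGEventId evtId, void *evtInfo, void *passThrough)
{
	if (evtId == PGEVT_CONNRESET)
		forget_connection_socket(((PGEventConnReset *) evtInfo)->conn);
	else if (evtId == PGEVT_CONNDESTROY)
		forget_connection_socket(((PGEventConnDestroy *) evtInfo)->conn);
	return 1;					/* nonzero means success */
}

It would be registered once, right after the connection is
established, e.g.:

	PQregisterEventProc(entry->conn, pgfdw_connection_event,
						"postgres_fdw", NULL);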

Concretely, if you use just one postgres_fdw connection, you'll see
just epoll_wait()/kevent() calls for waits, but whenever you switch
between different connections, you'll see eg EPOLL_DEL/EV_DELETE
followed by EPOLL_ADD/EV_ADD when the set is adjusted (in the kqueue
implementation these could be collapsed into the following wait, but I

Such syscall sequences are shorter than or equal to what we issue now,
so this patch is still an improvement.

haven't done the work for that). An alternative would be to have one
WES per FDW connection, but that seemed wasteful of file descriptors.

In the multi-connection case, we can save both fds and syscalls by
using one WaitEventSet containing all connection fds together.
WaitEventSetWait should take an event mask parameter, or
ModifyWaitEvent should set the mask, in that case. The events are now
completely level-triggered, so we can safely ignore unwanted events.
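
Something like this rough sketch, where the extra wait_mask parameter
is purely hypothetical:

/*
 * Hypothetical variant of WaitEventSetWait() taking a mask, so that a
 * set containing many connection fds can wait for just one of them
 * without adjusting the kernel object first.  Occurred events outside
 * wait_mask are simply not reported; since they are level-triggered,
 * they will be reported again by a later call that does want them.
 */
extern int WaitEventSetWaitMasked(WaitEventSet *set, uint32 wait_mask,
								  long timeout,
								  WaitEvent *occurred_events, int nevents,
								  uint32 wait_event_info);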

0008: "Use WL_EXIT_ON_PM_DEATH in FeBeWaitSet."

The FATAL message you get if you happen to be waiting for IO rather
than waiting somewhere else seems arbitrarily different. By switching
to a standard automatic exit, it opens the possibility of using
FeBeWaitSet in a couple more places that would otherwise need to
create their own WES (see also [1]). Thoughts?

Do we really not need any message on backend disconnection due to
postmaster death? As for the authentication code, it seems to me the
rationale for not writing a log message is that the connection has not
been established at that time. (That depends on what we think the
"connection" is.)

And I suppose that the reason the authentication logic ignores
signals is that clients expect authentication to be completed if at
all possible.

0009: "Use FeBeWaitSet for walsender.c."

Enabled by 0008.

It works and doesn't change behavior. But I found it a bit difficult
to tell which events WaitEventSetWait waits for. Maybe a comment at
the caller sites would be sufficient. I think a comment about the
bare number "0" used as the event position in ModifyWaitEvent is also
needed.

0010: "Introduce a WaitEventSet for the stats collector."

This looks fine. The variable event is defined outside its effective
scope, but almost all of this function's variables are declared the same way.

By the way, I found that pqcomm.c uses -1 instead of PGINVALID_SOCKET
for AddWaitEventToSet.

[1] /messages/by-id/CA+hUKGK=m9dLrq42oWQ4XfK9iDjGiZVwpQ1HkHrAPfG7Kh681g@mail.gmail.com

regards.

--
Kyotaro Horiguchi
NTT Open Source Software Center

#8Daniel Gustafsson
daniel@yesql.se
In reply to: Kyotaro Horiguchi (#6)
Re: Reducing WaitEventSet syscall churn

On 13 Mar 2020, at 08:21, Kyotaro Horiguchi <horikyota.ntt@gmail.com> wrote:

The attached are:
0001-0004 Not changed
0005 Fix interface of PQregisterEventProc
0006 Add new libpq event for this use.
0007 Another version of "0006 Reuse a WaitEventSet in
libpqwalreceiver.c" based on libpq event.
0008-0011 Not changed (old 0007-0010, blindly appended)

Since 0001 has already been applied in 9b8aa0929390a, the patchtester is unable
to make heads or tails of this patchset. Can you please submit a
rebased version without the already applied changes?

cheers ./daniel

#9Tom Lane
tgl@sss.pgh.pa.us
In reply to: Daniel Gustafsson (#8)
Re: Reducing WaitEventSet syscall churn

Daniel Gustafsson <daniel@yesql.se> writes:

On 13 Mar 2020, at 08:21, Kyotaro Horiguchi <horikyota.ntt@gmail.com> wrote:
The attached are:
0001-0004 Not changed
0005 Fix interface of PQregisterEventProc
0006 Add new libpq event for this use.
0007 Another version of "0006 Reuse a WaitEventSet in
libpqwalreceiver.c" based on libpq event.
0008-0011 Not changed (old 0007-0010, blindly appended)

Since 0001 has been applied already in 9b8aa0929390a, the patchtester is unable
to make heads or tails with trying this patchset. Can you please submit a
rebased version without the already applied changes?

While I've not really looked at this patchset, I did happen to notice
0005, and I think that's utterly unacceptable as written. You can't
break API/ABI on a released libpq function. I suppose the patch is
assuming that an enum return value is ABI-equivalent to int, but that
assumption is faulty. Moreover, what's the point? None of the later
patches require this AFAICS. (The patch could probably be salvaged
ABI-wise by making the error codes be integer #define's not an enum,
but I fail to see the point of changing this at all. I also don't
much like the idea of allowing callers to assume that there is a fixed
set of possible failure conditions for PQregisterEventProc. If we
wanted to return error details, it'd likely be better to say that an
error message is left in conn->errorMessage.)
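
(To be concrete about the salvage option, the shape would be something
like this -- these particular macro names are invented for
illustration:

/* ABI-stable: keep the released return type, spell codes as macros */
#define PQ_REGISTER_EVENT_OK      1
#define PQ_REGISTER_EVENT_FAILED  0

extern int	PQregisterEventProc(PGconn *conn, PGEventProc proc,
								const char *name, void *passThrough);

-- but again, I fail to see the need to change anything here.)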

0006 is an even more egregious ABI break; you can't renumber existing
enum values without breaking applications. That in itself could be
fixed by putting the new value at the end. But I'd still object very
strongly to 0006, because I do not think that it's acceptable to
have pqDropConnection correspond to an application-visible event.
We use that all over the place for cases that should not be
application-visible, for example when deciding that a connection attempt
has failed and moving on to the next candidate host. We already have
PGEVT_CONNRESET and PGEVT_CONNDESTROY as application-visible connection
state change events, and I don't see why those aren't sufficient.

(BTW, even if these weren't ABI breaks, where are the documentation
changes to go with them?)

regards, tom lane

#10Thomas Munro
thomas.munro@gmail.com
In reply to: Tom Lane (#9)
6 attachment(s)
Re: Reducing WaitEventSet syscall churn

On Sun, Jul 12, 2020 at 7:22 AM Tom Lane <tgl@sss.pgh.pa.us> wrote:

[...complaints about 0005 and 0006...] We already have
PGEVT_CONNRESET and PGEVT_CONNDESTROY as application-visible connection
state change events, and I don't see why those aren't sufficient.

I think Horiguchi-san's general idea of using event callbacks for this
sounds much more promising than my earlier idea of exposing a change
counter (that really was terrible). If it can be done with existing
events then that's even better. Perhaps he and/or I can look into
that for the next CF.

In the meantime, here's a rebase of the more straightforward patches
in the stack. These are the ones that deal only with fixed sets of
file descriptors, and they survive check-world on Linux,
Linux+EXEC_BACKEND (with ASLR disabled) and FreeBSD, and at least
check on macOS and Windows (my CI recipes need more work to get
check-world working on those two). There's one user-visible change
that I'd appreciate feedback on: I propose to drop the FATAL error
when the postmaster goes away, to make things more consistent. See
below for more on that.

Responding to earlier review from Horiguchi-san:

On Tue, Mar 10, 2020 at 12:20 PM Kyotaro Horiguchi
<horikyota.ntt@gmail.com> wrote:

0002: "Use a long lived WaitEventSet for WaitLatch()."

In the last version, I had a new function WaitMyLatch(), but that
didn't help with recoveryWakeupLatch. Switching between latches
doesn't require a syscall, so I didn't want to have a separate WES and
function just for that. So I went back to using plain old
WaitLatch(), and made it "modify" the latch every time before waiting
on CommonWaitSet. An alternative would be to get rid of the concept
of latches other than MyLatch, and change the function to
WaitMyLatch(). A similar thing happens for exit_on_postmaster_death,
for which I didn't want to have a separate WES, so I just set that
flag every time. Thoughts?

It is surely an improvement over creating a full-fledged WES
every time. The name CommonWaitSet gives the impression that it is
used for a variety of wait event sets, but it is used only by
WaitLatch. So I would name it LatchWaitSet. With that name I won't be
surprised that the function refers to WL_LATCH_SET by index 0
without any explanation when calling ModifyWaitSet.

Ok, I changed it to LatchWaitSet. I also replaced the index 0 with a
symbolic name LatchWaitSetLatchPos, to make that clearer.

@@ -700,7 +739,11 @@ FreeWaitEventSet(WaitEventSet *set)
ReleaseExternalFD();
#elif defined(WAIT_USE_KQUEUE)
close(set->kqueue_fd);
-       ReleaseExternalFD();
+       if (set->kqueue_fd >= 0)
+       {
+               close(set->kqueue_fd);
+               ReleaseExternalFD();
+       }

Did you forget to remove the close() outside the if block?
Don't we need the same amendment for epoll_fd as for kqueue_fd?

Hmm, maybe I screwed that up when resolving a conflict with the
ReleaseExternalFD() stuff. Fixed.

WaitLatch is defined as "If the latch is already set (and WL_LATCH_SET
is given), the function returns immediately.". But now the function
reacts to the latch even if WL_LATCH_SET is not given. I think it is
actually always set, so we need to modify the Assert and the function
comment to follow the change.

It seems a bit silly to call WaitLatch() if you don't want to wait for
a latch, but I think we can keep that comment and logic by assigning
set->latch = NULL when you wait without WL_LATCH_SET. I tried that in
the attached.

0004: "Introduce RemoveWaitEvent()."

We'll need that to be able to handle connections that are closed and
reopened under the covers by libpq (replication, postgres_fdw). We
also wanted this on a couple of other threads for multiplexing FDWs,
to be able to adjust the wait set dynamically for a proposed async
Append feature.

The implementation is a little naive, and leaves holes in the
"pollfds" and "handles" arrays (for poll() and win32 implementations).
That could be improved with a bit more footwork in a later patch.

XXX The Windows version is based on reading documentation. I'd be
very interested to know if check-world passes (especially
contrib/postgres_fdw and src/test/recovery). Unfortunately my
appveyor.yml fu is not yet strong enough.

I didn't find the documentation about INVALID_HANDLE_VALUE in
lpHandles. Could you give me the URL for that?

I was looking for how you do the equivalent of Unix file descriptor -1
in a call to poll(), and somewhere I read that INVALID_HANDLE_VALUE
has the right effect. I can't find that reference now. Apparently it
works because that's the pseudo-handle value -1 that is returned by
GetCurrentProcess(), and waiting for your own process waits forever so
it's a suitable value for holes in an array of event handles. We
should probably call GetCurrentProcess() instead, but really that is
just stand-in code: we should rewrite it so that we don't need holes!
That might require a second array for use by the poll and win32
implementations. Let's return to that in a later CF?
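
Roughly like this, where hole_pos and the handles array stand in for
whatever bookkeeping a proper rewrite would use:

#ifdef WIN32
	/*
	 * Plug a hole left by RemoveWaitEvent() with the current process's
	 * pseudo-handle.  Waiting for your own process never completes while
	 * you are still running, so WaitForMultipleObjects() effectively
	 * ignores the slot.  GetCurrentProcess() returns the same value as
	 * INVALID_HANDLE_VALUE, namely (HANDLE) -1, but says what we mean.
	 */
	set->handles[hole_pos] = GetCurrentProcess();
#endif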

On Mon, Mar 30, 2020 at 6:15 PM Kyotaro Horiguchi
<horikyota.ntt@gmail.com> wrote:

0008: "Use WL_EXIT_ON_PM_DEATH in FeBeWaitSet."

The FATAL message you get if you happen to be waiting for IO rather
than waiting somewhere else seems arbitrarily different. By switching
to a standard automatic exit, it opens the possibility of using
FeBeWaitSet in a couple more places that would otherwise need to
create their own WES (see also [1]). Thoughts?

Do we really not need any message on backend disconnection due to
postmaster death? As for the authentication code, it seems to me the
rationale for not writing a log message is that the connection has not
been established at that time. (That depends on what we think the
"connection" is.)

To me, it looks like this variation is just from a time when
postmaster death handling was less standardised. The comment (removed
by this patch) even introduces the topic of postmaster exit as if
you've never heard of it, in this arbitrary location in the tree, one
wait point among many (admittedly one that is often reached). I don't
think there is any good reason for random timing to affect whether or
not you get a hand crafted FATAL message.

However, if others don't like this change, we could drop this patch
and still use FeBeWaitSet for walsender.c. It'd just need to handle
postmaster death explicitly. It felt weird to be adding new code that
has to handle postmaster death explicitly, which is what led me to
notice that the existing FeBeWaitSet coding (and resulting user
experience) is different from other code.

0009: "Use FeBeWaitSet for walsender.c."

Enabled by 0008.

It works and doesn't change behavior. But I found it a bit difficult
to tell which events WaitEventSetWait waits for. Maybe a comment at
the caller sites would be sufficient. I think a comment about the
bare number "0" used as the event position in ModifyWaitEvent is also
needed.

Agreed. I changed it to use symbolic names
FeBeWaitSet{Socket,Latch}Pos in the new code and also in the
pre-existing code like this.

By the way, I found that pqcomm.c uses -1 instead of PGINVALID_SOCKET
for AddWaitEventToSet.

Oh yeah, that's a pre-existing problem. Fixed, since that code was
changed by the patch anyway.

Attachments:

v5-0002-Use-WaitLatch-for-condition-variables.patchtext/x-patch; charset=US-ASCII; name=v5-0002-Use-WaitLatch-for-condition-variables.patchDownload
From 76026728c2ba2fff65e2bdad4ebd97e674afc594 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 13 Jul 2020 13:44:36 +1200
Subject: [PATCH v5 2/7] Use WaitLatch() for condition variables.

Previously, condition_variable.c created its own long lived
WaitEventSet to avoid extra system calls.  WaitLatch() now
does that, so there is no point in wasting an extra kernel
descriptor.

Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/storage/lmgr/condition_variable.c | 28 ++++---------------
 1 file changed, 5 insertions(+), 23 deletions(-)

diff --git a/src/backend/storage/lmgr/condition_variable.c b/src/backend/storage/lmgr/condition_variable.c
index 37b6a4eecd..2ec00397b4 100644
--- a/src/backend/storage/lmgr/condition_variable.c
+++ b/src/backend/storage/lmgr/condition_variable.c
@@ -30,9 +30,6 @@
 /* Initially, we are not prepared to sleep on any condition variable. */
 static ConditionVariable *cv_sleep_target = NULL;
 
-/* Reusable WaitEventSet. */
-static WaitEventSet *cv_wait_event_set = NULL;
-
 /*
  * Initialize a condition variable.
  */
@@ -62,23 +59,6 @@ ConditionVariablePrepareToSleep(ConditionVariable *cv)
 {
 	int			pgprocno = MyProc->pgprocno;
 
-	/*
-	 * If first time through in this process, create a WaitEventSet, which
-	 * we'll reuse for all condition variable sleeps.
-	 */
-	if (cv_wait_event_set == NULL)
-	{
-		WaitEventSet *new_event_set;
-
-		new_event_set = CreateWaitEventSet(TopMemoryContext, 2);
-		AddWaitEventToSet(new_event_set, WL_LATCH_SET, PGINVALID_SOCKET,
-						  MyLatch, NULL);
-		AddWaitEventToSet(new_event_set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
-						  NULL, NULL);
-		/* Don't set cv_wait_event_set until we have a correct WES. */
-		cv_wait_event_set = new_event_set;
-	}
-
 	/*
 	 * If some other sleep is already prepared, cancel it; this is necessary
 	 * because we have just one static variable tracking the prepared sleep,
@@ -135,6 +115,7 @@ ConditionVariableTimedSleep(ConditionVariable *cv, long timeout,
 	long		cur_timeout = -1;
 	instr_time	start_time;
 	instr_time	cur_time;
+	int			wait_events;
 
 	/*
 	 * If the caller didn't prepare to sleep explicitly, then do so now and
@@ -166,19 +147,20 @@ ConditionVariableTimedSleep(ConditionVariable *cv, long timeout,
 		INSTR_TIME_SET_CURRENT(start_time);
 		Assert(timeout >= 0 && timeout <= INT_MAX);
 		cur_timeout = timeout;
+		wait_events = WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH;
 	}
+	else
+		wait_events = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH;
 
 	while (true)
 	{
-		WaitEvent	event;
 		bool		done = false;
 
 		/*
 		 * Wait for latch to be set.  (If we're awakened for some other
 		 * reason, the code below will cope anyway.)
 		 */
-		(void) WaitEventSetWait(cv_wait_event_set, cur_timeout, &event, 1,
-								wait_event_info);
+		(void) WaitLatch(MyLatch, wait_events, cur_timeout, wait_event_info);
 
 		/* Reset latch before examining the state of the wait list. */
 		ResetLatch(MyLatch);
-- 
2.20.1

v5-0003-Introduce-a-WaitEventSet-for-the-stats-collector.patchtext/x-patch; charset=US-ASCII; name=v5-0003-Introduce-a-WaitEventSet-for-the-stats-collector.patchDownload
From 33e2f433cfeb7234cdef23c12466736707cda7ca Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 13 Jul 2020 13:46:55 +1200
Subject: [PATCH v5 3/7] Introduce a WaitEventSet for the stats collector.

This avoids several system calls for every wait.

Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/postmaster/pgstat.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 88992c2da2..7dbe5d62fd 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -4458,6 +4458,8 @@ PgstatCollectorMain(int argc, char *argv[])
 	int			len;
 	PgStat_Msg	msg;
 	int			wr;
+	WaitEvent	event;
+	WaitEventSet *wes;
 
 	/*
 	 * Ignore all signals usually bound to some action in the postmaster,
@@ -4485,6 +4487,12 @@ PgstatCollectorMain(int argc, char *argv[])
 	pgStatRunningInCollector = true;
 	pgStatDBHash = pgstat_read_statsfiles(InvalidOid, true, true);
 
+	/* Prepare to wait for our latch or data in our socket. */
+	wes = CreateWaitEventSet(CurrentMemoryContext, 3);
+	AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
+	AddWaitEventToSet(wes, WL_POSTMASTER_DEATH, PGINVALID_SOCKET, NULL, NULL);
+	AddWaitEventToSet(wes, WL_SOCKET_READABLE, pgStatSock, NULL, NULL);
+
 	/*
 	 * Loop to process messages until we get SIGQUIT or detect ungraceful
 	 * death of our parent postmaster.
@@ -4672,10 +4680,7 @@ PgstatCollectorMain(int argc, char *argv[])
 
 		/* Sleep until there's something to do */
 #ifndef WIN32
-		wr = WaitLatchOrSocket(MyLatch,
-							   WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE,
-							   pgStatSock, -1L,
-							   WAIT_EVENT_PGSTAT_MAIN);
+		wr = WaitEventSetWait(wes, -1L, &event, 1, WAIT_EVENT_PGSTAT_MAIN);
 #else
 
 		/*
@@ -4688,18 +4693,15 @@ PgstatCollectorMain(int argc, char *argv[])
 		 * to not provoke "using stale statistics" complaints from
 		 * backend_read_statsfile.
 		 */
-		wr = WaitLatchOrSocket(MyLatch,
-							   WL_LATCH_SET | WL_POSTMASTER_DEATH | WL_SOCKET_READABLE | WL_TIMEOUT,
-							   pgStatSock,
-							   2 * 1000L /* msec */ ,
-							   WAIT_EVENT_PGSTAT_MAIN);
+		wr = WaitEventSetWait(wes, 2 * 1000L /* msec */, &event, 1,
+							  WAIT_EVENT_PGSTAT_MAIN);
 #endif
 
 		/*
 		 * Emergency bailout if postmaster has died.  This is to avoid the
 		 * necessity for manual cleanup of all postmaster children.
 		 */
-		if (wr & WL_POSTMASTER_DEATH)
+		if (wr == 1 && event.events == WL_POSTMASTER_DEATH)
 			break;
 	}							/* end of outer loop */
 
@@ -4708,6 +4710,8 @@ PgstatCollectorMain(int argc, char *argv[])
 	 */
 	pgstat_write_statsfiles(true, true);
 
+	FreeWaitEventSet(wes);
+
 	exit(0);
 }
 
-- 
2.20.1

v5-0004-Use-WL_EXIT_ON_PM_DEATH-in-FeBeWaitSet.patchtext/x-patch; charset=US-ASCII; name=v5-0004-Use-WL_EXIT_ON_PM_DEATH-in-FeBeWaitSet.patchDownload
From 3a12cc7a75c73f9f63b22a48467ace221b59ccc1 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 24 Feb 2020 23:51:09 +1300
Subject: [PATCH v5 4/7] Use WL_EXIT_ON_PM_DEATH in FeBeWaitSet.

Previously, we'd give either a FATAL message or a silent exit() when
we detected postmaster death, depending on which wait point we were at.
Make the exit more uniform, by using the standard exit facility.  This
will allow a later patch to use FeBeWaitSet in a new location without
having to add more explicit handling for postmaster death, in line with
our policy of reducing such clutter.

Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/libpq/be-secure.c | 28 ----------------------------
 src/backend/libpq/pqcomm.c    |  2 +-
 2 files changed, 1 insertion(+), 29 deletions(-)

diff --git a/src/backend/libpq/be-secure.c b/src/backend/libpq/be-secure.c
index 2ae507a902..aec0926d93 100644
--- a/src/backend/libpq/be-secure.c
+++ b/src/backend/libpq/be-secure.c
@@ -184,28 +184,6 @@ retry:
 		WaitEventSetWait(FeBeWaitSet, -1 /* no timeout */ , &event, 1,
 						 WAIT_EVENT_CLIENT_READ);
 
-		/*
-		 * If the postmaster has died, it's not safe to continue running,
-		 * because it is the postmaster's job to kill us if some other backend
-		 * exits uncleanly.  Moreover, we won't run very well in this state;
-		 * helper processes like walwriter and the bgwriter will exit, so
-		 * performance may be poor.  Finally, if we don't exit, pg_ctl will be
-		 * unable to restart the postmaster without manual intervention, so no
-		 * new connections can be accepted.  Exiting clears the deck for a
-		 * postmaster restart.
-		 *
-		 * (Note that we only make this check when we would otherwise sleep on
-		 * our latch.  We might still continue running for a while if the
-		 * postmaster is killed in mid-query, or even through multiple queries
-		 * if we never have to wait for read.  We don't want to burn too many
-		 * cycles checking for this very rare condition, and this should cause
-		 * us to exit quickly in most cases.)
-		 */
-		if (event.events & WL_POSTMASTER_DEATH)
-			ereport(FATAL,
-					(errcode(ERRCODE_ADMIN_SHUTDOWN),
-					 errmsg("terminating connection due to unexpected postmaster exit")));
-
 		/* Handle interrupt. */
 		if (event.events & WL_LATCH_SET)
 		{
@@ -296,12 +274,6 @@ retry:
 		WaitEventSetWait(FeBeWaitSet, -1 /* no timeout */ , &event, 1,
 						 WAIT_EVENT_CLIENT_WRITE);
 
-		/* See comments in secure_read. */
-		if (event.events & WL_POSTMASTER_DEATH)
-			ereport(FATAL,
-					(errcode(ERRCODE_ADMIN_SHUTDOWN),
-					 errmsg("terminating connection due to unexpected postmaster exit")));
-
 		/* Handle interrupt. */
 		if (event.events & WL_LATCH_SET)
 		{
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index ac986c0505..6e91581c0b 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -222,7 +222,7 @@ pq_init(void)
 	AddWaitEventToSet(FeBeWaitSet, WL_SOCKET_WRITEABLE, MyProcPort->sock,
 					  NULL, NULL);
 	AddWaitEventToSet(FeBeWaitSet, WL_LATCH_SET, -1, MyLatch, NULL);
-	AddWaitEventToSet(FeBeWaitSet, WL_POSTMASTER_DEATH, -1, NULL, NULL);
+	AddWaitEventToSet(FeBeWaitSet, WL_EXIT_ON_PM_DEATH, -1, NULL, NULL);
 }
 
 /* --------------------------------
-- 
2.20.1

v5-0001-Use-a-long-lived-WaitEventSet-for-WaitLatch.patchtext/x-patch; charset=US-ASCII; name=v5-0001-Use-a-long-lived-WaitEventSet-for-WaitLatch.patchDownload
From 1f8e1a2a1e9c35e70a68b29bebeddf8cc67304c1 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 13 Jul 2020 13:32:47 +1200
Subject: [PATCH v5 1/7] Use a long lived WaitEventSet for WaitLatch().

Create LatchWaitSet at backend startup time, and use it to
implement WaitLatch().  This avoids repeated epoll/kqueue set-up and
tear-down system calls, and makes sure we don't run into EMFILE later
due to lack of file descriptors.

Reorder SubPostmasterMain() slightly so that we restore the
postmaster pipe and Windows signal before we reach
InitPostmasterChild(), to make this work in EXEC_BACKEND
builds.

Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/postmaster/postmaster.c | 24 ++++++-------
 src/backend/storage/ipc/latch.c     | 55 ++++++++++++++++++++++++++---
 src/backend/utils/init/miscinit.c   |  2 ++
 src/include/storage/latch.h         |  1 +
 4 files changed, 66 insertions(+), 16 deletions(-)

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index dec02586c7..fc5ba11fb8 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -4896,9 +4896,6 @@ SubPostmasterMain(int argc, char *argv[])
 	IsPostmasterEnvironment = true;
 	whereToSendOutput = DestNone;
 
-	/* Setup as postmaster child */
-	InitPostmasterChild();
-
 	/* Setup essential subsystems (to ensure elog() behaves sanely) */
 	InitializeGUCOptions();
 
@@ -4913,6 +4910,18 @@ SubPostmasterMain(int argc, char *argv[])
 	/* Close the postmaster's sockets (as soon as we know them) */
 	ClosePostmasterPorts(strcmp(argv[1], "--forklog") == 0);
 
+	/*
+	 * Start our win32 signal implementation. This has to be done after we
+	 * read the backend variables, because we need to pick up the signal pipe
+	 * from the parent process.
+	 */
+#ifdef WIN32
+	pgwin32_signal_initialize();
+#endif
+
+	/* Setup as postmaster child */
+	InitPostmasterChild();
+
 	/*
 	 * Set up memory area for GSS information. Mirrors the code in ConnCreate
 	 * for the non-exec case.
@@ -4956,15 +4965,6 @@ SubPostmasterMain(int argc, char *argv[])
 	if (strcmp(argv[1], "--forkavworker") == 0)
 		AutovacuumWorkerIAm();
 
-	/*
-	 * Start our win32 signal implementation. This has to be done after we
-	 * read the backend variables, because we need to pick up the signal pipe
-	 * from the parent process.
-	 */
-#ifdef WIN32
-	pgwin32_signal_initialize();
-#endif
-
 	/* In EXEC_BACKEND case we will not have inherited these settings */
 	pqinitmask();
 	PG_SETMASK(&BlockSig);
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 91fa4b619b..fd4716ffaa 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -56,6 +56,7 @@
 #include "storage/latch.h"
 #include "storage/pmsignal.h"
 #include "storage/shmem.h"
+#include "utils/memutils.h"
 
 /*
  * Select the fd readiness primitive to use. Normally the "most modern"
@@ -129,6 +130,11 @@ struct WaitEventSet
 #endif
 };
 
+/* A common WaitEventSet used to implement WaitLatch() */
+static WaitEventSet *LatchWaitSet;
+
+#define LatchWaitSetLatchPos 0
+
 #ifndef WIN32
 /* Are we currently in WaitLatch? The signal handler would like to know. */
 static volatile sig_atomic_t waiting = false;
@@ -242,6 +248,24 @@ InitializeLatchSupport(void)
 #endif
 }
 
+void
+InitializeLatchWaitSet(void)
+{
+	int		latch_pos PG_USED_FOR_ASSERTS_ONLY;
+
+	Assert(LatchWaitSet == NULL);
+
+	/* Set up the WaitEventSet used by WaitLatch(). */
+	LatchWaitSet = CreateWaitEventSet(TopMemoryContext, 2);
+	latch_pos = AddWaitEventToSet(LatchWaitSet, WL_LATCH_SET, PGINVALID_SOCKET,
+								  MyLatch, NULL);
+	if (IsUnderPostmaster)
+		AddWaitEventToSet(LatchWaitSet, WL_EXIT_ON_PM_DEATH,
+						  PGINVALID_SOCKET, NULL, NULL);
+
+	Assert(latch_pos == LatchWaitSetLatchPos);
+}
+
 /*
  * Initialize a process-local latch.
  */
@@ -365,8 +389,31 @@ int
 WaitLatch(Latch *latch, int wakeEvents, long timeout,
 		  uint32 wait_event_info)
 {
-	return WaitLatchOrSocket(latch, wakeEvents, PGINVALID_SOCKET, timeout,
-							 wait_event_info);
+	WaitEvent	event;
+
+	/* Postmaster-managed callers must handle postmaster death somehow. */
+	Assert(!IsUnderPostmaster ||
+		   (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
+		   (wakeEvents & WL_POSTMASTER_DEATH));
+
+	/*
+	 * Some callers may have a latch other than MyLatch, or no latch at all, or
+	 * want to handle postmaster death differently.  It's cheap to assign
+	 * those, so just do it every time.
+	 */
+	if (!(wakeEvents & WL_LATCH_SET))
+		latch = NULL;
+	ModifyWaitEvent(LatchWaitSet, LatchWaitSetLatchPos, WL_LATCH_SET, latch);
+	LatchWaitSet->exit_on_postmaster_death =
+		((wakeEvents & WL_EXIT_ON_PM_DEATH) != 0);
+
+	if (WaitEventSetWait(LatchWaitSet,
+						 (wakeEvents & WL_TIMEOUT) ? timeout : -1,
+						 &event, 1,
+						 wait_event_info) == 0)
+		return WL_TIMEOUT;
+	else
+		return event.events;
 }
 
 /*
@@ -830,7 +877,8 @@ AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
 
 /*
  * Change the event mask and, in the WL_LATCH_SET case, the latch associated
- * with the WaitEvent.
+ * with the WaitEvent.  The latch may be changed to NULL to disable the latch
+ * temporarily, and then set back to a latch later.
  *
  * 'pos' is the id returned by AddWaitEventToSet.
  */
@@ -862,7 +910,6 @@ ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
 	if (event->events & WL_LATCH_SET &&
 		events != event->events)
 	{
-		/* we could allow to disable latch events for a while */
 		elog(ERROR, "cannot modify latch event");
 	}
 
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index cca9704d2d..cf8f9579c3 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -120,6 +120,7 @@ InitPostmasterChild(void)
 	InitializeLatchSupport();
 	MyLatch = &LocalLatchData;
 	InitLatch(MyLatch);
+	InitializeLatchWaitSet();
 
 	/*
 	 * If possible, make this process a group leader, so that the postmaster
@@ -152,6 +153,7 @@ InitStandaloneProcess(const char *argv0)
 	InitializeLatchSupport();
 	MyLatch = &LocalLatchData;
 	InitLatch(MyLatch);
+	InitializeLatchWaitSet();
 
 	/* Compute paths, no postmaster to inherit from */
 	if (my_exec_path[0] == '\0')
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index 46ae56cae3..7c742021fb 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -176,6 +176,7 @@ extern int	WaitLatch(Latch *latch, int wakeEvents, long timeout,
 					  uint32 wait_event_info);
 extern int	WaitLatchOrSocket(Latch *latch, int wakeEvents,
 							  pgsocket sock, long timeout, uint32 wait_event_info);
+extern void InitializeLatchWaitSet(void);
 
 /*
  * Unix implementation uses SIGUSR1 for inter-process signaling.
-- 
2.20.1

v5-0005-Introduce-symbolic-names-for-FeBeWaitSet-position.patchtext/x-patch; charset=US-ASCII; name=v5-0005-Introduce-symbolic-names-for-FeBeWaitSet-position.patchDownload
From ff88d4da1e1e96f785be6f277187576e84bb4eda Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 24 Feb 2020 23:48:29 +1300
Subject: [PATCH v5 5/7] Introduce symbolic names for FeBeWaitSet positions.

Previously we used hard coded 0 and 1 to refer to the socket and latch.

Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/libpq/be-secure.c     |  4 ++--
 src/backend/libpq/pqcomm.c        | 18 +++++++++++++++---
 src/backend/utils/init/miscinit.c |  4 ++--
 src/include/libpq/libpq.h         |  3 +++
 4 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/src/backend/libpq/be-secure.c b/src/backend/libpq/be-secure.c
index aec0926d93..2d5cfbd6f8 100644
--- a/src/backend/libpq/be-secure.c
+++ b/src/backend/libpq/be-secure.c
@@ -179,7 +179,7 @@ retry:
 
 		Assert(waitfor);
 
-		ModifyWaitEvent(FeBeWaitSet, 0, waitfor, NULL);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, waitfor, NULL);
 
 		WaitEventSetWait(FeBeWaitSet, -1 /* no timeout */ , &event, 1,
 						 WAIT_EVENT_CLIENT_READ);
@@ -269,7 +269,7 @@ retry:
 
 		Assert(waitfor);
 
-		ModifyWaitEvent(FeBeWaitSet, 0, waitfor, NULL);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, waitfor, NULL);
 
 		WaitEventSetWait(FeBeWaitSet, -1 /* no timeout */ , &event, 1,
 						 WAIT_EVENT_CLIENT_WRITE);
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 6e91581c0b..98fb60ffca 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -191,6 +191,9 @@ WaitEventSet *FeBeWaitSet;
 void
 pq_init(void)
 {
+	int		socket_pos PG_USED_FOR_ASSERTS_ONLY;
+	int		latch_pos PG_USED_FOR_ASSERTS_ONLY;
+
 	/* initialize state variables */
 	PqSendBufferSize = PQ_SEND_BUFFER_SIZE;
 	PqSendBuffer = MemoryContextAlloc(TopMemoryContext, PqSendBufferSize);
@@ -219,10 +222,19 @@ pq_init(void)
 #endif
 
 	FeBeWaitSet = CreateWaitEventSet(TopMemoryContext, 3);
-	AddWaitEventToSet(FeBeWaitSet, WL_SOCKET_WRITEABLE, MyProcPort->sock,
+	socket_pos = AddWaitEventToSet(FeBeWaitSet, WL_SOCKET_WRITEABLE,
+								   MyProcPort->sock, NULL, NULL);
+	latch_pos = AddWaitEventToSet(FeBeWaitSet, WL_LATCH_SET, PGINVALID_SOCKET,
+								  MyLatch, NULL);
+	AddWaitEventToSet(FeBeWaitSet, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
 					  NULL, NULL);
-	AddWaitEventToSet(FeBeWaitSet, WL_LATCH_SET, -1, MyLatch, NULL);
-	AddWaitEventToSet(FeBeWaitSet, WL_EXIT_ON_PM_DEATH, -1, NULL, NULL);
+
+	/*
+	 * The event positions match the order we added them, but let's sanity
+	 * check them to be sure.
+	 */
+	Assert(socket_pos == FeBeWaitSetSocketPos);
+	Assert(latch_pos == FeBeWaitSetLatchPos);
 }
 
 /* --------------------------------
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index cf8f9579c3..bcbf414db5 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -176,7 +176,7 @@ SwitchToSharedLatch(void)
 	MyLatch = &MyProc->procLatch;
 
 	if (FeBeWaitSet)
-		ModifyWaitEvent(FeBeWaitSet, 1, WL_LATCH_SET, MyLatch);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET, MyLatch);
 
 	/*
 	 * Set the shared latch as the local one might have been set. This
@@ -195,7 +195,7 @@ SwitchBackToLocalLatch(void)
 	MyLatch = &LocalLatchData;
 
 	if (FeBeWaitSet)
-		ModifyWaitEvent(FeBeWaitSet, 1, WL_LATCH_SET, MyLatch);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET, MyLatch);
 
 	SetLatch(MyLatch);
 }
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index b1152475ac..d96f69070d 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -55,6 +55,9 @@ extern const PGDLLIMPORT PQcommMethods *PqCommMethods;
  */
 extern WaitEventSet *FeBeWaitSet;
 
+#define FeBeWaitSetSocketPos 0
+#define FeBeWaitSetLatchPos 1
+
 extern int	StreamServerPort(int family, const char *hostName,
 							 unsigned short portNumber, const char *unixSocketDir,
 							 pgsocket ListenSocket[], int MaxListen);
-- 
2.20.1

v5-0006-Use-FeBeWaitSet-for-walsender.c.patchtext/x-patch; charset=US-ASCII; name=v5-0006-Use-FeBeWaitSet-for-walsender.c.patchDownload
From 4363808a8d714c8c157d47946002a1ce0b2b9599 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 24 Feb 2020 23:48:29 +1300
Subject: [PATCH v5 6/7] Use FeBeWaitSet for walsender.c.

This avoids the need to set up and tear down a new WaitEventSet every
time we wait.

Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/replication/walsender.c | 32 ++++++++++++++---------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index 5e2210dd7b..97d9a06620 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -1297,7 +1297,7 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
 	/* If we have pending write here, go to slow path */
 	for (;;)
 	{
-		int			wakeEvents;
+		WaitEvent	event;
 		long		sleeptime;
 
 		/* Check for input from the client */
@@ -1314,13 +1314,11 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
 
 		sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp());
 
-		wakeEvents = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH |
-			WL_SOCKET_WRITEABLE | WL_SOCKET_READABLE | WL_TIMEOUT;
-
 		/* Sleep until something happens or we time out */
-		(void) WaitLatchOrSocket(MyLatch, wakeEvents,
-								 MyProcPort->sock, sleeptime,
-								 WAIT_EVENT_WAL_SENDER_WRITE_DATA);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos,
+						WL_SOCKET_WRITEABLE | WL_SOCKET_READABLE, NULL);
+		(void) WaitEventSetWait(FeBeWaitSet, sleeptime, &event, 1,
+								WAIT_EVENT_WAL_SENDER_WRITE_DATA);
 
 		/* Clear any already-pending wakeups */
 		ResetLatch(MyLatch);
@@ -1398,6 +1396,7 @@ WalSndWaitForWal(XLogRecPtr loc)
 
 	for (;;)
 	{
+		WaitEvent	event;
 		long		sleeptime;
 
 		/* Clear any already-pending wakeups */
@@ -1493,15 +1492,14 @@ WalSndWaitForWal(XLogRecPtr loc)
 		 */
 		sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp());
 
-		wakeEvents = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH |
-			WL_SOCKET_READABLE | WL_TIMEOUT;
+		wakeEvents = WL_SOCKET_READABLE;
 
 		if (pq_is_send_pending())
 			wakeEvents |= WL_SOCKET_WRITEABLE;
 
-		(void) WaitLatchOrSocket(MyLatch, wakeEvents,
-								 MyProcPort->sock, sleeptime,
-								 WAIT_EVENT_WAL_SENDER_WAIT_WAL);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, wakeEvents, NULL);
+		(void) WaitEventSetWait(FeBeWaitSet, sleeptime, &event, 1,
+								WAIT_EVENT_WAL_SENDER_WAIT_WAL);
 	}
 
 	/* reactivate latch so WalSndLoop knows to continue */
@@ -2344,9 +2342,9 @@ WalSndLoop(WalSndSendDataCallback send_data)
 		{
 			long		sleeptime;
 			int			wakeEvents;
+			WaitEvent	event;
 
-			wakeEvents = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT |
-				WL_SOCKET_READABLE;
+			wakeEvents = WL_SOCKET_READABLE;
 
 			/*
 			 * Use fresh timestamp, not last_processing, to reduce the chance
@@ -2358,9 +2356,9 @@ WalSndLoop(WalSndSendDataCallback send_data)
 				wakeEvents |= WL_SOCKET_WRITEABLE;
 
 			/* Sleep until something happens or we time out */
-			(void) WaitLatchOrSocket(MyLatch, wakeEvents,
-									 MyProcPort->sock, sleeptime,
-									 WAIT_EVENT_WAL_SENDER_MAIN);
+			ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, wakeEvents, NULL);
+			(void) WaitEventSetWait(FeBeWaitSet, sleeptime, &event, 1,
+									WAIT_EVENT_WAL_SENDER_MAIN);
 		}
 	}
 }
-- 
2.20.1

#11Thomas Munro
thomas.munro@gmail.com
In reply to: Thomas Munro (#10)
Re: Reducing WaitEventSet syscall churn

On Tue, Jul 14, 2020 at 6:51 PM Thomas Munro <thomas.munro@gmail.com> wrote:

In the meantime, here's a rebase of the more straightforward patches
in the stack. These are the ones that deal only with fixed sets of
file descriptors, and they survive check-world on Linux,
Linux+EXEC_BACKEND (with ASLR disabled) and FreeBSD, and at least
check on macOS and Windows (my CI recipes need more work to get
check-world working on those two). There's one user-visible change
that I'd appreciate feedback on: I propose to drop the FATAL error
when the postmaster goes away, to make things more consistent. See
below for more on that.

Here's the effect of patches 0001-0003 on the number of relevant
system calls generated by "make check" on Linux and FreeBSD, according
to strace/truss -f -c:

epoll_create1: 4,825 -> 865
epoll_ctl: 12,454 -> 2,721
epoll_wait: ~45k -> ~45k
close: ~81k -> ~77k

kqueue: 4,618 -> 866
kevent: ~54k -> ~46k
close: ~65k -> ~61k

I pushed those three patches, but will wait for more discussion on the rest.

#12Thomas Munro
thomas.munro@gmail.com
In reply to: Thomas Munro (#11)
3 attachment(s)
Re: Reducing WaitEventSet syscall churn

On Thu, Jul 30, 2020 at 5:50 PM Thomas Munro <thomas.munro@gmail.com> wrote:

I pushed those three patches, but will wait for more discussion on the rest.

And here's a rebase, to make cfbot happy.

Attachments:

v6-0001-Use-WL_EXIT_ON_PM_DEATH-in-FeBeWaitSet.patchtext/x-patch; charset=US-ASCII; name=v6-0001-Use-WL_EXIT_ON_PM_DEATH-in-FeBeWaitSet.patchDownload
From a760053ac6fea1b9a829e9a170902a555863ce66 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 24 Feb 2020 23:51:09 +1300
Subject: [PATCH v6 1/4] Use WL_EXIT_ON_PM_DEATH in FeBeWaitSet.

Previously, we'd give either a FATAL message or a silent exit() when
we detected postmaster death, depending on which wait point we were at.
Make the exit more uniform, by using the standard exit facility.  This
will allow a later patch to use FeBeWaitSet in a new location without
having to add more explicit handling for postmaster death, in line with
our policy of reducing such clutter.

Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/libpq/be-secure.c | 28 ----------------------------
 src/backend/libpq/pqcomm.c    |  2 +-
 2 files changed, 1 insertion(+), 29 deletions(-)

diff --git a/src/backend/libpq/be-secure.c b/src/backend/libpq/be-secure.c
index 2ae507a902..aec0926d93 100644
--- a/src/backend/libpq/be-secure.c
+++ b/src/backend/libpq/be-secure.c
@@ -184,28 +184,6 @@ retry:
 		WaitEventSetWait(FeBeWaitSet, -1 /* no timeout */ , &event, 1,
 						 WAIT_EVENT_CLIENT_READ);
 
-		/*
-		 * If the postmaster has died, it's not safe to continue running,
-		 * because it is the postmaster's job to kill us if some other backend
-		 * exits uncleanly.  Moreover, we won't run very well in this state;
-		 * helper processes like walwriter and the bgwriter will exit, so
-		 * performance may be poor.  Finally, if we don't exit, pg_ctl will be
-		 * unable to restart the postmaster without manual intervention, so no
-		 * new connections can be accepted.  Exiting clears the deck for a
-		 * postmaster restart.
-		 *
-		 * (Note that we only make this check when we would otherwise sleep on
-		 * our latch.  We might still continue running for a while if the
-		 * postmaster is killed in mid-query, or even through multiple queries
-		 * if we never have to wait for read.  We don't want to burn too many
-		 * cycles checking for this very rare condition, and this should cause
-		 * us to exit quickly in most cases.)
-		 */
-		if (event.events & WL_POSTMASTER_DEATH)
-			ereport(FATAL,
-					(errcode(ERRCODE_ADMIN_SHUTDOWN),
-					 errmsg("terminating connection due to unexpected postmaster exit")));
-
 		/* Handle interrupt. */
 		if (event.events & WL_LATCH_SET)
 		{
@@ -296,12 +274,6 @@ retry:
 		WaitEventSetWait(FeBeWaitSet, -1 /* no timeout */ , &event, 1,
 						 WAIT_EVENT_CLIENT_WRITE);
 
-		/* See comments in secure_read. */
-		if (event.events & WL_POSTMASTER_DEATH)
-			ereport(FATAL,
-					(errcode(ERRCODE_ADMIN_SHUTDOWN),
-					 errmsg("terminating connection due to unexpected postmaster exit")));
-
 		/* Handle interrupt. */
 		if (event.events & WL_LATCH_SET)
 		{
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index ac986c0505..6e91581c0b 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -222,7 +222,7 @@ pq_init(void)
 	AddWaitEventToSet(FeBeWaitSet, WL_SOCKET_WRITEABLE, MyProcPort->sock,
 					  NULL, NULL);
 	AddWaitEventToSet(FeBeWaitSet, WL_LATCH_SET, -1, MyLatch, NULL);
-	AddWaitEventToSet(FeBeWaitSet, WL_POSTMASTER_DEATH, -1, NULL, NULL);
+	AddWaitEventToSet(FeBeWaitSet, WL_EXIT_ON_PM_DEATH, -1, NULL, NULL);
 }
 
 /* --------------------------------
-- 
2.20.1

v6-0002-Introduce-symbolic-names-for-FeBeWaitSet-position.patch (text/x-patch)
From 91c7a5a8a04363274ea0e74a69d89e261d7a0e4f Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 24 Feb 2020 23:48:29 +1300
Subject: [PATCH v6 2/4] Introduce symbolic names for FeBeWaitSet positions.

Previously we used hard coded 0 and 1 to refer to the socket and latch.

Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/libpq/be-secure.c     |  4 ++--
 src/backend/libpq/pqcomm.c        | 18 +++++++++++++++---
 src/backend/utils/init/miscinit.c |  4 ++--
 src/include/libpq/libpq.h         |  3 +++
 4 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/src/backend/libpq/be-secure.c b/src/backend/libpq/be-secure.c
index aec0926d93..2d5cfbd6f8 100644
--- a/src/backend/libpq/be-secure.c
+++ b/src/backend/libpq/be-secure.c
@@ -179,7 +179,7 @@ retry:
 
 		Assert(waitfor);
 
-		ModifyWaitEvent(FeBeWaitSet, 0, waitfor, NULL);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, waitfor, NULL);
 
 		WaitEventSetWait(FeBeWaitSet, -1 /* no timeout */ , &event, 1,
 						 WAIT_EVENT_CLIENT_READ);
@@ -269,7 +269,7 @@ retry:
 
 		Assert(waitfor);
 
-		ModifyWaitEvent(FeBeWaitSet, 0, waitfor, NULL);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, waitfor, NULL);
 
 		WaitEventSetWait(FeBeWaitSet, -1 /* no timeout */ , &event, 1,
 						 WAIT_EVENT_CLIENT_WRITE);
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 6e91581c0b..98fb60ffca 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -191,6 +191,9 @@ WaitEventSet *FeBeWaitSet;
 void
 pq_init(void)
 {
+	int		socket_pos PG_USED_FOR_ASSERTS_ONLY;
+	int		latch_pos PG_USED_FOR_ASSERTS_ONLY;
+
 	/* initialize state variables */
 	PqSendBufferSize = PQ_SEND_BUFFER_SIZE;
 	PqSendBuffer = MemoryContextAlloc(TopMemoryContext, PqSendBufferSize);
@@ -219,10 +222,19 @@ pq_init(void)
 #endif
 
 	FeBeWaitSet = CreateWaitEventSet(TopMemoryContext, 3);
-	AddWaitEventToSet(FeBeWaitSet, WL_SOCKET_WRITEABLE, MyProcPort->sock,
+	socket_pos = AddWaitEventToSet(FeBeWaitSet, WL_SOCKET_WRITEABLE,
+								   MyProcPort->sock, NULL, NULL);
+	latch_pos = AddWaitEventToSet(FeBeWaitSet, WL_LATCH_SET, PGINVALID_SOCKET,
+								  MyLatch, NULL);
+	AddWaitEventToSet(FeBeWaitSet, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
 					  NULL, NULL);
-	AddWaitEventToSet(FeBeWaitSet, WL_LATCH_SET, -1, MyLatch, NULL);
-	AddWaitEventToSet(FeBeWaitSet, WL_EXIT_ON_PM_DEATH, -1, NULL, NULL);
+
+	/*
+	 * The event positions match the order we added them, but let's sanity
+	 * check them to be sure.
+	 */
+	Assert(socket_pos == FeBeWaitSetSocketPos);
+	Assert(latch_pos == FeBeWaitSetLatchPos);
 }
 
 /* --------------------------------
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index cf8f9579c3..bcbf414db5 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -176,7 +176,7 @@ SwitchToSharedLatch(void)
 	MyLatch = &MyProc->procLatch;
 
 	if (FeBeWaitSet)
-		ModifyWaitEvent(FeBeWaitSet, 1, WL_LATCH_SET, MyLatch);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET, MyLatch);
 
 	/*
 	 * Set the shared latch as the local one might have been set. This
@@ -195,7 +195,7 @@ SwitchBackToLocalLatch(void)
 	MyLatch = &LocalLatchData;
 
 	if (FeBeWaitSet)
-		ModifyWaitEvent(FeBeWaitSet, 1, WL_LATCH_SET, MyLatch);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET, MyLatch);
 
 	SetLatch(MyLatch);
 }
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index b1152475ac..d96f69070d 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -55,6 +55,9 @@ extern const PGDLLIMPORT PQcommMethods *PqCommMethods;
  */
 extern WaitEventSet *FeBeWaitSet;
 
+#define FeBeWaitSetSocketPos 0
+#define FeBeWaitSetLatchPos 1
+
 extern int	StreamServerPort(int family, const char *hostName,
 							 unsigned short portNumber, const char *unixSocketDir,
 							 pgsocket ListenSocket[], int MaxListen);
-- 
2.20.1

v6-0003-Use-FeBeWaitSet-for-walsender.c.patch (text/x-patch)
From cd8c3901d75d050267f7765ea5306af49c2bcef0 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 24 Feb 2020 23:48:29 +1300
Subject: [PATCH v6 3/4] Use FeBeWaitSet for walsender.c.

This avoids the need to set up and tear down a new WaitEventSet every
time we wait.

Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/replication/walsender.c | 32 ++++++++++++++---------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index 5e2210dd7b..97d9a06620 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -1297,7 +1297,7 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
 	/* If we have pending write here, go to slow path */
 	for (;;)
 	{
-		int			wakeEvents;
+		WaitEvent	event;
 		long		sleeptime;
 
 		/* Check for input from the client */
@@ -1314,13 +1314,11 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
 
 		sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp());
 
-		wakeEvents = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH |
-			WL_SOCKET_WRITEABLE | WL_SOCKET_READABLE | WL_TIMEOUT;
-
 		/* Sleep until something happens or we time out */
-		(void) WaitLatchOrSocket(MyLatch, wakeEvents,
-								 MyProcPort->sock, sleeptime,
-								 WAIT_EVENT_WAL_SENDER_WRITE_DATA);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos,
+						WL_SOCKET_WRITEABLE | WL_SOCKET_READABLE, NULL);
+		(void) WaitEventSetWait(FeBeWaitSet, sleeptime, &event, 1,
+								WAIT_EVENT_WAL_SENDER_WRITE_DATA);
 
 		/* Clear any already-pending wakeups */
 		ResetLatch(MyLatch);
@@ -1398,6 +1396,7 @@ WalSndWaitForWal(XLogRecPtr loc)
 
 	for (;;)
 	{
+		WaitEvent	event;
 		long		sleeptime;
 
 		/* Clear any already-pending wakeups */
@@ -1493,15 +1492,14 @@ WalSndWaitForWal(XLogRecPtr loc)
 		 */
 		sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp());
 
-		wakeEvents = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH |
-			WL_SOCKET_READABLE | WL_TIMEOUT;
+		wakeEvents = WL_SOCKET_READABLE;
 
 		if (pq_is_send_pending())
 			wakeEvents |= WL_SOCKET_WRITEABLE;
 
-		(void) WaitLatchOrSocket(MyLatch, wakeEvents,
-								 MyProcPort->sock, sleeptime,
-								 WAIT_EVENT_WAL_SENDER_WAIT_WAL);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, wakeEvents, NULL);
+		(void) WaitEventSetWait(FeBeWaitSet, sleeptime, &event, 1,
+								WAIT_EVENT_WAL_SENDER_WAIT_WAL);
 	}
 
 	/* reactivate latch so WalSndLoop knows to continue */
@@ -2344,9 +2342,9 @@ WalSndLoop(WalSndSendDataCallback send_data)
 		{
 			long		sleeptime;
 			int			wakeEvents;
+			WaitEvent	event;
 
-			wakeEvents = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT |
-				WL_SOCKET_READABLE;
+			wakeEvents = WL_SOCKET_READABLE;
 
 			/*
 			 * Use fresh timestamp, not last_processing, to reduce the chance
@@ -2358,9 +2356,9 @@ WalSndLoop(WalSndSendDataCallback send_data)
 				wakeEvents |= WL_SOCKET_WRITEABLE;
 
 			/* Sleep until something happens or we time out */
-			(void) WaitLatchOrSocket(MyLatch, wakeEvents,
-									 MyProcPort->sock, sleeptime,
-									 WAIT_EVENT_WAL_SENDER_MAIN);
+			ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, wakeEvents, NULL);
+			(void) WaitEventSetWait(FeBeWaitSet, sleeptime, &event, 1,
+									WAIT_EVENT_WAL_SENDER_MAIN);
 		}
 	}
 }
-- 
2.20.1

#13Thomas Munro
thomas.munro@gmail.com
In reply to: Thomas Munro (#12)
3 attachment(s)
Re: Reducing WaitEventSet syscall churn

On Fri, Jul 31, 2020 at 9:42 AM Thomas Munro <thomas.munro@gmail.com> wrote:

On Thu, Jul 30, 2020 at 5:50 PM Thomas Munro <thomas.munro@gmail.com> wrote:

I pushed those three patches, but will wait for more discussion on the rest.

And here's a rebase, to make cfbot happy.

And again.

To restate the two goals of the remaining patches:

1. Use FeBeWaitSet for walsender, instead of setting up and tearing
down temporary WaitEventSets all the time.
2. Use the standard WL_EXIT_ON_PM_DEATH flag for FeBeWaitSet, instead
of the special-case ereport(FATAL, ... "terminating connection due to
unexpected postmaster exit" ...).

For point 2, the question I am raising is: why should users get a
special FATAL message at some wait points but not others on postmaster
death? However, if people are attached to that behaviour, we could
still achieve goal 1 without goal 2 by handling PM death explicitly in
walsender.c, and I'd be happy to post an alternative patch set along
those lines.
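
To make goal 1 concrete, here is the shape of the change, condensed
from the attached 0002/0003 patches (the surrounding loop logic and
variable setup are omitted):

    /*
     * Before: each call builds a temporary WaitEventSet internally and
     * tears it down again, costing setup/teardown syscalls per sleep.
     */
    (void) WaitLatchOrSocket(MyLatch,
                             WL_LATCH_SET | WL_EXIT_ON_PM_DEATH |
                             WL_SOCKET_READABLE | WL_TIMEOUT,
                             MyProcPort->sock, sleeptime,
                             WAIT_EVENT_WAL_SENDER_MAIN);

    /*
     * After: reuse the long-lived FeBeWaitSet, adjusting only the
     * socket events of interest before each wait.
     */
    WaitEvent   event;

    ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos,
                    WL_SOCKET_READABLE, NULL);
    (void) WaitEventSetWait(FeBeWaitSet, sleeptime, &event, 1,
                            WAIT_EVENT_WAL_SENDER_MAIN);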

Attachments:

v7-0001-Use-WL_EXIT_ON_PM_DEATH-in-FeBeWaitSet.patch (application/x-patch)
From 72fa3b6858b3ab121fd60533ed7d6a4dcdbc4a5c Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 24 Feb 2020 23:51:09 +1300
Subject: [PATCH v7 1/3] Use WL_EXIT_ON_PM_DEATH in FeBeWaitSet.

Previously, we'd give either a FATAL message or a silent exit() when
we detected postmaster death, depending on which wait point we were at.
Make the exit more uniform, by using the standard exit facility.  This
will allow a later patch to use FeBeWaitSet in a new location without
having to add more explicit handling for postmaster death, in line with
our policy of reducing such clutter.

Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/libpq/be-secure.c | 28 ----------------------------
 src/backend/libpq/pqcomm.c    |  2 +-
 2 files changed, 1 insertion(+), 29 deletions(-)

diff --git a/src/backend/libpq/be-secure.c b/src/backend/libpq/be-secure.c
index 4cf139a223..746fce8288 100644
--- a/src/backend/libpq/be-secure.c
+++ b/src/backend/libpq/be-secure.c
@@ -184,28 +184,6 @@ retry:
 		WaitEventSetWait(FeBeWaitSet, -1 /* no timeout */ , &event, 1,
 						 WAIT_EVENT_CLIENT_READ);
 
-		/*
-		 * If the postmaster has died, it's not safe to continue running,
-		 * because it is the postmaster's job to kill us if some other backend
-		 * exits uncleanly.  Moreover, we won't run very well in this state;
-		 * helper processes like walwriter and the bgwriter will exit, so
-		 * performance may be poor.  Finally, if we don't exit, pg_ctl will be
-		 * unable to restart the postmaster without manual intervention, so no
-		 * new connections can be accepted.  Exiting clears the deck for a
-		 * postmaster restart.
-		 *
-		 * (Note that we only make this check when we would otherwise sleep on
-		 * our latch.  We might still continue running for a while if the
-		 * postmaster is killed in mid-query, or even through multiple queries
-		 * if we never have to wait for read.  We don't want to burn too many
-		 * cycles checking for this very rare condition, and this should cause
-		 * us to exit quickly in most cases.)
-		 */
-		if (event.events & WL_POSTMASTER_DEATH)
-			ereport(FATAL,
-					(errcode(ERRCODE_ADMIN_SHUTDOWN),
-					 errmsg("terminating connection due to unexpected postmaster exit")));
-
 		/* Handle interrupt. */
 		if (event.events & WL_LATCH_SET)
 		{
@@ -296,12 +274,6 @@ retry:
 		WaitEventSetWait(FeBeWaitSet, -1 /* no timeout */ , &event, 1,
 						 WAIT_EVENT_CLIENT_WRITE);
 
-		/* See comments in secure_read. */
-		if (event.events & WL_POSTMASTER_DEATH)
-			ereport(FATAL,
-					(errcode(ERRCODE_ADMIN_SHUTDOWN),
-					 errmsg("terminating connection due to unexpected postmaster exit")));
-
 		/* Handle interrupt. */
 		if (event.events & WL_LATCH_SET)
 		{
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 1e6b6db540..289d9bd2da 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -222,7 +222,7 @@ pq_init(void)
 	AddWaitEventToSet(FeBeWaitSet, WL_SOCKET_WRITEABLE, MyProcPort->sock,
 					  NULL, NULL);
 	AddWaitEventToSet(FeBeWaitSet, WL_LATCH_SET, -1, MyLatch, NULL);
-	AddWaitEventToSet(FeBeWaitSet, WL_POSTMASTER_DEATH, -1, NULL, NULL);
+	AddWaitEventToSet(FeBeWaitSet, WL_EXIT_ON_PM_DEATH, -1, NULL, NULL);
 }
 
 /* --------------------------------
-- 
2.20.1

v7-0002-Introduce-symbolic-names-for-FeBeWaitSet-position.patch (application/x-patch)
From ec7348f4fc794eed6cfd6bb3897086ef29c213ae Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 24 Feb 2020 23:48:29 +1300
Subject: [PATCH v7 2/3] Introduce symbolic names for FeBeWaitSet positions.

Previously we used hard coded 0 and 1 to refer to the socket and latch.

Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/libpq/be-secure.c     |  4 ++--
 src/backend/libpq/pqcomm.c        | 18 +++++++++++++++---
 src/backend/utils/init/miscinit.c |  4 ++--
 src/include/libpq/libpq.h         |  3 +++
 4 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/src/backend/libpq/be-secure.c b/src/backend/libpq/be-secure.c
index 746fce8288..0531db8fce 100644
--- a/src/backend/libpq/be-secure.c
+++ b/src/backend/libpq/be-secure.c
@@ -179,7 +179,7 @@ retry:
 
 		Assert(waitfor);
 
-		ModifyWaitEvent(FeBeWaitSet, 0, waitfor, NULL);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, waitfor, NULL);
 
 		WaitEventSetWait(FeBeWaitSet, -1 /* no timeout */ , &event, 1,
 						 WAIT_EVENT_CLIENT_READ);
@@ -269,7 +269,7 @@ retry:
 
 		Assert(waitfor);
 
-		ModifyWaitEvent(FeBeWaitSet, 0, waitfor, NULL);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, waitfor, NULL);
 
 		WaitEventSetWait(FeBeWaitSet, -1 /* no timeout */ , &event, 1,
 						 WAIT_EVENT_CLIENT_WRITE);
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 289d9bd2da..5e006ef493 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -191,6 +191,9 @@ WaitEventSet *FeBeWaitSet;
 void
 pq_init(void)
 {
+	int		socket_pos PG_USED_FOR_ASSERTS_ONLY;
+	int		latch_pos PG_USED_FOR_ASSERTS_ONLY;
+
 	/* initialize state variables */
 	PqSendBufferSize = PQ_SEND_BUFFER_SIZE;
 	PqSendBuffer = MemoryContextAlloc(TopMemoryContext, PqSendBufferSize);
@@ -219,10 +222,19 @@ pq_init(void)
 #endif
 
 	FeBeWaitSet = CreateWaitEventSet(TopMemoryContext, 3);
-	AddWaitEventToSet(FeBeWaitSet, WL_SOCKET_WRITEABLE, MyProcPort->sock,
+	socket_pos = AddWaitEventToSet(FeBeWaitSet, WL_SOCKET_WRITEABLE,
+								   MyProcPort->sock, NULL, NULL);
+	latch_pos = AddWaitEventToSet(FeBeWaitSet, WL_LATCH_SET, PGINVALID_SOCKET,
+								  MyLatch, NULL);
+	AddWaitEventToSet(FeBeWaitSet, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
 					  NULL, NULL);
-	AddWaitEventToSet(FeBeWaitSet, WL_LATCH_SET, -1, MyLatch, NULL);
-	AddWaitEventToSet(FeBeWaitSet, WL_EXIT_ON_PM_DEATH, -1, NULL, NULL);
+
+	/*
+	 * The event positions match the order we added them, but let's sanity
+	 * check them to be sure.
+	 */
+	Assert(socket_pos == FeBeWaitSetSocketPos);
+	Assert(latch_pos == FeBeWaitSetLatchPos);
 }
 
 /* --------------------------------
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index 0f67b99cc5..69e945f3a1 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -202,7 +202,7 @@ SwitchToSharedLatch(void)
 	MyLatch = &MyProc->procLatch;
 
 	if (FeBeWaitSet)
-		ModifyWaitEvent(FeBeWaitSet, 1, WL_LATCH_SET, MyLatch);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET, MyLatch);
 
 	/*
 	 * Set the shared latch as the local one might have been set. This
@@ -221,7 +221,7 @@ SwitchBackToLocalLatch(void)
 	MyLatch = &LocalLatchData;
 
 	if (FeBeWaitSet)
-		ModifyWaitEvent(FeBeWaitSet, 1, WL_LATCH_SET, MyLatch);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET, MyLatch);
 
 	SetLatch(MyLatch);
 }
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index a55898c85a..3037b5789d 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -55,6 +55,9 @@ extern const PGDLLIMPORT PQcommMethods *PqCommMethods;
  */
 extern WaitEventSet *FeBeWaitSet;
 
+#define FeBeWaitSetSocketPos 0
+#define FeBeWaitSetLatchPos 1
+
 extern int	StreamServerPort(int family, const char *hostName,
 							 unsigned short portNumber, const char *unixSocketDir,
 							 pgsocket ListenSocket[], int MaxListen);
-- 
2.20.1

v7-0003-Use-FeBeWaitSet-for-walsender.c.patch (application/x-patch)
From cc9731331feb91d804f3ba18596eee0eb4986ca6 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 24 Feb 2020 23:48:29 +1300
Subject: [PATCH v7 3/3] Use FeBeWaitSet for walsender.c.

This avoids the need to set up and tear down a new WaitEventSet every
time we wait.

Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/replication/walsender.c | 35 ++++++++++++++---------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index fe0d368a35..4b86fc4e30 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -1293,7 +1293,7 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
 	/* If we have pending write here, go to slow path */
 	for (;;)
 	{
-		int			wakeEvents;
+		WaitEvent	event;
 		long		sleeptime;
 
 		/* Check for input from the client */
@@ -1310,13 +1310,11 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
 
 		sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp());
 
-		wakeEvents = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH |
-			WL_SOCKET_WRITEABLE | WL_SOCKET_READABLE | WL_TIMEOUT;
-
 		/* Sleep until something happens or we time out */
-		(void) WaitLatchOrSocket(MyLatch, wakeEvents,
-								 MyProcPort->sock, sleeptime,
-								 WAIT_EVENT_WAL_SENDER_WRITE_DATA);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos,
+						WL_SOCKET_WRITEABLE | WL_SOCKET_READABLE, NULL);
+		(void) WaitEventSetWait(FeBeWaitSet, sleeptime, &event, 1,
+								WAIT_EVENT_WAL_SENDER_WRITE_DATA);
 
 		/* Clear any already-pending wakeups */
 		ResetLatch(MyLatch);
@@ -1394,6 +1392,7 @@ WalSndWaitForWal(XLogRecPtr loc)
 
 	for (;;)
 	{
+		WaitEvent	event;
 		long		sleeptime;
 
 		/* Clear any already-pending wakeups */
@@ -1486,15 +1485,14 @@ WalSndWaitForWal(XLogRecPtr loc)
 		 */
 		sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp());
 
-		wakeEvents = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH |
-			WL_SOCKET_READABLE | WL_TIMEOUT;
+		wakeEvents = WL_SOCKET_READABLE;
 
 		if (pq_is_send_pending())
 			wakeEvents |= WL_SOCKET_WRITEABLE;
 
-		(void) WaitLatchOrSocket(MyLatch, wakeEvents,
-								 MyProcPort->sock, sleeptime,
-								 WAIT_EVENT_WAL_SENDER_WAIT_WAL);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, wakeEvents, NULL);
+		(void) WaitEventSetWait(FeBeWaitSet, sleeptime, &event, 1,
+								WAIT_EVENT_WAL_SENDER_WAIT_WAL);
 	}
 
 	/* reactivate latch so WalSndLoop knows to continue */
@@ -2353,11 +2351,12 @@ WalSndLoop(WalSndSendDataCallback send_data)
 		{
 			long		sleeptime;
 			int			wakeEvents;
-
-			wakeEvents = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT;
+			WaitEvent	event;
 
 			if (!streamingDoneReceiving)
-				wakeEvents |= WL_SOCKET_READABLE;
+				wakeEvents = WL_SOCKET_READABLE;
+			else
+				wakeEvents = 0;
 
 			/*
 			 * Use fresh timestamp, not last_processing, to reduce the chance
@@ -2369,9 +2368,9 @@ WalSndLoop(WalSndSendDataCallback send_data)
 				wakeEvents |= WL_SOCKET_WRITEABLE;
 
 			/* Sleep until something happens or we time out */
-			(void) WaitLatchOrSocket(MyLatch, wakeEvents,
-									 MyProcPort->sock, sleeptime,
-									 WAIT_EVENT_WAL_SENDER_MAIN);
+			ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, wakeEvents, NULL);
+			(void) WaitEventSetWait(FeBeWaitSet, sleeptime, &event, 1,
+									WAIT_EVENT_WAL_SENDER_MAIN);
 		}
 	}
 }
-- 
2.20.1

#14Thomas Munro
thomas.munro@gmail.com
In reply to: Thomas Munro (#13)
2 attachment(s)
Re: Reducing WaitEventSet syscall churn

On Tue, Jan 5, 2021 at 6:10 PM Thomas Munro <thomas.munro@gmail.com> wrote:

For point 2, the question I am raising is: why should users get a
special FATAL message at some wait points but not others on postmaster
death? However, if people are attached to that behaviour, we could
still achieve goal 1 without goal 2 by handling PM death explicitly in
walsender.c, and I'd be happy to post an alternative patch set along
those lines.

Here's the alternative patch set, with no change to existing error
message behaviour. I'm going to commit this version and close this CF
item soon if there are no objections.
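
To spell out the difference: in the earlier series, FeBeWaitSet carried
WL_EXIT_ON_PM_DEATH and the generic exit machinery handled postmaster
death; in this version, the set keeps WL_POSTMASTER_DEATH and each wait
site reacts itself, so be-secure.c's FATAL message is untouched.
Condensed from the attached patches:

    /* v7 series: generic, silent exit handled inside latch.c. */
    AddWaitEventToSet(FeBeWaitSet, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
                      NULL, NULL);

    /* v8 series: walsender's wrapper checks for postmaster death
     * explicitly after each wait. */
    AddWaitEventToSet(FeBeWaitSet, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
                      NULL, NULL);
    ...
    if (WaitEventSetWait(FeBeWaitSet, timeout, &event, 1, wait_event) == 1 &&
        (event.events & WL_POSTMASTER_DEATH))
        proc_exit(1);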

Attachments:

v8-0001-Introduce-symbolic-names-for-FeBeWaitSet-position.patch (text/x-patch)
From 8f563edefdc2f276b3d8abeb019af1ff172bf7d9 Mon Sep 17 00:00:00 2001
From: Thomas Munro <tmunro@postgresql.org>
Date: Sat, 27 Feb 2021 13:34:37 +1300
Subject: [PATCH v8 1/2] Introduce symbolic names for FeBeWaitSet positions.

Previously we used 0 and 1 to refer to the socket and latch in far flung
parts of the tree, without any explanation.  Also use PGINVALID_SOCKET
rather than -1 in the place where we weren't already.

Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/libpq/be-secure.c     |  4 ++--
 src/backend/libpq/pqcomm.c        | 18 +++++++++++++++---
 src/backend/utils/init/miscinit.c |  6 ++++--
 src/include/libpq/libpq.h         |  3 +++
 4 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/src/backend/libpq/be-secure.c b/src/backend/libpq/be-secure.c
index d1545a2ad6..bb603ad209 100644
--- a/src/backend/libpq/be-secure.c
+++ b/src/backend/libpq/be-secure.c
@@ -180,7 +180,7 @@ retry:
 
 		Assert(waitfor);
 
-		ModifyWaitEvent(FeBeWaitSet, 0, waitfor, NULL);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, waitfor, NULL);
 
 		WaitEventSetWait(FeBeWaitSet, -1 /* no timeout */ , &event, 1,
 						 WAIT_EVENT_CLIENT_READ);
@@ -292,7 +292,7 @@ retry:
 
 		Assert(waitfor);
 
-		ModifyWaitEvent(FeBeWaitSet, 0, waitfor, NULL);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, waitfor, NULL);
 
 		WaitEventSetWait(FeBeWaitSet, -1 /* no timeout */ , &event, 1,
 						 WAIT_EVENT_CLIENT_WRITE);
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 1e6b6db540..27a298f110 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -191,6 +191,9 @@ WaitEventSet *FeBeWaitSet;
 void
 pq_init(void)
 {
+	int		socket_pos PG_USED_FOR_ASSERTS_ONLY;
+	int		latch_pos PG_USED_FOR_ASSERTS_ONLY;
+
 	/* initialize state variables */
 	PqSendBufferSize = PQ_SEND_BUFFER_SIZE;
 	PqSendBuffer = MemoryContextAlloc(TopMemoryContext, PqSendBufferSize);
@@ -219,10 +222,19 @@ pq_init(void)
 #endif
 
 	FeBeWaitSet = CreateWaitEventSet(TopMemoryContext, 3);
-	AddWaitEventToSet(FeBeWaitSet, WL_SOCKET_WRITEABLE, MyProcPort->sock,
+	socket_pos = AddWaitEventToSet(FeBeWaitSet, WL_SOCKET_WRITEABLE,
+								   MyProcPort->sock, NULL, NULL);
+	latch_pos = AddWaitEventToSet(FeBeWaitSet, WL_LATCH_SET, PGINVALID_SOCKET,
+								  MyLatch, NULL);
+	AddWaitEventToSet(FeBeWaitSet, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
 					  NULL, NULL);
-	AddWaitEventToSet(FeBeWaitSet, WL_LATCH_SET, -1, MyLatch, NULL);
-	AddWaitEventToSet(FeBeWaitSet, WL_POSTMASTER_DEATH, -1, NULL, NULL);
+
+	/*
+	 * The event positions match the order we added them, but let's sanity
+	 * check them to be sure.
+	 */
+	Assert(socket_pos == FeBeWaitSetSocketPos);
+	Assert(latch_pos == FeBeWaitSetLatchPos);
 }
 
 /* --------------------------------
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index 734c66d4e8..8de70eb46d 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -202,7 +202,8 @@ SwitchToSharedLatch(void)
 	MyLatch = &MyProc->procLatch;
 
 	if (FeBeWaitSet)
-		ModifyWaitEvent(FeBeWaitSet, 1, WL_LATCH_SET, MyLatch);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET,
+						MyLatch);
 
 	/*
 	 * Set the shared latch as the local one might have been set. This
@@ -221,7 +222,8 @@ SwitchBackToLocalLatch(void)
 	MyLatch = &LocalLatchData;
 
 	if (FeBeWaitSet)
-		ModifyWaitEvent(FeBeWaitSet, 1, WL_LATCH_SET, MyLatch);
+		ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetLatchPos, WL_LATCH_SET,
+						MyLatch);
 
 	SetLatch(MyLatch);
 }
diff --git a/src/include/libpq/libpq.h b/src/include/libpq/libpq.h
index b41b10620a..e4e5c21565 100644
--- a/src/include/libpq/libpq.h
+++ b/src/include/libpq/libpq.h
@@ -55,6 +55,9 @@ extern const PGDLLIMPORT PQcommMethods *PqCommMethods;
  */
 extern WaitEventSet *FeBeWaitSet;
 
+#define FeBeWaitSetSocketPos 0
+#define FeBeWaitSetLatchPos 1
+
 extern int	StreamServerPort(int family, const char *hostName,
 							 unsigned short portNumber, const char *unixSocketDir,
 							 pgsocket ListenSocket[], int MaxListen);
-- 
2.30.0

v8-0002-Use-FeBeWaitSet-for-walsender.c.patch (text/x-patch)
From 22b2c474b476ca91b579408f73ebdc014bb76083 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Mon, 24 Feb 2020 23:48:29 +1300
Subject: [PATCH v8 2/2] Use FeBeWaitSet for walsender.c.

This avoids the need to set up and tear down a new WaitEventSet every
time we wait.

Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Discussion: https://postgr.es/m/CA%2BhUKGJAC4Oqao%3DqforhNey20J8CiG2R%3DoBPqvfR0vOJrFysGw%40mail.gmail.com
---
 src/backend/replication/walsender.c | 43 +++++++++++++++++------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index 81244541e2..eb3f18ed48 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -244,6 +244,7 @@ static void WalSndKeepalive(bool requestReply);
 static void WalSndKeepaliveIfNecessary(void);
 static void WalSndCheckTimeOut(void);
 static long WalSndComputeSleeptime(TimestampTz now);
+static void WalSndWait(uint32 socket_events, long timeout, uint32 wait_event);
 static void WalSndPrepareWrite(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, bool last_write);
 static void WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, bool last_write);
 static void WalSndUpdateProgress(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid);
@@ -1287,7 +1288,6 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
 	/* If we have pending write here, go to slow path */
 	for (;;)
 	{
-		int			wakeEvents;
 		long		sleeptime;
 
 		/* Check for input from the client */
@@ -1304,13 +1304,9 @@ WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid,
 
 		sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp());
 
-		wakeEvents = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH |
-			WL_SOCKET_WRITEABLE | WL_SOCKET_READABLE | WL_TIMEOUT;
-
 		/* Sleep until something happens or we time out */
-		(void) WaitLatchOrSocket(MyLatch, wakeEvents,
-								 MyProcPort->sock, sleeptime,
-								 WAIT_EVENT_WAL_SENDER_WRITE_DATA);
+		WalSndWait(WL_SOCKET_WRITEABLE | WL_SOCKET_READABLE, sleeptime,
+				   WAIT_EVENT_WAL_SENDER_WRITE_DATA);
 
 		/* Clear any already-pending wakeups */
 		ResetLatch(MyLatch);
@@ -1480,15 +1476,12 @@ WalSndWaitForWal(XLogRecPtr loc)
 		 */
 		sleeptime = WalSndComputeSleeptime(GetCurrentTimestamp());
 
-		wakeEvents = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH |
-			WL_SOCKET_READABLE | WL_TIMEOUT;
+		wakeEvents = WL_SOCKET_READABLE;
 
 		if (pq_is_send_pending())
 			wakeEvents |= WL_SOCKET_WRITEABLE;
 
-		(void) WaitLatchOrSocket(MyLatch, wakeEvents,
-								 MyProcPort->sock, sleeptime,
-								 WAIT_EVENT_WAL_SENDER_WAIT_WAL);
+		WalSndWait(wakeEvents, sleeptime, WAIT_EVENT_WAL_SENDER_WAIT_WAL);
 	}
 
 	/* reactivate latch so WalSndLoop knows to continue */
@@ -2348,10 +2341,10 @@ WalSndLoop(WalSndSendDataCallback send_data)
 			long		sleeptime;
 			int			wakeEvents;
 
-			wakeEvents = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT;
-
 			if (!streamingDoneReceiving)
-				wakeEvents |= WL_SOCKET_READABLE;
+				wakeEvents = WL_SOCKET_READABLE;
+			else
+				wakeEvents = 0;
 
 			/*
 			 * Use fresh timestamp, not last_processing, to reduce the chance
@@ -2363,9 +2356,7 @@ WalSndLoop(WalSndSendDataCallback send_data)
 				wakeEvents |= WL_SOCKET_WRITEABLE;
 
 			/* Sleep until something happens or we time out */
-			(void) WaitLatchOrSocket(MyLatch, wakeEvents,
-									 MyProcPort->sock, sleeptime,
-									 WAIT_EVENT_WAL_SENDER_MAIN);
+			WalSndWait(wakeEvents, sleeptime, WAIT_EVENT_WAL_SENDER_MAIN);
 		}
 	}
 }
@@ -3121,6 +3112,22 @@ WalSndWakeup(void)
 	}
 }
 
+/*
+ * Wait for readiness on the FeBe socket, or a timeout.  The mask should be
+ * composed of optional WL_SOCKET_WRITEABLE and WL_SOCKET_READABLE flags.  Exit
+ * on postmaster death.
+ */
+static void
+WalSndWait(uint32 socket_events, long timeout, uint32 wait_event)
+{
+	WaitEvent event;
+
+	ModifyWaitEvent(FeBeWaitSet, FeBeWaitSetSocketPos, socket_events, NULL);
+	if (WaitEventSetWait(FeBeWaitSet, timeout, &event, 1, wait_event) == 1 &&
+		(event.events & WL_POSTMASTER_DEATH))
+		proc_exit(1);
+}
+
 /*
  * Signal all walsenders to move to stopping state.
  *
-- 
2.30.0

#15Thomas Munro
thomas.munro@gmail.com
In reply to: Thomas Munro (#14)
Re: Reducing WaitEventSet syscall churn

On Sat, Feb 27, 2021 at 2:48 PM Thomas Munro <thomas.munro@gmail.com> wrote:

Here's the alternative patch set, with no change to existing error
message behaviour. I'm going to commit this version and close this CF
item soon if there are no objections.

Pushed.

That leaves just walreceiver and postgres_fdw in need of WaitEventSet
improvements along these lines. The raw ingredients for that were
present in earlier patches, and I think Horiguchi-san had the right
idea: we should use the libpq event system to adjust our WES as
appropriate whenever libpq changes the socket underneath us. I will
leave this CF entry open a bit longer in case he would like to post an
updated version of that part (considering Tom's feedback[1]). If not,
we can close this and try again in the next cycle.
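
For anyone picking that up, the rough shape of the idea might be
something like the sketch below. To be clear, PQregisterEventProc(),
PGEVT_CONNRESET and PQsocket() are real libpq interfaces, but
UpdateWaitEventSocket() is a made-up name for whatever mechanism we'd
invent to repoint a socket event at a new fd, and conn/wes stand for
the connection and the reusable wait event set:

    #include "libpq-fe.h"

    /*
     * Keep a long-lived WaitEventSet's socket entry in sync with libpq
     * when the connection is reset and the fd may have changed.
     */
    static int
    wes_sync_callback(PGEventId evtId, void *evtInfo, void *passThrough)
    {
        WaitEventSet *wes = (WaitEventSet *) passThrough;

        if (evtId == PGEVT_CONNRESET)
        {
            PGEventConnReset *e = (PGEventConnReset *) evtInfo;

            /* Hypothetical helper: repoint the socket event. */
            UpdateWaitEventSocket(wes, 0 /* socket position */,
                                  PQsocket(e->conn));
        }
        return 1;               /* non-zero reports success */
    }

    /* Registered once, after the connection is established: */
    PQregisterEventProc(conn, wes_sync_callback, "wes-sync", wes);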

[1]: /messages/by-id/2446176.1594495351@sss.pgh.pa.us