>From f6f3be360d1af1f9d83d4b5a522defa87bfda346 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Tue, 13 Jan 2015 11:58:58 +0100
Subject: [PATCH 10/10] WIP: unix_latch.c: efficiency hackery

Hack 1) Use eventfds on linux when available

        This provides a measurable performance advantage on Linux, due
        to avoidance of the pipe buffer.

        I think we should do this, but this isn't a finished patch.

Hack 2) Store an fd in the latch, and use that instead of signals + self-pipe byte for wakeups

        This requires a fair amount (NumProcSignalSlots) of fds, and
        only works for slots initialized in the postmaster.

        The (not really proven) performance benefits come from
        avoiding round trips through the kernel, as only one write to
        an fd is needed, instead of a signal + a write to the self-pipe.
---
 src/backend/port/unix_latch.c   | 166 +++++++++++++++++++++++++++++++++-------
 src/backend/storage/lmgr/proc.c |   2 +-
 src/include/storage/latch.h     |   2 +
 3 files changed, 140 insertions(+), 30 deletions(-)

diff --git a/src/backend/port/unix_latch.c b/src/backend/port/unix_latch.c
index 147e22c..cf5f9fe 100644
--- a/src/backend/port/unix_latch.c
+++ b/src/backend/port/unix_latch.c
@@ -32,10 +32,15 @@
  */
 #include "postgres.h"
 
+#define HAVE_EVENTFD
+
 #include <fcntl.h>
 #include <limits.h>
 #include <signal.h>
 #include <unistd.h>
+#ifdef HAVE_EVENTFD
+#include <sys/eventfd.h>
+#endif
 #include <sys/time.h>
 #include <sys/types.h>
 #ifdef HAVE_POLL_H
@@ -62,10 +67,13 @@ static volatile sig_atomic_t waiting = false;
 /* Read and write ends of the self-pipe */
 static int	selfpipe_readfd = -1;
 static int	selfpipe_writefd = -1;
+#if defined(HAVE_EVENTFD) && defined(EFD_NONBLOCK)
+static int	selfpipe_eventfd = -1;
+#endif
 
 /* Private function prototypes */
 static void sendSelfPipeByte(void);
-static void drainSelfPipe(void);
+static void drainSelfPipe(volatile Latch *latch);
 
 
 /*
@@ -77,26 +85,45 @@ static void drainSelfPipe(void);
 void
 InitializeLatchSupport(void)
 {
-	int			pipefd[2];
-
 	Assert(selfpipe_readfd == -1);
 
-	/*
-	 * Set up the self-pipe that allows a signal handler to wake up the
-	 * select() in WaitLatch. Make the write-end non-blocking, so that
-	 * SetLatch won't block if the event has already been set many times
-	 * filling the kernel buffer. Make the read-end non-blocking too, so that
-	 * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
-	 */
-	if (pipe(pipefd) < 0)
-		elog(FATAL, "pipe() failed: %m");
-	if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) < 0)
-		elog(FATAL, "fcntl() failed on read-end of self-pipe: %m");
-	if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) < 0)
-		elog(FATAL, "fcntl() failed on write-end of self-pipe: %m");
-
-	selfpipe_readfd = pipefd[0];
-	selfpipe_writefd = pipefd[1];
+#if defined(HAVE_EVENTFD) && defined(EFD_NONBLOCK)
+	do
+	{
+		selfpipe_eventfd = eventfd(0, EFD_NONBLOCK);
+		if (selfpipe_eventfd < 0)
+		{
+			elog(DEBUG1, "eventfd() failed: %m");
+			break;
+		}
+		selfpipe_readfd = selfpipe_eventfd;
+		selfpipe_writefd = selfpipe_eventfd;
+
+		return;
+	}
+	while (0);
+#endif
+	{
+		int			pipefd[2];
+
+		/*
+		 * Set up the self-pipe that allows a signal handler to wake up the
+		 * select() in WaitLatch. Make the write-end non-blocking, so that
+		 * SetLatch won't block if the event has already been set many times
+		 * filling the kernel buffer. Make the read-end non-blocking too, so
+		 * that we can easily clear the pipe by reading until EAGAIN or
+		 * EWOULDBLOCK.
+		 */
+		if (pipe(pipefd) < 0)
+			elog(FATAL, "pipe() failed: %m");
+		if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) < 0)
+			elog(FATAL, "fcntl() failed on read-end of self-pipe: %m");
+		if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) < 0)
+			elog(FATAL, "fcntl() failed on write-end of self-pipe: %m");
+
+		selfpipe_readfd = pipefd[0];
+		selfpipe_writefd = pipefd[1];
+	}
 }
 
 /*
@@ -111,6 +138,7 @@ InitLatch(volatile Latch *latch)
 	latch->is_set = false;
 	latch->owner_pid = MyProcPid;
 	latch->is_shared = false;
+	latch->owner_fd = -1;
 }
 
 /*
@@ -130,6 +158,24 @@ InitSharedLatch(volatile Latch *latch)
 	latch->is_set = false;
 	latch->owner_pid = 0;
 	latch->is_shared = true;
+	latch->owner_fd = -1;
+}
+
+void
+InitSharedLatchPostmaster(volatile Latch *latch)
+{
+	latch->is_set = false;
+	latch->owner_pid = 0;
+	latch->is_shared = true;
+	latch->owner_fd = -1;
+
+#if defined(HAVE_EVENTFD) && defined(EFD_NONBLOCK)
+	latch->owner_fd = eventfd(0, EFD_NONBLOCK);
+	if (latch->owner_fd < 0)
+	{
+		elog(DEBUG1, "eventfd() failed: %m");
+	}
+#endif
 }
 
 /*
@@ -218,6 +264,7 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 #ifdef HAVE_POLL
 	struct pollfd pfds[3];
 	int			nfds;
+	int			selfd = -1;
 #else
 	struct timeval tv,
 			   *tvp;
@@ -277,8 +324,6 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 		 * with weak memory ordering, so that we cannot miss seeing is_set if
 		 * the signal byte is already in the pipe when we drain it.
 		 */
-		drainSelfPipe();
-
 		if ((wakeEvents & WL_LATCH_SET) && latch->is_set)
 		{
 			result |= WL_LATCH_SET;
@@ -315,10 +360,17 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 			nfds++;
 		}
 
-		pfds[nfds].fd = selfpipe_readfd;
-		pfds[nfds].events = POLLIN;
-		pfds[nfds].revents = 0;
-		nfds++;
+		if (wakeEvents & WL_LATCH_SET)
+		{
+			if (latch->owner_fd != -1)
+				pfds[nfds].fd = latch->owner_fd;
+			else
+				pfds[nfds].fd = selfpipe_readfd;
+			pfds[nfds].events = POLLIN;
+			pfds[nfds].revents = 0;
+			selfd = nfds;
+			nfds++;
+		}
 
 		if (wakeEvents & WL_POSTMASTER_DEATH)
 		{
@@ -395,6 +447,13 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 				if (!PostmasterIsAlive())
 					result |= WL_POSTMASTER_DEATH;
 			}
+
+			/* clear data from selfpipe */
+			if ((wakeEvents & WL_LATCH_SET) && (pfds[selfd].revents & POLLIN))
+			{
+				Assert(selfd != -1);
+				drainSelfPipe(latch);
+			}
 		}
 #else							/* !HAVE_POLL */
 
@@ -552,11 +611,36 @@ SetLatch(volatile Latch *latch)
 	owner_pid = latch->owner_pid;
 	if (owner_pid == 0)
 		return;
+#if defined(HAVE_EVENTFD) && defined(EFD_NONBLOCK)
+	else if (latch->owner_fd != -1)
+	{
+		int			rc;
+		int64		count = 1;
+retry:
+		rc = write(latch->owner_fd, &count, sizeof(count));
+
+		if (rc < 0)
+		{
+			if (errno == EINTR)
+				goto retry;
+			else if (errno == EAGAIN && errno == EWOULDBLOCK)
+			{
+				/* counter is full, no need to do anything */
+			}
+			/* XXX: odd, should we warn instead? */
+			Assert(false);
+			return;
+		}
+	}
+#endif
 	else if (owner_pid == MyProcPid)
 	{
+		Assert(latch->owner_fd == -1);
+
 		if (waiting)
 			sendSelfPipeByte();
 	}
+
 	else
 		kill(owner_pid, SIGUSR1);
 }
@@ -604,10 +688,21 @@ static void
 sendSelfPipeByte(void)
 {
 	int			rc;
-	char		dummy = 0;
 
 retry:
-	rc = write(selfpipe_writefd, &dummy, 1);
+#if defined(HAVE_EVENTFD) && defined(EFD_NONBLOCK)
+	if (selfpipe_eventfd != -1)
+	{
+		int64		count = 1;
+		rc = write(selfpipe_writefd, &count, sizeof(count));
+	}
+	else
+#endif
+	{
+		char		dummy = 0;
+		rc = write(selfpipe_writefd, &dummy, 1);
+	}
+
 	if (rc < 0)
 	{
 		/* If interrupted by signal, just retry */
@@ -638,7 +733,7 @@ retry:
  * happen).
  */
 static void
-drainSelfPipe(void)
+drainSelfPipe(volatile Latch *latch)
 {
 	/*
 	 * There shouldn't normally be more than one byte in the pipe, or maybe a
@@ -646,10 +741,23 @@ drainSelfPipe(void)
 	 */
 	char		buf[16];
 	int			rc;
+	int			fd;
+
+	if (latch->owner_fd != -1)
+		fd = latch->owner_fd;
+	else
+		fd = selfpipe_readfd;
 
 	for (;;)
 	{
-		rc = read(selfpipe_readfd, buf, sizeof(buf));
+		/*
+		 * If eventfd is being used, this will always return 8 bytes
+		 * (containing the number of events) as being readable. That means
+		 * we'll always recognize it as the pipe being drained. Not pretty,
+		 * but beats repeating the code (perhaps).
+		 */
+		rc = read(fd, buf, sizeof(buf));
+
 		if (rc < 0)
 		{
 			if (errno == EAGAIN || errno == EWOULDBLOCK)
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index d97c244..0e7ee87 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -224,7 +224,7 @@ InitProcGlobal(void)
 		if (i < MaxBackends + NUM_AUXILIARY_PROCS)
 		{
 			PGSemaphoreCreate(&(procs[i].sem));
-			InitSharedLatch(&(procs[i].procLatch));
+			InitSharedLatchPostmaster(&(procs[i].procLatch));
 			procs[i].backendLock = LWLockAssign();
 		}
 		procs[i].pgprocno = i;
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index 28fc684..a4a0ba4 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -90,6 +90,7 @@ typedef struct Latch
 	sig_atomic_t is_set;
 	bool		is_shared;
 	int			owner_pid;
+	int			owner_fd;
 #ifdef WIN32
 	HANDLE		event;
 #endif
@@ -108,6 +109,7 @@ typedef struct Latch
 extern void InitializeLatchSupport(void);
 extern void InitLatch(volatile Latch *latch);
 extern void InitSharedLatch(volatile Latch *latch);
+extern void InitSharedLatchPostmaster(volatile Latch *latch);
 extern void OwnLatch(volatile Latch *latch);
 extern void DisownLatch(volatile Latch *latch);
 extern int	WaitLatch(volatile Latch *latch, int wakeEvents, long timeout);
-- 
2.0.0.rc2.4.g1dc51c6.dirty

