>From fe417866a7132b1ee65e2ed96f79fbaad7922435 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Thu, 14 Jan 2016 15:24:15 +0100
Subject: [PATCH 4/4] Support using epoll as the polling primitive in
 unix_latch.c.

epoll(2) has the advantage of being able to reuse the wait data structure
from previous calls when waiting the next time on the same events. That's
good for scalability, especially when waiting on a socket used by many
processes, like the postmaster_alive_fd.
---
 configure                     |   2 +-
 configure.in                  |   2 +-
 src/backend/port/unix_latch.c | 228 +++++++++++++++++++++++++++++++++++++++++-
 src/include/pg_config.h.in    |   3 +
 src/include/storage/latch.h   |   4 +
 5 files changed, 234 insertions(+), 5 deletions(-)

diff --git a/configure b/configure
index 3dd1b15..d65e0b4 100755
--- a/configure
+++ b/configure
@@ -10144,7 +10144,7 @@ fi
 ## Header files
 ##
 
-for ac_header in atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
+for ac_header in atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/epoll.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
 do :
   as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
 ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default"
diff --git a/configure.in b/configure.in
index 9398482..d24b7e8 100644
--- a/configure.in
+++ b/configure.in
@@ -1163,7 +1163,7 @@ AC_SUBST(UUID_LIBS)
 ##
 
 dnl sys/socket.h is required by AC_FUNC_ACCEPT_ARGTYPES
-AC_CHECK_HEADERS([atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
+AC_CHECK_HEADERS([atomic.h crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h mbarrier.h poll.h pwd.h sys/epoll.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
 
 # On BSD, test for net/if.h will fail unless sys/socket.h
 # is included first.
diff --git a/src/backend/port/unix_latch.c b/src/backend/port/unix_latch.c
index 03bca68..5e0edf6 100644
--- a/src/backend/port/unix_latch.c
+++ b/src/backend/port/unix_latch.c
@@ -38,6 +38,9 @@
 #include <unistd.h>
 #include <sys/time.h>
 #include <sys/types.h>
+#ifdef HAVE_SYS_EPOLL_H
+#include <sys/epoll.h>
+#endif
 #ifdef HAVE_POLL_H
 #include <poll.h>
 #endif
@@ -62,8 +65,10 @@
  * useful to manually specify the used primitive.  If desired, just add a
  * define somewhere before this block.
  */
-#if defined(LATCH_USE_POLL) || defined(LATCH_USE_SELECT)
+#if defined(LATCH_USE_EPOLL) || defined(LATCH_USE_POLL) || defined(LATCH_USE_SELECT)
 /* don't overwrite manual choice */
+#elif defined(HAVE_SYS_EPOLL_H)
+#define LATCH_USE_EPOLL
 #elif defined(HAVE_POLL)
 #define LATCH_USE_POLL
 #elif HAVE_SYS_SELECT_H
@@ -82,6 +87,9 @@ static int	selfpipe_writefd = -1;
 /* Private function prototypes */
 static void sendSelfPipeByte(void);
 static void drainSelfPipe(void);
+#ifdef LATCH_USE_EPOLL
+static void initEpoll(volatile Latch *latch);
+#endif
 
 
 /*
@@ -127,6 +135,10 @@ InitLatch(volatile Latch *latch)
 	latch->is_set = false;
 	latch->owner_pid = MyProcPid;
 	latch->is_shared = false;
+
+#ifdef LATCH_USE_EPOLL
+	initEpoll(latch);
+#endif
 }
 
 /*
@@ -174,6 +186,10 @@ OwnLatch(volatile Latch *latch)
 		elog(ERROR, "latch already owned");
 
 	latch->owner_pid = MyProcPid;
+
+#ifdef LATCH_USE_EPOLL
+	initEpoll(latch);
+#endif
 }
 
 /*
@@ -186,6 +202,14 @@ DisownLatch(volatile Latch *latch)
 	Assert(latch->owner_pid == MyProcPid);
 
 	latch->owner_pid = 0;
+
+#ifdef LATCH_USE_EPOLL
+	if (latch->epollfd >= 0)
+	{
+		close(latch->epollfd);
+		latch->epollfd = -1;
+	}
+#endif
 }
 
 /*
@@ -231,7 +255,9 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 				cur_time;
 	long		cur_timeout;
 
-#if defined(LATCH_USE_POLL)
+#if defined(LATCH_USE_EPOLL)
+	struct epoll_event events[1];
+#elif defined(LATCH_USE_POLL)
 	struct pollfd pfds[3];
 	int			nfds;
 #elif defined(LATCH_USE_SELECT)
@@ -311,7 +337,175 @@ WaitLatchOrSocket(volatile Latch *latch, int wakeEvents, pgsocket sock,
 		 * Must wait ... we use the polling interface determined at the top of
 		 * this file to do so.
 		 */
-#if defined(LATCH_USE_POLL)
+#if defined(LATCH_USE_EPOLL)
+		if (wakeEvents != latch->lastmask || latch->lastwatchfd != sock)
+		{
+			bool sockfd_changed = latch->lastwatchfd != sock;
+
+			if (latch->lastwatchfd != -1 && sockfd_changed)
+			{
+				struct epoll_event data;
+
+				/*
+				 * Pass an event struct although EPOLL_CTL_DEL ignores it;
+				 * kernels before 2.6.9 erroneously required a non-NULL pointer.
+				 */
+				rc = epoll_ctl(latch->epollfd, EPOLL_CTL_DEL,
+							   latch->lastwatchfd, &data);
+				if (rc < 0)
+				{
+					waiting = false;
+					ereport(ERROR,
+							(errcode_for_socket_access(),
+							 errmsg("epoll_ctl() failed: %m")));
+				}
+
+				latch->lastwatchfd = -1;
+			}
+
+			if (sock != -1 && sockfd_changed)
+			{
+				struct epoll_event data;
+				data.events = 0;
+				data.data.fd = sock;
+				rc = epoll_ctl(latch->epollfd, EPOLL_CTL_ADD, sock, &data);
+				if (rc < 0)
+				{
+					waiting = false;
+					ereport(ERROR,
+							(errcode_for_socket_access(),
+							 errmsg("epoll_ctl() failed: %m")));
+				}
+
+				latch->lastwatchfd = sock;
+			}
+
+			if (sock != -1 && (
+					sockfd_changed ||
+					(wakeEvents & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) !=
+					(latch->lastmask & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))))
+			{
+				struct epoll_event data;
+
+				data.events = EPOLLRDHUP | EPOLLERR | EPOLLHUP;
+				data.data.fd = sock;
+
+				if (wakeEvents & WL_SOCKET_READABLE)
+					data.events |= EPOLLIN;
+				if (wakeEvents & WL_SOCKET_WRITEABLE)
+					data.events |= EPOLLOUT;
+
+				rc = epoll_ctl(latch->epollfd, EPOLL_CTL_MOD, sock, &data);
+				if (rc < 0)
+				{
+					waiting = false;
+					ereport(ERROR,
+							(errcode_for_socket_access(),
+							 errmsg("epoll_ctl() failed: %m")));
+				}
+			}
+
+			if ((latch->lastmask & WL_POSTMASTER_DEATH) &&
+				!(wakeEvents & WL_POSTMASTER_DEATH))
+			{
+				struct epoll_event data;
+
+				/*
+				 * Pass an event struct although EPOLL_CTL_DEL ignores it;
+				 * kernels before 2.6.9 erroneously required a non-NULL pointer.
+				 */
+				rc = epoll_ctl(latch->epollfd, EPOLL_CTL_DEL,
+							   postmaster_alive_fds[POSTMASTER_FD_WATCH],
+							   &data);
+				if (rc < 0)
+				{
+					waiting = false;
+					ereport(ERROR,
+							(errcode_for_socket_access(),
+							 errmsg("epoll_ctl() failed: %m")));
+				}
+			}
+
+
+			if (!(latch->lastmask & WL_POSTMASTER_DEATH) &&
+				(wakeEvents & WL_POSTMASTER_DEATH))
+			{
+				struct epoll_event data;
+
+				data.events = EPOLLIN | EPOLLHUP | EPOLLRDHUP | EPOLLERR;
+				data.data.fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
+
+				rc = epoll_ctl(latch->epollfd, EPOLL_CTL_ADD,
+							   postmaster_alive_fds[POSTMASTER_FD_WATCH],
+							   &data);
+				if (rc < 0)
+				{
+					waiting = false;
+					ereport(ERROR,
+							(errcode_for_socket_access(),
+							 errmsg("epoll_ctl() failed: %m")));
+				}
+			}
+
+			latch->lastmask = wakeEvents;
+		}
+
+		rc = epoll_wait(latch->epollfd, events, 1, cur_timeout);
+		if (rc < 0)
+		{
+			/* EINTR is okay, otherwise complain */
+			if (errno != EINTR)
+			{
+				waiting = false;
+				ereport(ERROR,
+						(errcode_for_socket_access(),
+						 errmsg("epoll_wait() failed: %m")));
+			}
+		}
+		else if (rc == 0)
+		{
+			/* timeout exceeded */
+			if (wakeEvents & WL_TIMEOUT)
+				result |= WL_TIMEOUT;
+		}
+		else
+		{
+			if (events[0].data.fd == sock)
+			{
+				/* data available in socket */
+				if (events[0].events & EPOLLIN)
+					result |= WL_SOCKET_READABLE;
+
+				/* socket is writable */
+				if (events[0].events & EPOLLOUT)
+					result |= WL_SOCKET_WRITEABLE;
+
+				/* EOF/error condition */
+				if (events[0].events & (EPOLLERR | EPOLLHUP | EPOLLRDHUP))
+				{
+					if (wakeEvents & WL_SOCKET_READABLE)
+						result |= WL_SOCKET_READABLE;
+					if (wakeEvents & WL_SOCKET_WRITEABLE)
+						result |= WL_SOCKET_WRITEABLE;
+				}
+			}
+
+			if (events[0].data.fd == postmaster_alive_fds[POSTMASTER_FD_WATCH] &&
+				events[0].events & (EPOLLIN | EPOLLHUP | EPOLLERR | EPOLLRDHUP))
+			{
+				/* check comment for the corresponding LATCH_USE_POLL case */
+				Assert(!PostmasterIsAlive());
+				result |= WL_POSTMASTER_DEATH;
+			}
+
+			if (events[0].data.fd == selfpipe_readfd &&
+				events[0].events & EPOLLIN)
+			{
+				/* There's data in the self-pipe, clear it. */
+				drainSelfPipe();
+			}
+		}
+#elif defined(LATCH_USE_POLL)
 		nfds = 0;
 
 		/* selfpipe is always in pfds[0] */
@@ -725,3 +919,31 @@ drainSelfPipe(void)
 		/* else buffer wasn't big enough, so read again */
 	}
 }
+
+#ifdef LATCH_USE_EPOLL
+/*
+ * Create the epoll instance used to wait on 'latch' and register the read end
+ * of the self-pipe with it; failures are reported via elog(FATAL).  Needs to
+ * be called whenever owning a latch, be it a shared or a backend-local one.
+ * Afterwards no socket is watched and no wake events are remembered.
+ */
+static void
+initEpoll(volatile Latch *latch)
+{
+	struct epoll_event data;
+
+	/* close-on-exec: keep the fd from leaking into programs we spawn */
+	latch->epollfd = epoll_create1(EPOLL_CLOEXEC);
+	if (latch->epollfd < 0)
+		elog(FATAL, "epoll_create1 failed: %m");
+
+	/* always want to be notified of writes into the self-pipe */
+	data.events = EPOLLIN;
+	data.data.fd = selfpipe_readfd;
+	if (epoll_ctl(latch->epollfd, EPOLL_CTL_ADD, selfpipe_readfd, &data) < 0)
+		elog(FATAL, "epoll_ctl failed: %m");
+
+	latch->lastwatchfd = -1;
+	latch->lastmask = 0;
+}
+#endif
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 16a272e..0fc4ce2 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -530,6 +530,9 @@
 /* Define to 1 if you have the syslog interface. */
 #undef HAVE_SYSLOG
 
+/* Define to 1 if you have the <sys/epoll.h> header file. */
+#undef HAVE_SYS_EPOLL_H
+
 /* Define to 1 if you have the <sys/ioctl.h> header file. */
 #undef HAVE_SYS_IOCTL_H
 
diff --git a/src/include/storage/latch.h b/src/include/storage/latch.h
index e77491e..3666352 100644
--- a/src/include/storage/latch.h
+++ b/src/include/storage/latch.h
@@ -92,6 +92,10 @@ typedef struct Latch
 	int			owner_pid;
 #ifdef WIN32
 	HANDLE		event;
+#elif defined(HAVE_SYS_EPOLL_H)
+	int			epollfd;
+	int			lastwatchfd;
+	int			lastmask;
 #endif
 } Latch;
 
-- 
2.5.0.400.gff86faf.dirty

