From 1f8265b2971ae7bfc686aafb9d3fdeae2a0dcc1d Mon Sep 17 00:00:00 2001
From: Thomas Munro <tmunro@postgresql.org>
Date: Mon, 20 Feb 2023 23:26:36 +1300
Subject: [PATCH v4 2/5] Don't leak descriptors into subprograms.

Open long-lived data and WAL file descriptors with O_CLOEXEC.  This flag
was introduced by SUSv4 (POSIX.1-2008), and by now all of our target
Unix systems have it.  Our open() implementation for Windows already had
that behavior, so provide a dummy O_CLOEXEC flag on that platform.

Do the same for sockets and the postmaster pipe with FD_CLOEXEC.  Later
commits might use modern interfaces to remove these extra fcntl() calls
where possible, but we'll need them as a fallback for a couple of
systems, so do it that way in this initial commit.

With this change, subprograms executed for archiving, copying etc will
no longer have access to the server's descriptors, other than the ones
that we decide to pass down.

Reviewed-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/CA%2BhUKGKb6FsAdQWcRL35KJsftv%2B9zXqQbzwkfRf1i0J2e57%2BhQ%40mail.gmail.com
---
 src/backend/access/transam/xlog.c | 9 ++++++---
 src/backend/libpq/pqcomm.c        | 7 +++++++
 src/backend/storage/file/fd.c     | 8 ++++++++
 src/backend/utils/init/miscinit.c | 8 ++++++++
 src/include/port/win32_port.h     | 7 +++++++
 5 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index f9f0f6db8d..87af608d15 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2936,7 +2936,8 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 	 * Try to use existent file (checkpoint maker may have created it already)
 	 */
 	*added = false;
-	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
+	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
+					   get_sync_bit(sync_method));
 	if (fd < 0)
 	{
 		if (errno != ENOENT)
@@ -3097,7 +3098,8 @@ XLogFileInit(XLogSegNo logsegno, TimeLineID logtli)
 		return fd;
 
 	/* Now open original target segment (might not be file I just made) */
-	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
+	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
+					   get_sync_bit(sync_method));
 	if (fd < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
@@ -3328,7 +3330,8 @@ XLogFileOpen(XLogSegNo segno, TimeLineID tli)
 
 	XLogFilePath(path, tli, segno, wal_segment_size);
 
-	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
+	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | O_CLOEXEC |
+					   get_sync_bit(sync_method));
 	if (fd < 0)
 		ereport(PANIC,
 				(errcode_for_file_access(),
diff --git a/src/backend/libpq/pqcomm.c b/src/backend/libpq/pqcomm.c
index 864c9debe8..da5bb5fc5d 100644
--- a/src/backend/libpq/pqcomm.c
+++ b/src/backend/libpq/pqcomm.c
@@ -200,6 +200,13 @@ pq_init(void)
 				(errmsg("could not set socket to nonblocking mode: %m")));
 #endif
 
+#ifndef WIN32
+
+	/* Don't give the socket to any subprograms we execute. */
+	if (fcntl(MyProcPort->sock, F_SETFD, FD_CLOEXEC) < 0)
+		elog(FATAL, "fcntl(F_SETFD) failed on socket: %m");
+#endif
+
 	FeBeWaitSet = CreateWaitEventSet(TopMemoryContext, FeBeWaitSetNEvents);
 	socket_pos = AddWaitEventToSet(FeBeWaitSet, WL_SOCKET_WRITEABLE,
 								   MyProcPort->sock, NULL, NULL);
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index ea690f05c6..132a85c04b 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -1515,6 +1515,14 @@ PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
 	/* Close excess kernel FDs. */
 	ReleaseLruFiles();
 
+	/*
+	 * Descriptors managed by virtual descriptors are implicitly O_CLOEXEC.
+	 * The client shouldn't be expected to know which kernel descriptors are
+	 * currently open, so it wouldn't make sense for them to be inherited by
+	 * executed subprograms.
+	 */
+	fileFlags |= O_CLOEXEC;
+
 	vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
 
 	if (vfdP->fd < 0)
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index 59532bbd80..7eb7fe87f6 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -163,6 +163,14 @@ InitPostmasterChild(void)
 
 	/* Request a signal if the postmaster dies, if possible. */
 	PostmasterDeathSignalInit();
+
+	/* Don't give the pipe to subprograms that we execute. */
+#ifndef WIN32
+	if (fcntl(postmaster_alive_fds[POSTMASTER_FD_WATCH], F_SETFD, FD_CLOEXEC) < 0)
+		ereport(FATAL,
+				(errcode_for_socket_access(),
+				 errmsg_internal("could not set postmaster death monitoring pipe to FD_CLOEXEC mode: %m")));
+#endif
 }
 
 /*
diff --git a/src/include/port/win32_port.h b/src/include/port/win32_port.h
index 9488195799..5116c2fc06 100644
--- a/src/include/port/win32_port.h
+++ b/src/include/port/win32_port.h
@@ -353,6 +353,13 @@ extern int	_pglstat64(const char *name, struct stat *buf);
  */
 #define O_DSYNC 0x0080
 
+/*
+ * Our open() replacement does not create inheritable handles, so it is safe to
+ * ignore O_CLOEXEC.  (If we were using Windows' own open(), it might be
+ * necessary to convert this to _O_NOINHERIT.)
+ */
+#define O_CLOEXEC 0
+
 /*
  * Supplement to <errno.h>.
  *
-- 
2.39.1

