From 8e3c7cbf49c3710226c177d3259c4ef9b70bc048 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Mon, 28 Oct 2024 18:03:53 -0400
Subject: [PATCH 12/12] WIP: bufmgr: Detect some bad buffer accesses

I wrote this mainly to ensure that I did not miss converting any hint bit sets
to BufferPrepareToSetHintBits(), but it seems like it might be more generally
useful.

If we do want to include it, it needs a bit more polishing.

On my workstation, the performance effect of this test infrastructure is as
follows:

base:
real	1m4.613s
user	4m31.409s
sys	3m20.445s

ENFORCE_BUFFER_PROT

real	1m11.912s
user	4m27.332s
sys	3m28.063s

ENFORCE_BUFFER_PROT + ENFORCE_BUFFER_PROT_READ
real	1m33.455s
user	4m32.188s
sys	3m41.275s

Author:
Reviewed-by:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/backend/storage/aio/method_worker.c |  11 ++
 src/backend/storage/buffer/buf_init.c   |  10 ++
 src/backend/storage/buffer/bufmgr.c     | 183 ++++++++++++++++++++++--
 src/backend/utils/misc/guc_tables.c     |   2 +-
 src/include/pg_config_manual.h          |  24 ++++
 src/include/storage/buf_internals.h     |   9 ++
 src/include/storage/pg_shmem.h          |   2 +
 7 files changed, 229 insertions(+), 12 deletions(-)

diff --git a/src/backend/storage/aio/method_worker.c b/src/backend/storage/aio/method_worker.c
index 4a7853d13fa..7442ade2608 100644
--- a/src/backend/storage/aio/method_worker.c
+++ b/src/backend/storage/aio/method_worker.c
@@ -388,6 +388,17 @@ IoWorkerMain(const void *startup_data, size_t startup_data_len)
 	sprintf(cmd, "%d", MyIoWorkerId);
 	set_ps_display(cmd);
 
+	/*
+	 * Allow access to all buffers.  We assume that backends will not request
+	 * I/Os on shared buffers without holding proper locks.
+	 */
+#if defined(ENFORCE_BUFFER_PROT) && defined(ENFORCE_BUFFER_PROT_READ)
+	for (int i = 0; i < NBuffers; i++)
+	{
+		SetBufferProtection(i + 1, true, true);
+	}
+#endif
+
 	/* see PostgresMain() */
 	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
 	{
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index ed1dc488a42..e5e22cfc59d 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -138,6 +138,16 @@ BufferManagerShmemInit(void)
 							 LWTRANCHE_BUFFER_CONTENT);
 
 			ConditionVariableInit(BufferDescriptorGetIOCV(buf));
+
+			/*
+			 * Unused buffers are inaccessible.  But if we're not enforcing
+			 * that buffers be inaccessible without a pin, we won't mark
+			 * buffers as accessible during pinning, so we had better not
+			 * make them initially inaccessible.
+			 */
+#if defined(ENFORCE_BUFFER_PROT) && defined(ENFORCE_BUFFER_PROT_READ)
+			SetBufferProtection(i + 1, false, false);
+#endif
 		}
 
 		/* Correct last entry of linked list */
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index e619ebd38ff..d680b57bcb7 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -37,6 +37,10 @@
 #include <sys/file.h>
 #include <unistd.h>
 
+#ifdef ENFORCE_BUFFER_PROT
+#include <sys/mman.h>			/* for mprotect() */
+#endif							/* ENFORCE_BUFFER_PROT */
+
 #include "access/tableam.h"
 #include "access/xloginsert.h"
 #include "access/xlogutils.h"
@@ -54,6 +58,7 @@
 #include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/lmgr.h"
+#include "storage/pg_shmem.h"
 #include "storage/proc.h"
 #include "storage/read_stream.h"
 #include "storage/smgr.h"
@@ -1074,8 +1079,6 @@ ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
 
 	if (need_to_zero)
 	{
-		memset(BufferGetPage(buffer), 0, BLCKSZ);
-
 		/*
 		 * Grab the buffer content lock before marking the page as valid, to
 		 * make sure that no other backend sees the zeroed page before the
@@ -1088,7 +1091,12 @@ ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
 		 * already valid.)
 		 */
 		if (!isLocalBuf)
+		{
 			LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
+			SetBufferProtection(buffer, true, true);
+		}
+
+		memset(BufferGetPage(buffer), 0, BLCKSZ);
 
 		/* Set BM_VALID, terminate IO, and wake up any waiters */
 		if (isLocalBuf)
@@ -1922,6 +1930,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
 		io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
 		io_buffers_len = 1;
 
+		if (persistence != RELPERSISTENCE_TEMP)
+			SetBufferProtection(io_buffers[0], true, true);
+
 		/*
 		 * How many neighboring-on-disk blocks can we scatter-read into other
 		 * buffers at the same time?  In this case we don't wait if we see an
@@ -1937,7 +1948,16 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
 				   BufferGetBlockNumber(buffers[i]) - 1);
 			Assert(io_buffers[io_buffers_len] == buffers[i]);
 
-			io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
+			io_pages[io_buffers_len] = BufferGetBlock(buffers[i]);
+
+			/*
+			 * If the I/O is performed synchronously, allow the synchronous
+			 * read to modify the buffer.
+			 */
+			if (persistence != RELPERSISTENCE_TEMP)
+				SetBufferProtection(io_buffers[io_buffers_len], true, true);
+
+			io_buffers_len++;
 		}
 
 		/* get a reference to wait for in WaitReadBuffers() */
@@ -1975,6 +1995,16 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
 		else
 			pgBufferUsage.shared_blks_read += io_buffers_len;
 
+		/*
+		 * Disallow writes for this process again.  If the I/O is performed
+		 * asynchronously, the I/O worker will allow the write for itself.
+		 */
+		if (persistence != RELPERSISTENCE_TEMP)
+		{
+			for (int i = 0; i < io_buffers_len; i++)
+				SetBufferProtection(io_buffers[i], true, false);
+		}
+
 		/*
 		 * Track vacuum cost when issuing IO, not after waiting for it.
 		 * Otherwise we could end up issuing a lot of IO in a short timespan,
@@ -2649,7 +2679,9 @@ ExtendBufferedRelShared(BufferManagerRelation bmr,
 		buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
 
 		/* new buffers are zero-filled */
+		SetBufferProtection(buffers[i], true, true);
 		MemSet(buf_block, 0, BLCKSZ);
+		SetBufferProtection(buffers[i], true, false);
 	}
 
 	/*
@@ -2877,7 +2909,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr,
 		}
 
 		if (lock)
-			LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
+			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 
 		TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
 	}
@@ -3136,6 +3168,10 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
 				 * non-accessible in any case.
 				 */
 				VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
+
+#if defined(ENFORCE_BUFFER_PROT) && defined(ENFORCE_BUFFER_PROT_READ)
+				SetBufferProtection(BufferDescriptorGetBuffer(buf), true, false);
+#endif
 				break;
 			}
 		}
@@ -3208,6 +3244,10 @@ PinBuffer_Locked(BufferDesc *buf)
 	 */
 	VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
 
+#if defined(ENFORCE_BUFFER_PROT) && defined(ENFORCE_BUFFER_PROT_READ)
+	SetBufferProtection(BufferDescriptorGetBuffer(buf), true, false);
+#endif
+
 	/*
 	 * Since we hold the buffer spinlock, we can update the buffer state and
 	 * release the lock in one operation.
@@ -3305,6 +3345,10 @@ UnpinBufferNoOwner(BufferDesc *buf)
 		 */
 		VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
 
+#if defined(ENFORCE_BUFFER_PROT) && defined(ENFORCE_BUFFER_PROT_READ)
+		SetBufferProtection(BufferDescriptorGetBuffer(buf), false, false);
+#endif
+
 		/* I'd better not still hold the buffer content lock */
 		Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
 
@@ -4113,6 +4157,69 @@ CheckForBufferLeaks(void)
 #endif
 }
 
+/*
+ * To verify that we are following buffer locking rules, we can make pages
+ * inaccessible or read-only when we don't have sufficient locks etc.
+ *
+ * XXX: It might be possible to fold the VALGRIND_MAKE_MEM_NOACCESS() /
+ * VALGRIND_MAKE_MEM_DEFINED() calls into this.
+ */
+#ifdef ENFORCE_BUFFER_PROT
+void
+SetBufferProtection(Buffer buf, bool allow_reads, bool allow_writes)
+{
+	static long	pagesz = 0;
+	int			prot = PROT_NONE;
+	int			rc;
+
+	Assert(huge_pages_status != HUGE_PAGES_UNKNOWN &&
+		   huge_pages_status != HUGE_PAGES_TRY);
+	StaticAssertStmt(PROT_NONE == 0, "that can't be right");
+
+	if (unlikely(pagesz == 0))
+	{
+		pagesz = sysconf(_SC_PAGESIZE);
+
+		elog(DEBUG1, "sysconf(_SC_PAGESIZE) = %ld", pagesz);
+		if (pagesz == -1)
+		{
+			elog(ERROR, "sysconf(_SC_PAGESIZE) failed: %m");
+		}
+		else if (pagesz > BLCKSZ)
+		{
+			elog(DEBUG1, "pagesz > BLCKSZ, disabling buffer protection mode");
+			pagesz = -1;
+		}
+		else if(BLCKSZ % pagesz != 0)
+		{
+			elog(DEBUG1, "BLCKSZ %% pagesz != 0, disabling buffer protection mode");
+			pagesz = -1;
+		}
+		else if (huge_pages_status == HUGE_PAGES_ON)
+		{
+			/* can't set protection in a granular enough way */
+			elog(DEBUG1, "huge pages enabled, disabling buffer protection mode");
+			pagesz = -1;
+		}
+	}
+
+	/* disabled */
+	if (pagesz == -1)
+		return;
+
+	if (allow_reads)
+		prot |= PROT_READ;
+
+	if (allow_writes)
+		prot |= PROT_WRITE;
+
+	rc = mprotect(BufferGetBlock(buf), BLCKSZ, prot);
+
+	if (rc != 0)
+		elog(ERROR, "mprotect(%d, %d) failed: %m", buf, prot);
+}
+#endif							/* ENFORCE_BUFFER_PROT */
+
 /*
  * Helper routine to issue warnings when a buffer is unexpectedly pinned
  */
@@ -4307,7 +4414,10 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
 	bufBlock = BufHdrGetBlock(buf);
 
 	/* Update page checksum if desired. */
+	SetBufferProtection(BufferDescriptorGetBuffer(buf), true, true);
 	PageSetChecksum((Page) bufBlock, buf->tag.blockNum);
+	/* FIXME: could theoretically be exclusively locked */
+	SetBufferProtection(BufferDescriptorGetBuffer(buf), true, false);
 
 	io_start = pgstat_prepare_io_time(track_io_timing);
 
@@ -5563,19 +5673,38 @@ void
 LockBuffer(Buffer buffer, int mode)
 {
 	BufferDesc *buf;
+	LWLock	   *content_lock;
 
 	Assert(BufferIsPinned(buffer));
 	if (BufferIsLocal(buffer))
 		return;					/* local buffers need no lock */
 
 	buf = GetBufferDescriptor(buffer - 1);
+	content_lock = BufferDescriptorGetContentLock(buf);
 
 	if (mode == BUFFER_LOCK_UNLOCK)
-		LWLockRelease(BufferDescriptorGetContentLock(buf));
+	{
+#ifdef ENFORCE_BUFFER_PROT
+		bool		was_exclusive;
+
+		was_exclusive = LWLockHeldByMeInMode(content_lock, LW_EXCLUSIVE);
+#endif							/* ENFORCE_BUFFER_PROT */
+
+		LWLockRelease(content_lock);
+
+#ifdef ENFORCE_BUFFER_PROT
+		if (was_exclusive)
+			SetBufferProtection(buffer, true, false);
+#endif							/* ENFORCE_BUFFER_PROT */
+	}
 	else if (mode == BUFFER_LOCK_SHARE)
-		LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
+		LWLockAcquire(content_lock, LW_SHARED);
 	else if (mode == BUFFER_LOCK_EXCLUSIVE)
-		LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
+	{
+		LWLockAcquire(content_lock, LW_EXCLUSIVE);
+
+		SetBufferProtection(buffer, true, true);
+	}
 	else
 		elog(ERROR, "unrecognized buffer lock mode: %d", mode);
 }
@@ -5589,6 +5718,7 @@ bool
 ConditionalLockBuffer(Buffer buffer)
 {
 	BufferDesc *buf;
+	bool		ret;
 
 	Assert(BufferIsPinned(buffer));
 	if (BufferIsLocal(buffer))
@@ -5596,8 +5726,13 @@ ConditionalLockBuffer(Buffer buffer)
 
 	buf = GetBufferDescriptor(buffer - 1);
 
-	return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
-									LW_EXCLUSIVE);
+	ret = LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
+								   LW_EXCLUSIVE);
+
+	if (ret)
+		SetBufferProtection(buffer, true, true);
+
+	return ret;
 }
 
 /*
@@ -6732,6 +6867,11 @@ BufferPrepareToSetHintBits(Buffer buffer)
 		{
 			ResourceOwnerRememberBufferSettingHints(CurrentResourceOwner, buffer);
 
+#ifdef ENFORCE_BUFFER_PROT
+			if (!LWLockHeldByMeInMode(BufferDescriptorGetContentLock(desc), LW_EXCLUSIVE))
+				SetBufferProtection(buffer, true, true);
+#endif							/* ENFORCE_BUFFER_PROT */
+
 			return true;
 		}
 	}
@@ -6807,6 +6947,11 @@ BufferFinishSetHintBits(Buffer buffer, bool mark_dirty, bool buffer_std)
 	 * know if there is somebody waiting.
 	 */
 	ConditionVariableBroadcast(BufferDescriptorGetIOCV(desc));
+
+#ifdef ENFORCE_BUFFER_PROT
+	if (!LWLockHeldByMeInMode(BufferDescriptorGetContentLock(desc), LW_EXCLUSIVE))
+		SetBufferProtection(buffer, true, false);
+#endif							/* ENFORCE_BUFFER_PROT */
 }
 
 /*
@@ -6853,14 +6998,25 @@ BufferSetHintBits16(Buffer buffer, uint16 *ptr, uint16 val)
 	}
 	else
 	{
+#ifdef ENFORCE_BUFFER_PROT
+		bool exclusive;
+#endif
+
 		desc = GetBufferDescriptor(buffer - 1);
+
+#ifdef ENFORCE_BUFFER_PROT
+		exclusive = LWLockHeldByMeInMode(BufferDescriptorGetContentLock(desc),
+										 LW_EXCLUSIVE);
+
+		if (!exclusive)
+			SetBufferProtection(buffer, true, true);
+#endif
+
 		buf_state = LockBufHdr(desc);
 
 		if (likely((buf_state & BM_IO_IN_PROGRESS) == 0))
 		{
-
 			*ptr = val;
-
 			did_set = true;
 		}
 
@@ -6869,6 +7025,11 @@ BufferSetHintBits16(Buffer buffer, uint16 *ptr, uint16 val)
 		is_dirty = (buf_state & (BM_DIRTY | BM_JUST_DIRTIED)) == (BM_DIRTY | BM_JUST_DIRTIED);
 		if (did_set && !is_dirty)
 			MarkBufferDirtyHintImpl(buffer, true, true);
+
+#ifdef ENFORCE_BUFFER_PROT
+		if (!exclusive)
+			SetBufferProtection(buffer, true, false);
+#endif
 	}
 }
 
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 4eaeca89f2c..ea8d796e7c4 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -566,7 +566,7 @@ static int	ssl_renegotiation_limit;
  */
 int			huge_pages = HUGE_PAGES_TRY;
 int			huge_page_size;
-static int	huge_pages_status = HUGE_PAGES_UNKNOWN;
+int			huge_pages_status = HUGE_PAGES_UNKNOWN;
 
 /*
  * These variables are all dummies that don't do anything, except in some
diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h
index 125d3eb5fff..04f1e49b0e7 100644
--- a/src/include/pg_config_manual.h
+++ b/src/include/pg_config_manual.h
@@ -361,3 +361,27 @@
  * Enable tracing of syncscan operations (see also the trace_syncscan GUC var).
  */
 /* #define TRACE_SYNCSCAN */
+
+
+/*
+ * On some operating systems we know how to change whether pages are
+ * readable/writeable.  We can use that to verify that we are following
+ * buffer locking rules, by making pages inaccessible or read-only when we
+ * don't have sufficient locks etc.  This obviously is fairly expensive, so
+ * by default we only do so in assert-enabled builds.
+ */
+#if defined(USE_ASSERT_CHECKING) && !defined(WIN32)
+#define ENFORCE_BUFFER_PROT
+#endif
+
+/*
+ * Protecting pages against being modified without an exclusive lock /
+ * BufferPrepareToSetHintBits() is reasonably cheap, neither happens *that*
+ * often. Pinning/unpinning buffers is a lot more common, making it more
+ * expensive to call mprotect() that often.
+ *
+ * Therefore disable this by default, even in assert enabled builds.
+ */
+#ifdef ENFORCE_BUFFER_PROT
+/* #define ENFORCE_BUFFER_PROT_READ */
+#endif
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 0bf6a45b500..50f7c7c29a0 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -494,6 +494,15 @@ extern Size StrategyShmemSize(void);
 extern void StrategyInitialize(bool init);
 extern bool have_free_buffer(void);
 
+#ifdef ENFORCE_BUFFER_PROT
+extern void SetBufferProtection(Buffer buf, bool allow_reads, bool allow_writes);
+#else
+static inline void
+SetBufferProtection(Buffer buf, bool allow_reads, bool allow_writes)
+{
+}
+#endif
+
 /* buf_table.c */
 extern Size BufTableShmemSize(int size);
 extern void InitBufTable(int size);
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index b99ebc9e86f..5d0c56841d4 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -63,6 +63,8 @@ typedef enum
 	SHMEM_TYPE_MMAP,
 }			PGShmemType;
 
+extern PGDLLIMPORT int huge_pages_status;
+
 #ifndef WIN32
 extern PGDLLIMPORT unsigned long UsedShmemSegID;
 #else
-- 
2.39.5

