From 6276d2f66b0760053e3fdfe259971be3abba3c63 Mon Sep 17 00:00:00 2001
From: "yizhi.fzh" <yizhi.fzh@alibaba-inc.com>
Date: Fri, 19 Jan 2024 13:52:07 +0800
Subject: [PATCH v6 1/2] Detect more misuse of spin lock automatically

Spin lock are intended for *very* short-term locks, but it is possible
to be misused in many cases. e.g. Acquiring another LWLocks or regular
locks, memory allocation, errstart when holding a spin lock. this patch
would detect such misuse automatically in a USE_ASSERT_CHECKING build.

CHECK_FOR_INTERRUPTS should be avoided as well when holding a spin lock.
Depends on what signals are left to handle, PG may raise error/fatal
which would cause the code jump to some other places which is hardly to
release the spin lock anyway.
---
 src/backend/storage/lmgr/lock.c   |  6 ++++
 src/backend/storage/lmgr/lwlock.c |  6 ++++
 src/backend/storage/lmgr/spin.c   | 13 +++++++++
 src/backend/utils/error/elog.c    |  7 +++++
 src/backend/utils/mmgr/mcxt.c     | 16 +++++++++++
 src/include/miscadmin.h           | 12 +++++++-
 src/include/storage/spin.h        | 46 +++++++++++++++++++++++++++++--
 7 files changed, 102 insertions(+), 4 deletions(-)

diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index c70a1adb9a..cb9969b860 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -776,6 +776,12 @@ LockAcquireExtended(const LOCKTAG *locktag,
 	bool		found_conflict;
 	bool		log_lock = false;
 
+	/*
+	 * Spin lock should not be held for a long time, but the time needed here
+	 * may be too long, so let make sure no spin lock is held now.
+	 */
+	VerifyNoSpinLocksHeld();
+
 	if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
 		elog(ERROR, "unrecognized lock method: %d", lockmethodid);
 	lockMethodTable = LockMethods[lockmethodid];
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index b4b989ac56..2faeb3aba6 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -1205,6 +1205,12 @@ LWLockAcquire(LWLock *lock, LWLockMode mode)
 
 	Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE);
 
+	/*
+	 * Spin lock should not be held for a long time, but the time needed here
+	 * may be too long, so let make sure no spin lock is held now.
+	 */
+	VerifyNoSpinLocksHeld();
+
 	PRINT_LWDEBUG("LWLockAcquire", lock, mode);
 
 #ifdef LWLOCK_STATS
diff --git a/src/backend/storage/lmgr/spin.c b/src/backend/storage/lmgr/spin.c
index 50cb99cd3b..08cc6da5d9 100644
--- a/src/backend/storage/lmgr/spin.c
+++ b/src/backend/storage/lmgr/spin.c
@@ -47,6 +47,9 @@ PGSemaphore *SpinlockSemaArray;
 
 #endif							/* HAVE_SPINLOCKS */
 
+volatile const char *last_spin_lock_file = NULL;
+volatile int last_spin_lock_lineno = -1;
+
 /*
  * Report the amount of shared memory needed to store semaphores for spinlock
  * support.
@@ -178,3 +181,13 @@ tas_sema(volatile slock_t *lock)
 }
 
 #endif							/* !HAVE_SPINLOCKS */
+
+void
+VerifyNoSpinLocksHeld(void)
+{
+#ifdef USE_ASSERT_CHECKING
+	if (last_spin_lock_file != NULL)
+		elog(PANIC, "A spin lock has been held at %s:%d",
+			 last_spin_lock_file, last_spin_lock_lineno);
+#endif
+}
diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c
index 2c7a20e3d3..22662955d2 100644
--- a/src/backend/utils/error/elog.c
+++ b/src/backend/utils/error/elog.c
@@ -351,6 +351,13 @@ errstart(int elevel, const char *domain)
 	bool		output_to_client = false;
 	int			i;
 
+	/*
+	 * Logging likely happens in many places without a outstanding attention,
+	 * and it's far more than a few dozen instructions, so it should be only
+	 * called when there is no spin lock is held.
+	 */
+	VerifyNoSpinLocksHeld();
+
 	/*
 	 * Check some cases in which we want to promote an error into a more
 	 * severe error.  None of this logic applies for non-error messages.
diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c
index 1336944084..7a14e347aa 100644
--- a/src/backend/utils/mmgr/mcxt.c
+++ b/src/backend/utils/mmgr/mcxt.c
@@ -1028,6 +1028,13 @@ MemoryContextAlloc(MemoryContext context, Size size)
 	if (!AllocSizeIsValid(size))
 		elog(ERROR, "invalid memory alloc request size %zu", size);
 
+	/*
+	 * Memory allocation likely happens in many places without a outstanding
+	 * attention, and it's far more than a few dozen instructions, so it
+	 * should be only called when there is no spin lock is held.
+	 */
+	VerifyNoSpinLocksHeld();
+
 	context->isReset = false;
 
 	ret = context->methods->alloc(context, size);
@@ -1071,6 +1078,9 @@ MemoryContextAllocZero(MemoryContext context, Size size)
 	if (!AllocSizeIsValid(size))
 		elog(ERROR, "invalid memory alloc request size %zu", size);
 
+	/* see comments in MemoryContextAlloc. */
+	VerifyNoSpinLocksHeld();
+
 	context->isReset = false;
 
 	ret = context->methods->alloc(context, size);
@@ -1197,6 +1207,9 @@ palloc(Size size)
 	if (!AllocSizeIsValid(size))
 		elog(ERROR, "invalid memory alloc request size %zu", size);
 
+	/* see comments in MemoryContextAlloc. */
+	VerifyNoSpinLocksHeld();
+
 	context->isReset = false;
 
 	ret = context->methods->alloc(context, size);
@@ -1228,6 +1241,9 @@ palloc0(Size size)
 	if (!AllocSizeIsValid(size))
 		elog(ERROR, "invalid memory alloc request size %zu", size);
 
+	/* see comments in MemoryContextAlloc. */
+	VerifyNoSpinLocksHeld();
+
 	context->isReset = false;
 
 	ret = context->methods->alloc(context, size);
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 0b01c1f093..26edf041ca 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -107,6 +107,9 @@ extern PGDLLIMPORT volatile uint32 CritSectionCount;
 /* in tcop/postgres.c */
 extern void ProcessInterrupts(void);
 
+/* in storage/spin.h */
+extern void VerifyNoSpinLocksHeld(void);
+
 /* Test whether an interrupt is pending */
 #ifndef WIN32
 #define INTERRUPTS_PENDING_CONDITION() \
@@ -117,9 +120,16 @@ extern void ProcessInterrupts(void);
 	 unlikely(InterruptPending))
 #endif
 
-/* Service interrupt, if one is pending and it's safe to service it now */
+/*
+ * Service interrupt, if one is pending and it's safe to service it now
+ * Spin lock doesn't have a overall infrastructure to release all the locks
+ * on Error. and ProcessInterrupts likely raises some ERROR or FATAL which
+ * makes the code jump to some other places, then spin lock is leaked
+ * Let's make sure no spin lock can be held at this point.
+ */
 #define CHECK_FOR_INTERRUPTS() \
 do { \
+	VerifyNoSpinLocksHeld(); \
 	if (INTERRUPTS_PENDING_CONDITION()) \
 		ProcessInterrupts(); \
 } while(0)
diff --git a/src/include/storage/spin.h b/src/include/storage/spin.h
index c0679c5999..b69424e9c8 100644
--- a/src/include/storage/spin.h
+++ b/src/include/storage/spin.h
@@ -59,12 +59,52 @@
 
 #define SpinLockInit(lock)	S_INIT_LOCK(lock)
 
-#define SpinLockAcquire(lock) S_LOCK(lock)
+extern PGDLLIMPORT volatile const char *last_spin_lock_file;
+extern PGDLLIMPORT volatile int last_spin_lock_lineno;
+extern void VerifyNoSpinLocksHeld(void);
 
-#define SpinLockRelease(lock) S_UNLOCK(lock)
+#ifdef USE_ASSERT_CHECKING
 
-#define SpinLockFree(lock)	S_LOCK_FREE(lock)
+/*
+ * START_SPIN_LOCK - the start of a spin lock acquiring.
+ *
+ * 	Acquiring another spin lock when holding one spin lock
+ * already is not allowed.
+ */
+#define START_SPIN_LOCK() \
+do \
+{ \
+	VerifyNoSpinLocksHeld(); \
+	last_spin_lock_file = __FILE__; \
+	last_spin_lock_lineno = __LINE__; \
+} while (0)
+
+#define END_SPIN_LOCK() \
+do \
+{ \
+	last_spin_lock_file = NULL; \
+	last_spin_lock_lineno = -1; \
+} while (0)
+#else
+#define START_SPIN_LOCK() ((void) true)
+#define END_SPIN_LOCK() ((void) true)
+#endif
+
+#define SpinLockAcquire(lock) \
+do \
+{ \
+	START_SPIN_LOCK(); \
+	S_LOCK(lock); \
+} while (false)
 
+#define SpinLockRelease(lock) \
+do \
+{ \
+	S_UNLOCK(lock); \
+	END_SPIN_LOCK(); \
+} while (false)
+
+#define SpinLockFree(lock)	S_LOCK_FREE(lock)
 
 extern int	SpinlockSemas(void);
 extern Size SpinlockSemaSize(void);
-- 
2.34.1