From 0ec0beb294f4c5ed35dbb35260f53b069563638f Mon Sep 17 00:00:00 2001
From: Craig Ringer <craig.ringer@2ndquadrant.com>
Date: Thu, 19 Nov 2020 14:24:55 +0800
Subject: [PATCH] LWLock self-deadlock detection

On cassert builds, if we miss the LWLock acquire fast-path and have to
wait for a LWLock, check to make sure that the acquiring backend
doesn't already hold the lock.

This detects conditions that would otherwise cause a backend to
deadlock indefinitely with interrupts disabled. A backend in this
state won't respond to SIGTERM or SIGQUIT and will prevent postgres
from finishing shutdown.

While a LWLock self-deadlock is always the result of a programming
mistake, it's not always easy to forsee all possible locking states
especially when dealing with exception handling and with exit
callbacks.
---
 src/backend/storage/lmgr/lwlock.c | 87 +++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 2fa90cc095..a2d6dec557 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -313,6 +313,85 @@ LOG_LWDEBUG(const char *where, LWLock *lock, const char *msg)
 #define LOG_LWDEBUG(a,b,c) ((void)0)
 #endif							/* LOCK_DEBUG */
 
+#if defined(USE_ASSERT_CHECKING) || defined(LOCK_DEBUG)
+#ifndef LWLOCK_SELF_DEADLOCK_DETECTION
+#define LWLOCK_SELF_DEADLOCK_DETECTION
+#endif
+#endif
+
+#ifdef LWLOCK_SELF_DEADLOCK_DETECTION
+static void
+LWLockReportSelfDeadlock(LWLock *lock, LWLockMode mode, int held_lockno)
+{
+		StringInfoData locklist;
+		int i;
+
+		LOG_LWDEBUG("LWLockReportSelfDeadlock", lock, "acquire self-deadlock detected");
+
+		initStringInfo(&locklist);
+		for (i=0; i < num_held_lwlocks; i++) {
+			appendStringInfo(&locklist, "%s (%p) excl=%d",
+							T_NAME(held_lwlocks[i].lock),
+							held_lwlocks[i].lock,
+							held_lwlocks[i].mode == LW_EXCLUSIVE);
+			if (i > 0)
+				appendStringInfoString(&locklist, ", ");
+		}
+
+
+		/*
+		 * It might be safe to bail out here on a non-cassert build but
+		 * more care is needed before anything like that is enabled.
+		 * LWLockAcquire() doesn't know if it's safe to `elog(FATAL)`
+		 * out from the current callsite. So we PANIC. This introduces some
+		 * risk of buggy code causing a server crash/restart cycle where
+		 * it would've previously just deadlocked a single backend, but
+		 * that's part of why we only enable this for assert builds.
+		 */
+		ereport(PANIC,
+				(errmsg_internal("self-deadlock detected on acquire of lwlock %s (%p) for mode %s: lock already held by this backend in mode %s",
+								 T_NAME(lock), lock,
+								 mode == LW_EXCLUSIVE
+										? "LW_EXCLUSIVE" : "LW_SHARED",
+								 held_lwlocks[held_lockno].mode == LW_EXCLUSIVE
+										? "LW_EXCLUSIVE" : "LW_SHARED"),
+				 errdetail("backend already holds %d lwlocks: %s",
+						   num_held_lwlocks, locklist.data)));
+}
+
+/*
+ * A bug in code that uses this particular LWLock could cause
+ * the lock-holder could be ourselves, in which case we'll
+ * self-deadlock forever as an unkillable process.
+ *
+ * Call this in any LWLock acquire retry loop once the fast-path
+ * acquire has failed.
+ *
+ * held_lockno should be initialized to 0 outside the wait loop
+ * so that this check only runs on the first iteration of any
+ * wait/retry loop.
+ */
+inline static void
+LWLockCheckSelfDeadlock(LWLock *lock, LWLockMode mode, int * const held_lockno)
+{
+	for ( ; *held_lockno < num_held_lwlocks; (*held_lockno)++) {
+		if (unlikely(held_lwlocks[*held_lockno].lock == lock)) {
+			LWLockReportSelfDeadlock(lock, mode, *held_lockno);
+		}
+	}
+}
+#else
+/*
+ * Making LWLockCheckSelfDeadlock an empty static inline instead of a macro
+ * silences compiler whinging about held_lockno being unused while optimising
+ * away without fuss.
+ */
+inline static void
+LWLockCheckSelfDeadlock(LWLock *lock, LWLockMode mode, int * const held_lockno)
+{
+}
+#endif
+
 #ifdef LWLOCK_STATS
 
 static void init_lwlock_stats(void);
@@ -1210,6 +1289,7 @@ LWLockAcquire(LWLock *lock, LWLockMode mode)
 	PGPROC	   *proc = MyProc;
 	bool		result = true;
 	int			extraWaits = 0;
+	int			held_lockno = 0; /* for self-deadlock detection */
 #ifdef LWLOCK_STATS
 	lwlock_stats *lwstats;
 
@@ -1322,6 +1402,9 @@ LWLockAcquire(LWLock *lock, LWLockMode mode)
 		lwstats->block_count++;
 #endif
 
+		/* Detect if we're trying to acquire our own lock */
+		LWLockCheckSelfDeadlock(lock, mode, &held_lockno);
+
 		LWLockReportWaitStart(lock);
 		TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode);
 
@@ -1621,6 +1704,7 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
 	PGPROC	   *proc = MyProc;
 	int			extraWaits = 0;
 	bool		result = false;
+	int			held_lockno = 0; /* for self-deadlock detection */
 #ifdef LWLOCK_STATS
 	lwlock_stats *lwstats;
 
@@ -1699,6 +1783,9 @@ LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
 		lwstats->block_count++;
 #endif
 
+		/* Detect if we're trying to wait on our own lock */
+		LWLockCheckSelfDeadlock(lock, LW_EXCLUSIVE, &held_lockno);
+
 		LWLockReportWaitStart(lock);
 		TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), LW_EXCLUSIVE);
 
-- 
2.26.2

