From 0f46bb1357bafbe940e7df8fce38c01e2237f57e Mon Sep 17 00:00:00 2001
From: Marti Raudsepp <marti@juffo.org>
Date: Wed, 28 Sep 2011 00:46:48 +0300
Subject: [PATCH] Log crashed backend's query (activity string)

The crashing query is often a good starting point in debugging the
cause, and much more easily accessible than core dumps.

We're extra-paranoid in reading the activity buffer since it might be
corrupt. All non-ASCII characters are replaced with '?'

Example output:
LOG:  server process (PID 31451) was terminated by signal 9: Killed
DETAIL:  Running query: DO LANGUAGE plpythonu 'import os;os.kill(os.getpid(),9)'
---
 src/backend/postmaster/pgstat.c     |   59 +++++++++++++++++++++++++++++++++++
 src/backend/postmaster/postmaster.c |   17 +++++++---
 src/backend/utils/adt/ascii.c       |   31 ++++++++++++++++++
 src/include/pgstat.h                |    1 +
 src/include/utils/ascii.h           |    1 +
 5 files changed, 104 insertions(+), 5 deletions(-)

diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index eb9adc8..812bf04 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -58,6 +58,7 @@
 #include "storage/pg_shmem.h"
 #include "storage/pmsignal.h"
 #include "storage/procsignal.h"
+#include "utils/ascii.h"
 #include "utils/guc.h"
 #include "utils/memutils.h"
 #include "utils/ps_status.h"
@@ -2747,6 +2748,64 @@ pgstat_get_backend_current_activity(int pid, bool checkUser)
 	return "<backend information not available>";
 }
 
+/* ----------
+ * pgstat_get_backend_current_activity() -
+ *
+ *	Return a string representing the current activity of the backend with
+ *	the specified PID.	Like the function above, but reads shared memory with
+ *	the expectation that it may be corrupt.
+ *
+ *	This function is only intended to be used by postmaster to report the
+ *	query that crashed the backend. In particular, no attempt is made to
+ *	follow the correct concurrency protocol when accessing the
+ *	BackendStatusArray. But that's OK, in the worst case we get a corrupted
+ *	message. We also must take care not to trip on ereport(ERROR).
+ *
+ *	Note: return strings for special cases match pg_stat_get_backend_activity.
+ * ----------
+ */
+const char *
+pgstat_get_crashed_backend_activity(int pid)
+{
+	volatile PgBackendStatus *beentry;
+	int			i;
+
+	beentry = BackendStatusArray;
+	for (i = 1; i <= MaxBackends; i++)
+	{
+		if (beentry->st_procpid == pid)
+		{
+			/* Read pointer just once, so it can't change after validation */
+			const char *activity = beentry->st_activity;
+			char	   *buffer;
+
+			/*
+			 * We can't access activity pointer before we verify that it
+			 * falls into BackendActivityBuffer. To make sure that the
+			 * string's ending is contained within the buffer, the string
+			 * can start at offset (MaxBackends - 1) at most.
+			 */
+			if (activity < BackendActivityBuffer ||
+				activity > (BackendActivityBuffer +
+						(MaxBackends - 1) * pgstat_track_activity_query_size))
+				return "<command string corrupt>";
+
+			if (*(activity) == '\0')
+				return "<command string not enabled>";
+
+			buffer = (char *) palloc(pgstat_track_activity_query_size);
+			ascii_safe_strncpy(buffer, activity,
+							   pgstat_track_activity_query_size);
+
+			return buffer;
+		}
+
+		beentry++;
+	}
+
+	/* PID not found */
+	return "<backend information not available>";
+}
 
 /* ------------------------------------------------------------
  * Local support functions follow
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 94b57fa..9ba622c 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -2763,6 +2763,8 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 static void
 LogChildExit(int lev, const char *procname, int pid, int exitstatus)
 {
+	const char   *activity = pgstat_get_crashed_backend_activity(pid);
+
 	if (WIFEXITED(exitstatus))
 		ereport(lev,
 
@@ -2770,7 +2772,8 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus)
 		  translator: %s is a noun phrase describing a child process, such as
 		  "server process" */
 				(errmsg("%s (PID %d) exited with exit code %d",
-						procname, pid, WEXITSTATUS(exitstatus))));
+						procname, pid, WEXITSTATUS(exitstatus)),
+				 errdetail("Running query: %s", activity)));
 	else if (WIFSIGNALED(exitstatus))
 #if defined(WIN32)
 		ereport(lev,
@@ -2780,7 +2783,8 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus)
 		  "server process" */
 				(errmsg("%s (PID %d) was terminated by exception 0x%X",
 						procname, pid, WTERMSIG(exitstatus)),
-				 errhint("See C include file \"ntstatus.h\" for a description of the hexadecimal value.")));
+				 errhint("See C include file \"ntstatus.h\" for a description of the hexadecimal value."),
+				 errdetail("Running query: %s", activity)));
 #elif defined(HAVE_DECL_SYS_SIGLIST) && HAVE_DECL_SYS_SIGLIST
 	ereport(lev,
 
@@ -2790,7 +2794,8 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus)
 			(errmsg("%s (PID %d) was terminated by signal %d: %s",
 					procname, pid, WTERMSIG(exitstatus),
 					WTERMSIG(exitstatus) < NSIG ?
-					sys_siglist[WTERMSIG(exitstatus)] : "(unknown)")));
+					sys_siglist[WTERMSIG(exitstatus)] : "(unknown)"),
+			 errdetail("Running query: %s", activity)));
 #else
 		ereport(lev,
 
@@ -2798,7 +2803,8 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus)
 		  translator: %s is a noun phrase describing a child process, such as
 		  "server process" */
 				(errmsg("%s (PID %d) was terminated by signal %d",
-						procname, pid, WTERMSIG(exitstatus))));
+						procname, pid, WTERMSIG(exitstatus)),
+				 errdetail("Running query: %s", activity)));
 #endif
 	else
 		ereport(lev,
@@ -2807,7 +2813,8 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus)
 		  translator: %s is a noun phrase describing a child process, such as
 		  "server process" */
 				(errmsg("%s (PID %d) exited with unrecognized status %d",
-						procname, pid, exitstatus)));
+						procname, pid, exitstatus),
+				 errdetail("Running query: %s", activity)));
 }
 
 /*
diff --git a/src/backend/utils/adt/ascii.c b/src/backend/utils/adt/ascii.c
index 7ecf494..d59f6ec 100644
--- a/src/backend/utils/adt/ascii.c
+++ b/src/backend/utils/adt/ascii.c
@@ -158,3 +158,34 @@ to_ascii_default(PG_FUNCTION_ARGS)
 
 	PG_RETURN_TEXT_P(encode_to_ascii(data, enc));
 }
+
+/* ----------
+ * convert a string in unknown encoding to ASCII. Replace non-ASCII bytes
+ * with '?'
+ * This must not trigger ereport(ERROR), as this is called from postmaster.
+ * ----------
+ */
+void
+ascii_safe_strncpy(char *dest, const char *src, int len)
+{
+	int			i;
+
+	for (i = 0; i < (len - 1); i++)
+	{
+		char ch = src[i];
+
+		if (ch == '\0')
+			break;
+		/* Keep printable ASCII characters */
+		if (32 <= ch && ch <= 127)
+			dest[i] = src[i];
+		/* White-space is also OK */
+		else if (ch == '\n' || ch == '\r' || ch == '\t')
+			dest[i] = src[i];
+		/* Everything else is replaced with '?' */
+		else
+			dest[i] = '?';
+	}
+
+	dest[i] = '\0';
+}
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 20c4d43..9694896 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -720,6 +720,7 @@ extern void pgstat_report_appname(const char *appname);
 extern void pgstat_report_xact_timestamp(TimestampTz tstamp);
 extern void pgstat_report_waiting(bool waiting);
 extern const char *pgstat_get_backend_current_activity(int pid, bool checkUser);
+extern const char *pgstat_get_crashed_backend_activity(int pid);
 
 extern PgStat_TableStatus *find_tabstat_entry(Oid rel_id);
 extern PgStat_BackendFunctionEntry *find_funcstat_entry(Oid func_id);
diff --git a/src/include/utils/ascii.h b/src/include/utils/ascii.h
index ad70e03..adeddb5 100644
--- a/src/include/utils/ascii.h
+++ b/src/include/utils/ascii.h
@@ -16,5 +16,6 @@
 extern Datum to_ascii_encname(PG_FUNCTION_ARGS);
 extern Datum to_ascii_enc(PG_FUNCTION_ARGS);
 extern Datum to_ascii_default(PG_FUNCTION_ARGS);
+extern void ascii_safe_strncpy(char *dest, const char *src, int len);
 
 #endif   /* _ASCII_H_ */
-- 
1.7.6.3

