From 1020cd6daf6e513420984d172026218502f99ec2 Mon Sep 17 00:00:00 2001
From: Jelte Fennema-Nio <jelte.fennema@microsoft.com>
Date: Mon, 24 Jun 2024 00:29:39 +0200
Subject: [PATCH v6] Do not reset statement_timeout indicator outside of
 ProcessInterrupts

The only way that ProcessInterrupts can know why QueryCancelPending is
set is by looking at the indicator bits of the various timeouts. The
STATEMENT_TIMEOUT indicator was being reset in two places, causing
ProcessInterrupts to report "user request" instead of "statement
timeout".

The primary reason for this patch is that this different error message
caused a test to be flaky. The reason this was happening is because
enable_statement_timeout would be called twice for a single query. The
intent of calling enable_statement_timeout twice is that the second call
was a no-op, but that was not the case in practice. If the timer expired
right before the second call, then the timer would restart and unset the
current indicator. The QueryCancelPending flag would still be true, so
on the next CHECK_FOR_INTERRUPTS it would throw the "canceled statement
due to user request" error.

The disable_statement_timeout function has a similar theoretical race
condition, where it would reset the indicator, but leave the cancel flag
set. So that one was changed too, even though none of our tests actually
trigger that specific bug.

The downside of keeping these indicators set, is that they can leak
outside of the query that's being run. To make sure that this causes no
problems in practice, ProcessInterrupts now also checks that
DoingCommandRead is false before canceling because of statement_timeout
or lock_timeout, in addition to resetting of those indicators (which it
already did). Given that CHECK_FOR_INTERRUPTS is called right before
changing the DoingCommandRead back to false, this will reset the
indicators before entering the next query.
---
 src/backend/tcop/postgres.c | 44 ++++++++++++++++++++++++-------------
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index e54bf1e760f..b3b4afdac24 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -3436,34 +3436,37 @@ ProcessInterrupts(void)
 			get_timeout_finish_time(STATEMENT_TIMEOUT) < get_timeout_finish_time(LOCK_TIMEOUT))
 			lock_timeout_occurred = false;	/* report stmt timeout */
 
-		if (lock_timeout_occurred)
+		if (DoingCommandRead)
+		{
+			/*
+			 * If we are reading a command from the client, just ignore the
+			 * cancel request --- sending an extra error message won't
+			 * accomplish anything.  Otherwise, go ahead and throw an error
+			 * with the correct reason.
+			 */
+		}
+		else if (lock_timeout_occurred)
 		{
 			LockErrorCleanup();
 			ereport(ERROR,
 					(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
 					 errmsg("canceling statement due to lock timeout")));
 		}
-		if (stmt_timeout_occurred)
+		else if (stmt_timeout_occurred)
 		{
 			LockErrorCleanup();
 			ereport(ERROR,
 					(errcode(ERRCODE_QUERY_CANCELED),
 					 errmsg("canceling statement due to statement timeout")));
 		}
-		if (AmAutoVacuumWorkerProcess())
+		else if (AmAutoVacuumWorkerProcess())
 		{
 			LockErrorCleanup();
 			ereport(ERROR,
 					(errcode(ERRCODE_QUERY_CANCELED),
 					 errmsg("canceling autovacuum task")));
 		}
-
-		/*
-		 * If we are reading a command from the client, just ignore the cancel
-		 * request --- sending an extra error message won't accomplish
-		 * anything.  Otherwise, go ahead and throw the error.
-		 */
-		if (!DoingCommandRead)
+		else
 		{
 			LockErrorCleanup();
 			ereport(ERROR,
@@ -5218,22 +5221,33 @@ enable_statement_timeout(void)
 	if (StatementTimeout > 0
 		&& (StatementTimeout < TransactionTimeout || TransactionTimeout == 0))
 	{
-		if (!get_timeout_active(STATEMENT_TIMEOUT))
+		/*
+		 * We check both if it's active or if it's already triggered. If it's
+		 * already triggered we don't want to restart it because that clears
+		 * the indicator flag, which in turn would cause the wrong error
+		 * message to be used by ProcessInterrupts() on the next
+		 * CHECK_FOR_INTERRUPTS() call. Restarting the timer in that case
+		 * would be pointless anyway, because the statement timeout error is
+		 * going to trigger on the next CHECK_FOR_INTERRUPTS() call.
+		 */
+		if (!get_timeout_active(STATEMENT_TIMEOUT)
+			&& !get_timeout_indicator(STATEMENT_TIMEOUT, false))
 			enable_timeout_after(STATEMENT_TIMEOUT, StatementTimeout);
 	}
 	else
 	{
-		if (get_timeout_active(STATEMENT_TIMEOUT))
-			disable_timeout(STATEMENT_TIMEOUT, false);
+		disable_statement_timeout();
 	}
 }
 
 /*
- * Disable statement timeout, if active.
+ * Disable statement timeout, if active. We preserve the indicator flag
+ * though, otherwise we'd lose the knowledge in ProcessInterupts that the
+ * SIGINT came from a statement timeout.
  */
 static void
 disable_statement_timeout(void)
 {
 	if (get_timeout_active(STATEMENT_TIMEOUT))
-		disable_timeout(STATEMENT_TIMEOUT, false);
+		disable_timeout(STATEMENT_TIMEOUT, true);
 }

base-commit: e5a5e0a90750d665cab417322b9f85c806430d85
-- 
2.52.0