From ea023f4cd892d65e36ac24450fa57003047bd592 Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Fri, 21 Mar 2025 14:09:39 +0900
Subject: [PATCH 1/2] Add more entropy to query jumbling

NULL nodes have now some arbitrary data added to the jumbling, with
a rule enforcing that all nodes need to participate in the query jumble.
This last part counts for custom jumble functions.
---
 src/include/nodes/queryjumble.h               | 15 +++-
 src/backend/nodes/queryjumblefuncs.c          | 27 ++++++
 .../pg_stat_statements/expected/select.out    | 87 ++++++++++++++++++-
 contrib/pg_stat_statements/sql/select.sql     | 20 +++++
 4 files changed, 147 insertions(+), 2 deletions(-)

diff --git a/src/include/nodes/queryjumble.h b/src/include/nodes/queryjumble.h
index 905f66bc0bd4..e13fd49942ce 100644
--- a/src/include/nodes/queryjumble.h
+++ b/src/include/nodes/queryjumble.h
@@ -40,9 +40,22 @@ typedef struct JumbleState
 	/* Jumble of current query tree */
 	unsigned char *jumble;
 
-	/* Number of bytes used in jumble[] */
+	/* Number of bytes used in jumble[], capped at JUMBLE_SIZE */
 	Size		jumble_len;
 
+	/*
+	 * Total number of number bytes used in a query jumble, used for
+	 * sanity checks and debugging.
+	 */
+	Size		jumble_total_len;
+
+	/*
+	 * Extra counter provided for the case of NULL entries.  This counter is
+	 * incremented each time a NULL node or string is found, providing some
+	 * data then appended to a query jumble.
+	 */
+	int			null_count;
+
 	/* Array of locations of constants that should be removed */
 	LocationLen *clocations;
 
diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c
index 189bfda610aa..cc3c73117f6c 100644
--- a/src/backend/nodes/queryjumblefuncs.c
+++ b/src/backend/nodes/queryjumblefuncs.c
@@ -129,6 +129,8 @@ JumbleQuery(Query *query)
 	/* Set up workspace for query jumbling */
 	jstate->jumble = (unsigned char *) palloc(JUMBLE_SIZE);
 	jstate->jumble_len = 0;
+	jstate->jumble_total_len = 0;
+	jstate->null_count = 0;
 	jstate->clocations_buf_size = 32;
 	jstate->clocations = (LocationLen *)
 		palloc(jstate->clocations_buf_size * sizeof(LocationLen));
@@ -179,6 +181,8 @@ AppendJumble(JumbleState *jstate, const unsigned char *item, Size size)
 	unsigned char *jumble = jstate->jumble;
 	Size		jumble_len = jstate->jumble_len;
 
+	Assert(size > 0);
+
 	/*
 	 * Whenever the jumble buffer is full, we hash the current contents and
 	 * reset the buffer to contain just that hash value, thus relying on the
@@ -202,6 +206,7 @@ AppendJumble(JumbleState *jstate, const unsigned char *item, Size size)
 		jumble_len += part_size;
 		item += part_size;
 		size -= part_size;
+		jstate->jumble_total_len += part_size;
 	}
 	jstate->jumble_len = jumble_len;
 }
@@ -328,10 +333,17 @@ IsSquashableConstList(List *elements, Node **firstExpr, Node **lastExpr)
 	AppendJumble(jstate, (const unsigned char *) &(expr->item), sizeof(expr->item))
 #define JUMBLE_FIELD_SINGLE(item) \
 	AppendJumble(jstate, (const unsigned char *) &(item), sizeof(item))
+
+/* Append string to the jumble, including NULL values */
 #define JUMBLE_STRING(str) \
 do { \
 	if (expr->str) \
 		AppendJumble(jstate, (const unsigned char *) (expr->str), strlen(expr->str) + 1); \
+	else \
+	{ \
+		jstate->null_count++; \
+		JUMBLE_FIELD_SINGLE(jstate->null_count); \
+	} \
 } while(0)
 
 #include "queryjumblefuncs.funcs.c"
@@ -379,9 +391,18 @@ static void
 _jumbleNode(JumbleState *jstate, Node *node)
 {
 	Node	   *expr = node;
+	Size		initial_total_len = jstate->jumble_total_len;
 
 	if (expr == NULL)
+	{
+		/*
+		 * Increment the NULL counter, and add it to the jumble to force
+		 * more entropy to the computation.
+		 */
+		jstate->null_count++;
+		JUMBLE_FIELD_SINGLE(jstate->null_count);
 		return;
+	}
 
 	/* Guard against stack overflow due to overly complex expressions */
 	check_stack_depth();
@@ -429,6 +450,12 @@ _jumbleNode(JumbleState *jstate, Node *node)
 		default:
 			break;
 	}
+
+	/*
+	 * All nodes must participate in the computation, even nodes with
+	 * custom functions.
+	 */
+	Assert(initial_total_len < jstate->jumble_total_len);
 }
 
 static void
diff --git a/contrib/pg_stat_statements/expected/select.out b/contrib/pg_stat_statements/expected/select.out
index 37a30af034a6..1587d2cafb3a 100644
--- a/contrib/pg_stat_statements/expected/select.out
+++ b/contrib/pg_stat_statements/expected/select.out
@@ -19,6 +19,86 @@ SELECT 1 AS "int";
    1
 (1 row)
 
+-- LIMIT and OFFSET patterns
+-- These require more entropy with parsing node offsets.
+SELECT 1 AS "int" LIMIT 1;
+ int 
+-----
+   1
+(1 row)
+
+SELECT 1 AS "int" LIMIT 2;
+ int 
+-----
+   1
+(1 row)
+
+SELECT 1 AS "int" OFFSET 1;
+ int 
+-----
+(0 rows)
+
+SELECT 1 AS "int" OFFSET 2;
+ int 
+-----
+(0 rows)
+
+SELECT 1 AS "int" OFFSET 1 LIMIT 1;
+ int 
+-----
+(0 rows)
+
+SELECT 1 AS "int" OFFSET 2 LIMIT 2;
+ int 
+-----
+(0 rows)
+
+SELECT 1 AS "int" LIMIT 1 OFFSET 1;
+ int 
+-----
+(0 rows)
+
+SELECT 1 AS "int" LIMIT 3 OFFSET 3;
+ int 
+-----
+(0 rows)
+
+SELECT 1 AS "int" OFFSET 1 FETCH FIRST 2 ROW ONLY;
+ int 
+-----
+(0 rows)
+
+SELECT 1 AS "int" OFFSET 2 FETCH FIRST 3 ROW ONLY;
+ int 
+-----
+(0 rows)
+
+-- DISTINCT and ORDER BY patterns
+-- These require more entropy with parsing node offsets.
+SELECT DISTINCT 1 AS "int";
+ int 
+-----
+   1
+(1 row)
+
+SELECT DISTINCT 2 AS "int";
+ int 
+-----
+   2
+(1 row)
+
+SELECT 1 AS "int" ORDER BY 1;
+ int 
+-----
+   1
+(1 row)
+
+SELECT 2 AS "int" ORDER BY 1;
+ int 
+-----
+   2
+(1 row)
+
 /* this comment should not appear in the output */
 SELECT 'hello'
   -- but this one will appear
@@ -135,9 +215,14 @@ SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C";
      3 |    3 | SELECT $1 + $2 + $3 AS "add"
      1 |    1 | SELECT $1 AS "float"
      2 |    2 | SELECT $1 AS "int"
+     2 |    2 | SELECT $1 AS "int" LIMIT $2
+     2 |    0 | SELECT $1 AS "int" OFFSET $2
+     6 |    0 | SELECT $1 AS "int" OFFSET $2 LIMIT $3
+     2 |    2 | SELECT $1 AS "int" ORDER BY 1
      1 |    2 | SELECT $1 AS i UNION SELECT $2 ORDER BY i
      1 |    1 | SELECT $1 || $2
      1 |    1 | SELECT $1, $2 LIMIT $3
+     2 |    2 | SELECT DISTINCT $1 AS "int"
      0 |    0 | SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C"
      1 |    1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t
      1 |    2 | WITH t(f) AS (                                                              +
@@ -145,7 +230,7 @@ SELECT calls, rows, query FROM pg_stat_statements ORDER BY query COLLATE "C";
        |      | )                                                                           +
        |      |   SELECT f FROM t ORDER BY f
      1 |    1 | select $1::jsonb ? $2
-(12 rows)
+(17 rows)
 
 SELECT pg_stat_statements_reset() IS NOT NULL AS t;
  t 
diff --git a/contrib/pg_stat_statements/sql/select.sql b/contrib/pg_stat_statements/sql/select.sql
index e0be58d5e24b..4dcfa8ef74dc 100644
--- a/contrib/pg_stat_statements/sql/select.sql
+++ b/contrib/pg_stat_statements/sql/select.sql
@@ -12,6 +12,26 @@ SELECT pg_stat_statements_reset() IS NOT NULL AS t;
 --
 SELECT 1 AS "int";
 
+-- LIMIT and OFFSET patterns
+-- These require more entropy with parsing node offsets.
+SELECT 1 AS "int" LIMIT 1;
+SELECT 1 AS "int" LIMIT 2;
+SELECT 1 AS "int" OFFSET 1;
+SELECT 1 AS "int" OFFSET 2;
+SELECT 1 AS "int" OFFSET 1 LIMIT 1;
+SELECT 1 AS "int" OFFSET 2 LIMIT 2;
+SELECT 1 AS "int" LIMIT 1 OFFSET 1;
+SELECT 1 AS "int" LIMIT 3 OFFSET 3;
+SELECT 1 AS "int" OFFSET 1 FETCH FIRST 2 ROW ONLY;
+SELECT 1 AS "int" OFFSET 2 FETCH FIRST 3 ROW ONLY;
+
+-- DISTINCT and ORDER BY patterns
+-- These require more entropy with parsing node offsets.
+SELECT DISTINCT 1 AS "int";
+SELECT DISTINCT 2 AS "int";
+SELECT 1 AS "int" ORDER BY 1;
+SELECT 2 AS "int" ORDER BY 1;
+
 /* this comment should not appear in the output */
 SELECT 'hello'
   -- but this one will appear
-- 
2.49.0