From 0b6f74eecd2a6ec0fe7361c7933f28cfffb30a18 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Sun, 15 Oct 2023 10:06:09 +0200
Subject: [PATCH v16 3/4] Merge constants in ArrayExpr into groups

With query_id_const_merge enabled, only the first and the last element of
an ArrayExpr are used to compute the query id. Extend this to also take the
number of elements into account, and merge constants into groups based on
it. The resulting groups are bounded by powers of 10, i.e. 1 to 9, 10 to 99, etc.
---
 .../pg_stat_statements/expected/merging.out   | 84 +++++++++++++++----
 .../pg_stat_statements/pg_stat_statements.c   | 17 +++-
 contrib/pg_stat_statements/sql/merging.sql    | 13 +++
 doc/src/sgml/pgstatstatements.sgml            | 11 +--
 src/backend/nodes/queryjumblefuncs.c          | 52 ++++++++----
 src/include/nodes/queryjumble.h               |  7 +-
 6 files changed, 142 insertions(+), 42 deletions(-)

diff --git a/contrib/pg_stat_statements/expected/merging.out b/contrib/pg_stat_statements/expected/merging.out
index f286c735a36..7400870f3f6 100644
--- a/contrib/pg_stat_statements/expected/merging.out
+++ b/contrib/pg_stat_statements/expected/merging.out
@@ -54,11 +54,11 @@ SELECT * FROM test_merge WHERE id IN (1, 2, 3);
 (0 rows)
 
 SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
-                   query                    | calls 
---------------------------------------------+-------
- SELECT * FROM test_merge WHERE id IN ($1)  |     1
- SELECT * FROM test_merge WHERE id IN (...) |     1
- SELECT pg_stat_statements_reset()          |     1
+                          query                           | calls 
+----------------------------------------------------------+-------
+ SELECT * FROM test_merge WHERE id IN ($1)                |     1
+ SELECT * FROM test_merge WHERE id IN (... [1-9 entries]) |     1
+ SELECT pg_stat_statements_reset()                        |     1
 (3 rows)
 
 SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9);
@@ -80,7 +80,60 @@ SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
                                  query                                  | calls 
 ------------------------------------------------------------------------+-------
  SELECT * FROM test_merge WHERE id IN ($1)                              |     1
- SELECT * FROM test_merge WHERE id IN (...)                             |     4
+ SELECT * FROM test_merge WHERE id IN (... [1-9 entries])               |     2
+ SELECT * FROM test_merge WHERE id IN (... [10-99 entries])             |     2
+ SELECT pg_stat_statements_reset()                                      |     1
+ SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C" |     1
+(5 rows)
+
+-- Second order of magnitude, brace yourself
+SELECT pg_stat_statements_reset();
+ pg_stat_statements_reset 
+--------------------------
+ 
+(1 row)
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110);
+ id | data 
+----+------
+(0 rows)
+
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+                            query                             | calls 
+--------------------------------------------------------------+-------
+ SELECT * FROM test_merge WHERE id IN (... [100-999 entries]) |     1
+ SELECT pg_stat_statements_reset()                            |     1
+(2 rows)
+
+-- With gaps on the threshold
+SELECT pg_stat_statements_reset();
+ pg_stat_statements_reset 
+--------------------------
+ 
+(1 row)
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4);
+ id | data 
+----+------
+(0 rows)
+
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+                          query                           | calls 
+----------------------------------------------------------+-------
+ SELECT * FROM test_merge WHERE id IN (... [1-9 entries]) |     1
+ SELECT pg_stat_statements_reset()                        |     1
+(2 rows)
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ id | data 
+----+------
+(0 rows)
+
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+                                 query                                  | calls 
+------------------------------------------------------------------------+-------
+ SELECT * FROM test_merge WHERE id IN (... [1-9 entries])               |     1
+ SELECT * FROM test_merge WHERE id IN (... [10-99 entries])             |     1
  SELECT pg_stat_statements_reset()                                      |     1
  SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C" |     1
 (4 rows)
@@ -108,11 +161,12 @@ SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) and dat
 (0 rows)
 
 SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
-                          query                           | calls 
-----------------------------------------------------------+-------
- SELECT * FROM test_merge WHERE id IN (...) and data = $3 |     3
- SELECT pg_stat_statements_reset()                        |     1
-(2 rows)
+                                  query                                   | calls 
+--------------------------------------------------------------------------+-------
+ SELECT * FROM test_merge WHERE id IN (... [1-9 entries]) and data = $3   |     1
+ SELECT * FROM test_merge WHERE id IN (... [10-99 entries]) and data = $3 |     2
+ SELECT pg_stat_statements_reset()                                        |     1
+(3 rows)
 
 -- No constants simplification
 SELECT pg_stat_statements_reset();
@@ -147,10 +201,10 @@ SELECT * FROM test_merge_numeric WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
 (0 rows)
 
 SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
-                       query                        | calls 
-----------------------------------------------------+-------
- SELECT * FROM test_merge_numeric WHERE id IN (...) |     1
- SELECT pg_stat_statements_reset()                  |     1
+                               query                                | calls 
+--------------------------------------------------------------------+-------
+ SELECT * FROM test_merge_numeric WHERE id IN (... [10-99 entries]) |     1
+ SELECT pg_stat_statements_reset()                                  |     1
 (2 rows)
 
 -- Test constants evaluation, verifies a tricky part to make sure there are no
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index 4d47a746670..a5702c3d749 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -2714,6 +2714,8 @@ generate_normalized_query(JumbleState *jstate, const char *query,
 				last_tok_len = 0;	/* Length (in bytes) of that tok */
 	bool		skip = false; 	/* Signals that certain constants are
 								   merged together and have to be skipped */
+	int 		magnitude; 		/* Order of magnitude for number of merged
+								   constants */
 
 
 	/*
@@ -2754,7 +2756,8 @@ generate_normalized_query(JumbleState *jstate, const char *query,
 		Assert(len_to_wrt >= 0);
 
 		/* Normal path, non merged constant */
-		if (!jstate->clocations[i].merged)
+		magnitude = jstate->clocations[i].magnitude;
+		if (magnitude == 0)
 		{
 			memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
 			n_quer_loc += len_to_wrt;
@@ -2770,12 +2773,22 @@ generate_normalized_query(JumbleState *jstate, const char *query,
 		/* The firsts merged constant */
 		else if (!skip)
 		{
+			static const uint32 powers_of_ten[] = {
+				1, 10, 100,
+				1000, 10000, 100000,
+				1000000, 10000000, 100000000,
+				1000000000
+			};
+			int lower_merged = powers_of_ten[magnitude - 1];
+			int upper_merged = powers_of_ten[magnitude];
+
 			memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
 			n_quer_loc += len_to_wrt;
 
 			/* Skip the following until a non merged constant appear */
 			skip = true;
-			n_quer_loc += sprintf(norm_query + n_quer_loc, "...");
+			n_quer_loc += sprintf(norm_query + n_quer_loc, "... [%d-%d entries]",
+								  lower_merged, upper_merged - 1);
 		}
 		/* Otherwise the constant is merged away */
 
diff --git a/contrib/pg_stat_statements/sql/merging.sql b/contrib/pg_stat_statements/sql/merging.sql
index 8b589135daa..c515e48d50c 100644
--- a/contrib/pg_stat_statements/sql/merging.sql
+++ b/contrib/pg_stat_statements/sql/merging.sql
@@ -27,6 +27,19 @@ SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
 SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
 SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
 
+-- Second order of magnitude, brace yourself
+SELECT pg_stat_statements_reset();
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110);
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+
+-- With gaps on the threshold
+SELECT pg_stat_statements_reset();
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4);
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+
 -- More conditions in the query
 SELECT pg_stat_statements_reset();
 
diff --git a/doc/src/sgml/pgstatstatements.sgml b/doc/src/sgml/pgstatstatements.sgml
index bba8e5e11ed..a919696abc2 100644
--- a/doc/src/sgml/pgstatstatements.sgml
+++ b/doc/src/sgml/pgstatstatements.sgml
@@ -559,7 +559,7 @@
 =# SELECT * FROM test WHERE a IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
 =# SELECT query, calls FROM pg_stat_statements;
 -[ RECORD 1 ]------------------------------
-query | SELECT * FROM test WHERE a IN (...)
+query | SELECT * FROM test WHERE a IN (... [10-99 entries])
 calls | 2
 -[ RECORD 2 ]------------------------------
 query | SELECT pg_stat_statements_reset()
@@ -896,10 +896,11 @@ calls | 1
       with an array of different lenght.
 
       If this parameter is on, an array of constants will contribute only the
-      first and the last elements to the query identifier. It means two
-      occurences of the same query, where the only difference is number of
-      constants in the array, are going to get the same query identifier.
-      Such queries are represented in form <literal>'(...)'</literal>.
+      first element, the last element and the number of elements to the query
+      identifier. It means two occurrences of the same query, where the only
+      difference is the number of constants in the array, are going to get the
+      same query identifier if the array lengths have the same number of digits.
+      Such queries are represented in form <literal>'(... [10-99 entries])'</literal>.
 
       The parameter could be used to reduce amount of repeating data stored
       via <structname>pg_stat_statements</structname>.  The default value is off.
diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c
index 4bc16dde6a0..a1d4567ca66 100644
--- a/src/backend/nodes/queryjumblefuncs.c
+++ b/src/backend/nodes/queryjumblefuncs.c
@@ -37,6 +37,8 @@
 #include "nodes/queryjumble.h"
 #include "parser/scansup.h"
 
+#include "utils/numutils.h"
+
 #define JUMBLE_SIZE				1024	/* query serialization buffer size */
 
 /* GUC parameters */
@@ -51,7 +53,7 @@ bool		query_id_enabled = false;
 static void AppendJumble(JumbleState *jstate,
 						 const unsigned char *item, Size size);
 static void RecordConstLocation(JumbleState *jstate,
-								int location, bool merged);
+								int location, int magnitude);
 static void _jumbleNode(JumbleState *jstate, Node *node);
 static void _jumbleElements(JumbleState *jstate, List *elements);
 static void _jumbleA_Const(JumbleState *jstate, Node *node);
@@ -206,12 +208,15 @@ AppendJumble(JumbleState *jstate, const unsigned char *item, Size size)
  * Record location of constant within query string of query tree that is
  * currently being walked.
  *
- * Merged argument signals that the constant represents the first or the last
- * element in a series of merged constants, and everything but the first/last
- * element contributes nothing to the jumble hash.
+ * Magnitude argument larger than zero signals that the constant represents the
+ * first or the last element in a series of merged constants, and everything
+ * but such first/last element will contribute nothing to the jumble hash. The
+ * magnitude value specifies order of magnitude (i.e. how many digits it has)
+ * for the number of elements in the series, to represent the fact of merging
+ * later on.
  */
 static void
-RecordConstLocation(JumbleState *jstate, int location, bool merged)
+RecordConstLocation(JumbleState *jstate, int location, int magnitude)
 {
 	/* -1 indicates unknown or undefined location */
 	if (location >= 0)
@@ -227,7 +232,7 @@ RecordConstLocation(JumbleState *jstate, int location, bool merged)
 		}
 		jstate->clocations[jstate->clocations_count].location = location;
 		/* initialize lengths to -1 to simplify third-party module usage */
-		jstate->clocations[jstate->clocations_count].merged = merged;
+		jstate->clocations[jstate->clocations_count].magnitude = magnitude;
 		jstate->clocations[jstate->clocations_count].length = -1;
 		jstate->clocations_count++;
 	}
@@ -237,24 +242,26 @@ RecordConstLocation(JumbleState *jstate, int location, bool merged)
  * Verify if the provided list contains could be merged down, which means it
  * contains only constant expressions.
  *
- * Return value indicates if merging is possible.
+ * Return value is the order of magnitude (i.e. how many digits it has) for
+ * length of the list (to use for representation purposes later on) if merging
+ * is possible, otherwise zero.
  *
  * Note that this function searches only for explicit Const nodes and does not
  * try to simplify expressions.
  */
-static bool
+static int
 IsMergeableConstList(List *elements, Const **firstConst, Const **lastConst)
 {
 	ListCell   *temp;
 	Node	   *firstExpr = NULL;
 
 	if (elements == NULL)
-		return false;
+		return 0;
 
 	if (!query_id_const_merge)
 	{
 		/* Merging is disabled, process everything one by one */
-		return false;
+		return 0;
 	}
 
 	firstExpr = linitial(elements);
@@ -268,26 +275,26 @@ IsMergeableConstList(List *elements, Const **firstConst, Const **lastConst)
 	{
 		foreach(temp, elements)
 			if (!IsA(lfirst(temp), Const))
-				return false;
+				return 0;
 
 		*firstConst = (Const *) firstExpr;
 		*lastConst = llast_node(Const, elements);
-		return true;
+		return decimalLength32(elements->length);
 	}
 
 	/*
 	 * If we end up here, it means no constants merging is possible, process
 	 * the list as usual.
 	 */
-	return false;
+	return 0;
 }
 
 #define JUMBLE_NODE(item) \
 	_jumbleNode(jstate, (Node *) expr->item)
 #define JUMBLE_ELEMENTS(list) \
 	_jumbleElements(jstate, (List *) expr->list)
-#define JUMBLE_LOCATION(location, merged) \
-	RecordConstLocation(jstate, expr->location, merged)
+#define JUMBLE_LOCATION(location, magnitude) \
+	RecordConstLocation(jstate, expr->location, magnitude)
 #define JUMBLE_FIELD(item) \
 	AppendJumble(jstate, (const unsigned char *) &(expr->item), sizeof(expr->item))
 #define JUMBLE_FIELD_SINGLE(item) \
@@ -304,10 +311,19 @@ static void
 _jumbleElements(JumbleState *jstate, List *elements)
 {
 	Const *first, *last;
-	if(IsMergeableConstList(elements, &first, &last))
+	int magnitude = IsMergeableConstList(elements, &first, &last);
+
+	if (magnitude)
 	{
-		RecordConstLocation(jstate, first->location, true);
-		RecordConstLocation(jstate, last->location, true);
+		RecordConstLocation(jstate, first->location, magnitude);
+		RecordConstLocation(jstate, last->location, magnitude);
+
+		/*
+		 * After merging constants down we end up with only two constants, the
+		 * first and the last one. To distinguish the order of magnitude behind
+		 * merged constants, add its value into the jumble.
+		 */
+		JUMBLE_FIELD_SINGLE(magnitude);
 	}
 	else
 	{
diff --git a/src/include/nodes/queryjumble.h b/src/include/nodes/queryjumble.h
index c64a007ad3f..8ee2e9afbb6 100644
--- a/src/include/nodes/queryjumble.h
+++ b/src/include/nodes/queryjumble.h
@@ -26,9 +26,12 @@ typedef struct LocationLen
 
 	/*
 	 * Indicates the constant represents the beginning or the end of a merged
-	 * constants interval.
+	 * constants interval. The value shows how many constants were merged away
+	 * (up to a power of 10), or in other words the order of magnitude for
+	 * number of merged constants (i.e. how many digits it has). Otherwise the
+	 * value is 0, indicating that no merging was performed.
 	 */
-	bool		merged;
+	int			magnitude;
 } LocationLen;
 
 /*
-- 
2.41.0

