From 4ad001ab9866467c90d5b1ddc664bfe1b6b87ee4 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Sun, 15 Oct 2023 10:06:09 +0200
Subject: [PATCH v15 3/4] Merge constants in ArrayExpr into groups

Using query_id_const_merge only first/last element in an ArrayExpr will
be used to compute query id. Extend this to take into account number of
elements, and merge constants into groups based on it. Resulting groups
are powers of 10, i.e. 1 to 9, 10 to 99, etc.
---
 .../pg_stat_statements/expected/merging.out   | 84 +++++++++++++++----
 .../pg_stat_statements/pg_stat_statements.c   | 17 +++-
 contrib/pg_stat_statements/sql/merging.sql    | 13 +++
 doc/src/sgml/config.sgml                      |  9 +-
 doc/src/sgml/pgstatstatements.sgml            |  2 +-
 src/backend/nodes/queryjumblefuncs.c          | 52 ++++++++----
 src/include/nodes/queryjumble.h               |  7 +-
 7 files changed, 142 insertions(+), 42 deletions(-)

diff --git a/contrib/pg_stat_statements/expected/merging.out b/contrib/pg_stat_statements/expected/merging.out
index 7711572e0b7..1bb75a93045 100644
--- a/contrib/pg_stat_statements/expected/merging.out
+++ b/contrib/pg_stat_statements/expected/merging.out
@@ -54,11 +54,11 @@ SELECT * FROM test_merge WHERE id IN (1, 2, 3);
 (0 rows)
 
 SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
-                   query                    | calls 
---------------------------------------------+-------
- SELECT * FROM test_merge WHERE id IN ($1)  |     1
- SELECT * FROM test_merge WHERE id IN (...) |     1
- SELECT pg_stat_statements_reset()          |     1
+                          query                           | calls 
+----------------------------------------------------------+-------
+ SELECT * FROM test_merge WHERE id IN ($1)                |     1
+ SELECT * FROM test_merge WHERE id IN (... [1-9 entries]) |     1
+ SELECT pg_stat_statements_reset()                        |     1
 (3 rows)
 
 SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9);
@@ -80,7 +80,60 @@ SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
                                  query                                  | calls 
 ------------------------------------------------------------------------+-------
  SELECT * FROM test_merge WHERE id IN ($1)                              |     1
- SELECT * FROM test_merge WHERE id IN (...)                             |     4
+ SELECT * FROM test_merge WHERE id IN (... [1-9 entries])               |     2
+ SELECT * FROM test_merge WHERE id IN (... [10-99 entries])             |     2
+ SELECT pg_stat_statements_reset()                                      |     1
+ SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C" |     1
+(5 rows)
+
+-- Second order of magnitude, brace yourself
+SELECT pg_stat_statements_reset();
+ pg_stat_statements_reset 
+--------------------------
+ 
+(1 row)
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110);
+ id | data 
+----+------
+(0 rows)
+
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+                            query                             | calls 
+--------------------------------------------------------------+-------
+ SELECT * FROM test_merge WHERE id IN (... [100-999 entries]) |     1
+ SELECT pg_stat_statements_reset()                            |     1
+(2 rows)
+
+-- With gaps on the threshold
+SELECT pg_stat_statements_reset();
+ pg_stat_statements_reset 
+--------------------------
+ 
+(1 row)
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4);
+ id | data 
+----+------
+(0 rows)
+
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+                          query                           | calls 
+----------------------------------------------------------+-------
+ SELECT * FROM test_merge WHERE id IN (... [1-9 entries]) |     1
+ SELECT pg_stat_statements_reset()                        |     1
+(2 rows)
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ id | data 
+----+------
+(0 rows)
+
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+                                 query                                  | calls 
+------------------------------------------------------------------------+-------
+ SELECT * FROM test_merge WHERE id IN (... [1-9 entries])               |     1
+ SELECT * FROM test_merge WHERE id IN (... [10-99 entries])             |     1
  SELECT pg_stat_statements_reset()                                      |     1
  SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C" |     1
 (4 rows)
@@ -108,11 +161,12 @@ SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11) and dat
 (0 rows)
 
 SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
-                          query                           | calls 
-----------------------------------------------------------+-------
- SELECT * FROM test_merge WHERE id IN (...) and data = $3 |     3
- SELECT pg_stat_statements_reset()                        |     1
-(2 rows)
+                                  query                                   | calls 
+--------------------------------------------------------------------------+-------
+ SELECT * FROM test_merge WHERE id IN (... [1-9 entries]) and data = $3   |     1
+ SELECT * FROM test_merge WHERE id IN (... [10-99 entries]) and data = $3 |     2
+ SELECT pg_stat_statements_reset()                                        |     1
+(3 rows)
 
 -- No constants simplification
 SELECT pg_stat_statements_reset();
@@ -147,10 +201,10 @@ SELECT * FROM test_merge_numeric WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)
 (0 rows)
 
 SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
-                       query                        | calls 
-----------------------------------------------------+-------
- SELECT * FROM test_merge_numeric WHERE id IN (...) |     1
- SELECT pg_stat_statements_reset()                  |     1
+                               query                                | calls 
+--------------------------------------------------------------------+-------
+ SELECT * FROM test_merge_numeric WHERE id IN (... [10-99 entries]) |     1
+ SELECT pg_stat_statements_reset()                                  |     1
 (2 rows)
 
 -- Test constants evaluation, verifies a tricky part to make sure there are no
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index 0b03009a053..24d7401257d 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -2697,6 +2697,8 @@ generate_normalized_query(JumbleState *jstate, const char *query,
 				last_tok_len = 0;	/* Length (in bytes) of that tok */
 	bool		skip = false; 	/* Signals that certain constants are
 								   merged together and have to be skipped */
+	int 		magnitude; 		/* Order of magnitute for number of merged
+								   constants */
 
 
 	/*
@@ -2737,7 +2739,8 @@ generate_normalized_query(JumbleState *jstate, const char *query,
 		Assert(len_to_wrt >= 0);
 
 		/* Normal path, non merged constant */
-		if (!jstate->clocations[i].merged)
+		magnitude = jstate->clocations[i].magnitude;
+		if (magnitude == 0)
 		{
 			memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
 			n_quer_loc += len_to_wrt;
@@ -2753,12 +2756,22 @@ generate_normalized_query(JumbleState *jstate, const char *query,
 		/* The firsts merged constant */
 		else if (!skip)
 		{
+			static const uint32 powers_of_ten[] = {
+				1, 10, 100,
+				1000, 10000, 100000,
+				1000000, 10000000, 100000000,
+				1000000000
+			};
+			int lower_merged = powers_of_ten[magnitude - 1];
+			int upper_merged = powers_of_ten[magnitude];
+
 			memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt);
 			n_quer_loc += len_to_wrt;
 
 			/* Skip the following until a non merged constant appear */
 			skip = true;
-			n_quer_loc += sprintf(norm_query + n_quer_loc, "...");
+			n_quer_loc += sprintf(norm_query + n_quer_loc, "... [%d-%d entries]",
+								  lower_merged, upper_merged - 1);
 		}
 		/* Otherwise the constant is merged away */
 
diff --git a/contrib/pg_stat_statements/sql/merging.sql b/contrib/pg_stat_statements/sql/merging.sql
index 02266a92ce8..75960159a1d 100644
--- a/contrib/pg_stat_statements/sql/merging.sql
+++ b/contrib/pg_stat_statements/sql/merging.sql
@@ -27,6 +27,19 @@ SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
 SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11);
 SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
 
+-- Second order of magnitude, brace yourself
+SELECT pg_stat_statements_reset();
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110);
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+
+-- With gaps on the threshold
+SELECT pg_stat_statements_reset();
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4);
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+
+SELECT * FROM test_merge WHERE id IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+SELECT query, calls FROM pg_stat_statements ORDER BY query COLLATE "C";
+
 -- More conditions in the query
 SELECT pg_stat_statements_reset();
 
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 55b0795bce1..4fd997284c9 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -8205,15 +8205,16 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
         with an array of different lenght.
 
         If this parameter is on, an array of constants will contribute only the
-        first and the last elements to the query identifier. It means two
-        occurences of the same query, where the only difference is number of
-        constants in the array, are going to get the same query identifier.
+        first element, the last element and the number of elements to the query
+        identifier. It means two occurences of the same query, where the only
+        difference is number of constants in the array, are going to get the
+        same query identifier if the arrays are of similar length.
 
         The parameter could be used to reduce amount of repeating data stored
         via <xref linkend="pgstatstatements"/> extension.  The default value is off.
 
         The <xref linkend="pgstatstatements"/> extension will represent such
-        queries in form <literal>'(...)'</literal>.
+        queries in form <literal>'(... [10-99 entries])'</literal>.
        </para>
       </listitem>
      </varlistentry>
diff --git a/doc/src/sgml/pgstatstatements.sgml b/doc/src/sgml/pgstatstatements.sgml
index 8df4064cbf4..1f6e6a0e76b 100644
--- a/doc/src/sgml/pgstatstatements.sgml
+++ b/doc/src/sgml/pgstatstatements.sgml
@@ -559,7 +559,7 @@
 =# SELECT * FROM test WHERE a IN (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12);
 =# SELECT query, calls FROM pg_stat_statements;
 -[ RECORD 1 ]------------------------------
-query | SELECT * FROM test WHERE a IN (...)
+query | SELECT * FROM test WHERE a IN (... [10-99 entries])
 calls | 2
 -[ RECORD 2 ]------------------------------
 query | SELECT pg_stat_statements_reset()
diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c
index 52d45c2b093..b198fe2579e 100644
--- a/src/backend/nodes/queryjumblefuncs.c
+++ b/src/backend/nodes/queryjumblefuncs.c
@@ -37,6 +37,8 @@
 #include "nodes/queryjumble.h"
 #include "parser/scansup.h"
 
+#include "utils/numutils.h"
+
 #define JUMBLE_SIZE				1024	/* query serialization buffer size */
 
 /* GUC parameters */
@@ -51,7 +53,7 @@ bool		query_id_enabled = false;
 static void AppendJumble(JumbleState *jstate,
 						 const unsigned char *item, Size size);
 static void RecordConstLocation(JumbleState *jstate,
-								int location, bool merged);
+								int location, int magnitude);
 static void _jumbleNode(JumbleState *jstate, Node *node);
 static void _jumbleA_Const(JumbleState *jstate, Node *node);
 static void _jumbleList(JumbleState *jstate, Node *node);
@@ -193,12 +195,15 @@ AppendJumble(JumbleState *jstate, const unsigned char *item, Size size)
  * Record location of constant within query string of query tree that is
  * currently being walked.
  *
- * Merged argument signals that the constant represents the first or the last
- * element in a series of merged constants, and everything but the first/last
- * element contributes nothing to the jumble hash.
+ * Magnitude argument larger than zero signals that the constant represents the
+ * first or the last element in a series of merged constants, and everything
+ * but such first/last element will contribute nothing to the jumble hash. The
+ * magnitute value specifies order of magnitute (i.e. how many digits it has)
+ * for the number of elements in the series, to represent the fact of merging
+ * later on.
  */
 static void
-RecordConstLocation(JumbleState *jstate, int location, bool merged)
+RecordConstLocation(JumbleState *jstate, int location, int magnitude)
 {
 	/* -1 indicates unknown or undefined location */
 	if (location >= 0)
@@ -214,7 +219,7 @@ RecordConstLocation(JumbleState *jstate, int location, bool merged)
 		}
 		jstate->clocations[jstate->clocations_count].location = location;
 		/* initialize lengths to -1 to simplify third-party module usage */
-		jstate->clocations[jstate->clocations_count].merged = merged;
+		jstate->clocations[jstate->clocations_count].magnitude = magnitude;
 		jstate->clocations[jstate->clocations_count].length = -1;
 		jstate->clocations_count++;
 	}
@@ -224,24 +229,26 @@ RecordConstLocation(JumbleState *jstate, int location, bool merged)
  * Verify if the provided list contains could be merged down, which means it
  * contains only constant expressions.
  *
- * Return value indicates if merging is possible.
+ * Return value is the order of magnitude (i.e. how many digits it has) for
+ * length of the list (to use for representation purposes later on) if merging
+ * is possible, otherwise zero.
  *
  * Note that this function searches only for explicit Const nodes and does not
  * try to simplify expressions.
  */
-static bool
+static int
 IsMergeableConstList(List *elements, Const **firstConst, Const **lastConst)
 {
 	ListCell   *temp;
 	Node	   *firstExpr = NULL;
 
 	if (elements == NULL)
-		return false;
+		return 0;
 
 	if (!query_id_const_merge)
 	{
 		/* Merging is disabled, process everything one by one */
-		return false;
+		return 0;
 	}
 
 	firstExpr = linitial(elements);
@@ -255,24 +262,24 @@ IsMergeableConstList(List *elements, Const **firstConst, Const **lastConst)
 	{
 		foreach(temp, elements)
 			if (!IsA(lfirst(temp), Const))
-				return false;
+				return 0;
 
 		*firstConst = (Const *) firstExpr;
 		*lastConst = llast_node(Const, elements);
-		return true;
+		return decimalLength32(elements->length);
 	}
 
 	/*
 	 * If we end up here, it means no constants merging is possible, process
 	 * the list as usual.
 	 */
-	return false;
+	return 0;
 }
 
 #define JUMBLE_NODE(item) \
 	_jumbleNode(jstate, (Node *) expr->item)
-#define JUMBLE_LOCATION(location, merged) \
-	RecordConstLocation(jstate, expr->location, merged)
+#define JUMBLE_LOCATION(location, magnitude) \
+	RecordConstLocation(jstate, expr->location, magnitude)
 #define JUMBLE_FIELD(item) \
 	AppendJumble(jstate, (const unsigned char *) &(expr->item), sizeof(expr->item))
 #define JUMBLE_FIELD_SINGLE(item) \
@@ -290,10 +297,19 @@ _jumbleArrayExpr(JumbleState *jstate, Node *node)
 {
 	ArrayExpr *expr = (ArrayExpr *) node;
 	Const *first, *last;
-	if(IsMergeableConstList(expr->elements, &first, &last))
+	int magnitude = IsMergeableConstList(expr->elements, &first, &last);
+
+	if (magnitude)
 	{
-		RecordConstLocation(jstate, first->location, true);
-		RecordConstLocation(jstate, last->location, true);
+		RecordConstLocation(jstate, first->location, magnitude);
+		RecordConstLocation(jstate, last->location, magnitude);
+
+		/*
+		 * After merging constants down we end up with only two constants, the
+		 * first and the last one. To distinguish the order of magnitute behind
+		 * merged constants, add its value into the jumble.
+		 */
+		JUMBLE_FIELD_SINGLE(magnitude);
 	}
 	else
 	{
diff --git a/src/include/nodes/queryjumble.h b/src/include/nodes/queryjumble.h
index 225957d7b3c..cfa99efc14e 100644
--- a/src/include/nodes/queryjumble.h
+++ b/src/include/nodes/queryjumble.h
@@ -27,9 +27,12 @@ typedef struct LocationLen
 
 	/*
 	 * Indicates the constant represents the beginning or the end of a merged
-	 * constants interval.
+	 * constants interval. The value shows how many constants were merged away
+	 * (up to a power of 10), or in other words the order of manitude for
+	 * number of merged constants (i.e. how many digits it has). Otherwise the
+	 * value is 0, indicating that no merging was performed.
 	 */
-	bool		merged;
+	int			magnitude;
 } LocationLen;
 
 /*
-- 
2.41.0

