From 956c0a456a149f0e687d118b8c6ddad8e42df828 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 24 Oct 2025 20:42:30 +0000 Subject: [PATCH v4 2/2] pg_stat_statements: move out jumble-specific routines During discussion of commit 8eddb06, it was noted that the fill_in_constant_lengths and generate_normalized_query routines depend on intricate jumbling knowledge. Implementing this code within pg_stat_statements does not make sense; it belongs in the core jumbling module, queryjumblefuncs.c. pg_stat_statements can then consume these routines without needing to know the details. This separation became especially apparent with the squashing of constant lists, but it has been a general issue all along. In addition to reducing the pg_stat_statements codebase, this change makes these routines available to other extensions performing similar operations. Discussion: https://www.postgresql.org/message-id/CAA5RZ0t6rynOzvABTbHZPF9ydKoay5LN%2B_iM3hHk0ni_Qu_9tg%40mail.gmail.com --- .../pg_stat_statements/pg_stat_statements.c | 274 +----------------- src/backend/nodes/queryjumblefuncs.c | 258 +++++++++++++++++ src/include/nodes/queryjumble.h | 4 + 3 files changed, 269 insertions(+), 267 deletions(-) diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 1d22dc07da9..95293d3b594 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -50,7 +50,6 @@ #include "access/htup_details.h" #include "access/parallel.h" #include "catalog/pg_authid.h" -#include "common/int.h" #include "executor/instrument.h" #include "funcapi.h" #include "jit/jit.h" @@ -59,7 +58,6 @@ #include "nodes/queryjumble.h" #include "optimizer/planner.h" #include "parser/analyze.h" -#include "parser/scanner.h" #include "pgstat.h" #include "storage/fd.h" #include "storage/ipc.h" @@ -377,12 +375,6 @@ static char *qtext_fetch(Size query_offset, int query_len, static bool need_gc_qtexts(void); static void gc_qtexts(void); static TimestampTz entry_reset(Oid userid, Oid dbid, int64 queryid, bool minmax_only); -static char *generate_normalized_query(JumbleState *jstate, const char *query, - int query_loc, int *query_len_p); -static void fill_in_constant_lengths(JumbleState *jstate, const char *query, - int query_loc); -static int comp_location(const void *a, const void *b); - /* * Module load callback @@ -1359,6 +1351,13 @@ pgss_store(const char *query, int64 queryId, if (jstate) { LWLockRelease(pgss->lock); + + /* + * generate the normalized query. Note that the normalized + * representation may well vary depending on just which + * "equivalent" query is used to create the hashtable entry. We + * assume this is OK. + */ norm_query = generate_normalized_query(jstate, query, query_location, &query_len); @@ -2823,262 +2822,3 @@ release_lock: return stats_reset; } - -/* - * Generate a normalized version of the query string that will be used to - * represent all similar queries. - * - * Note that the normalized representation may well vary depending on - * just which "equivalent" query is used to create the hashtable entry. - * We assume this is OK. - * - * If query_loc > 0, then "query" has been advanced by that much compared to - * the original string start, so we need to translate the provided locations - * to compensate. (This lets us avoid re-scanning statements before the one - * of interest, so it's worth doing.) - * - * *query_len_p contains the input string length, and is updated with - * the result string length on exit. The resulting string might be longer - * or shorter depending on what happens with replacement of constants. - * - * Returns a palloc'd string. - */ -static char * -generate_normalized_query(JumbleState *jstate, const char *query, - int query_loc, int *query_len_p) -{ - char *norm_query; - int query_len = *query_len_p; - int norm_query_buflen, /* Space allowed for norm_query */ - len_to_wrt, /* Length (in bytes) to write */ - quer_loc = 0, /* Source query byte location */ - n_quer_loc = 0, /* Normalized query byte location */ - last_off = 0, /* Offset from start for previous tok */ - last_tok_len = 0; /* Length (in bytes) of that tok */ - int num_constants_replaced = 0; - - /* - * Get constants' lengths (core system only gives us locations). Note - * this also ensures the items are sorted by location. - */ - fill_in_constant_lengths(jstate, query, query_loc); - - /* - * Allow for $n symbols to be longer than the constants they replace. - * Constants must take at least one byte in text form, while a $n symbol - * certainly isn't more than 11 bytes, even if n reaches INT_MAX. We - * could refine that limit based on the max value of n for the current - * query, but it hardly seems worth any extra effort to do so. - */ - norm_query_buflen = query_len + jstate->clocations_count * 10; - - /* Allocate result buffer */ - norm_query = palloc(norm_query_buflen + 1); - - for (int i = 0; i < jstate->clocations_count; i++) - { - int off, /* Offset from start for cur tok */ - tok_len; /* Length (in bytes) of that tok */ - - /* - * If we have an external param at this location, but no lists are - * being squashed across the query, then we skip here; this will make - * us print the characters found in the original query that represent - * the parameter in the next iteration (or after the loop is done), - * which is a bit odd but seems to work okay in most cases. - */ - if (jstate->clocations[i].extern_param && !jstate->has_squashed_lists) - continue; - - off = jstate->clocations[i].location; - - /* Adjust recorded location if we're dealing with partial string */ - off -= query_loc; - - tok_len = jstate->clocations[i].length; - - if (tok_len < 0) - continue; /* ignore any duplicates */ - - /* Copy next chunk (what precedes the next constant) */ - len_to_wrt = off - last_off; - len_to_wrt -= last_tok_len; - Assert(len_to_wrt >= 0); - memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt); - n_quer_loc += len_to_wrt; - - /* - * And insert a param symbol in place of the constant token; and, if - * we have a squashable list, insert a placeholder comment starting - * from the list's second value. - */ - n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d%s", - num_constants_replaced + 1 + jstate->highest_extern_param_id, - jstate->clocations[i].squashed ? " /*, ... */" : ""); - num_constants_replaced++; - - /* move forward */ - quer_loc = off + tok_len; - last_off = off; - last_tok_len = tok_len; - } - - /* - * We've copied up until the last ignorable constant. Copy over the - * remaining bytes of the original query string. - */ - len_to_wrt = query_len - quer_loc; - - Assert(len_to_wrt >= 0); - memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt); - n_quer_loc += len_to_wrt; - - Assert(n_quer_loc <= norm_query_buflen); - norm_query[n_quer_loc] = '\0'; - - *query_len_p = n_quer_loc; - return norm_query; -} - -/* - * Given a valid SQL string and an array of constant-location records, - * fill in the textual lengths of those constants. - * - * The constants may use any allowed constant syntax, such as float literals, - * bit-strings, single-quoted strings and dollar-quoted strings. This is - * accomplished by using the public API for the core scanner. - * - * It is the caller's job to ensure that the string is a valid SQL statement - * with constants at the indicated locations. Since in practice the string - * has already been parsed, and the locations that the caller provides will - * have originated from within the authoritative parser, this should not be - * a problem. - * - * Duplicate constant pointers are possible, and will have their lengths - * marked as '-1', so that they are later ignored. - * - * If query_loc > 0, then "query" has been advanced by that much compared to - * the original string start, so we need to translate the provided locations - * to compensate. (This lets us avoid re-scanning statements before the one - * of interest, so it's worth doing.) - * - * N.B. There is an assumption that a '-' character at a Const location begins - * a negative numeric constant. This precludes there ever being another - * reason for a constant to start with a '-'. - */ -static void -fill_in_constant_lengths(JumbleState *jstate, const char *query, - int query_loc) -{ - LocationLen *locs; - core_yyscan_t yyscanner; - core_yy_extra_type yyextra; - core_YYSTYPE yylval; - YYLTYPE yylloc; - int last_loc = -1; - int i; - - /* - * Sort the records by location so that we can process them in order while - * scanning the query text. - */ - if (jstate->clocations_count > 1) - qsort(jstate->clocations, jstate->clocations_count, - sizeof(LocationLen), comp_location); - locs = jstate->clocations; - - /* initialize the flex scanner --- should match raw_parser() */ - yyscanner = scanner_init(query, - &yyextra, - &ScanKeywords, - ScanKeywordTokens); - - /* we don't want to re-emit any escape string warnings */ - yyextra.escape_string_warning = false; - - /* Search for each constant, in sequence */ - for (i = 0; i < jstate->clocations_count; i++) - { - int loc = locs[i].location; - int tok; - bool squashed = locs[i].squashed; - - /* Adjust recorded location if we're dealing with partial string */ - loc -= query_loc; - - Assert(loc >= 0); - - if (loc <= last_loc) - { - locs[i].length = -1; - continue; /* Duplicate constant, ignore */ - } - - /* We have a valid location, so let's save it */ - last_loc = loc; - - if (squashed) - continue; /* squashable list, ignore */ - - /* Lex tokens until we find the desired constant */ - for (;;) - { - tok = core_yylex(&yylval, &yylloc, yyscanner); - - /* We should not hit end-of-string, but if we do, behave sanely */ - if (tok == 0) - break; /* out of inner for-loop */ - - /* - * We should find the token position exactly, but if we somehow - * run past it, work with that. - */ - if (yylloc >= loc) - { - if (query[loc] == '-') - { - /* - * It's a negative value - this is the one and only case - * where we replace more than a single token. - * - * Do not compensate for the core system's special-case - * adjustment of location to that of the leading '-' - * operator in the event of a negative constant. It is - * also useful for our purposes to start from the minus - * symbol. In this way, queries like "select * from foo - * where bar = 1" and "select * from foo where bar = -2" - * will have identical normalized query strings. - */ - tok = core_yylex(&yylval, &yylloc, yyscanner); - if (tok == 0) - break; /* out of inner for-loop */ - } - - /* - * We now rely on the assumption that flex has placed a zero - * byte after the text of the current token in scanbuf. - */ - locs[i].length = strlen(yyextra.scanbuf + loc); - break; /* out of inner for-loop */ - } - } - - /* If we hit end-of-string, give up, leaving remaining lengths -1 */ - if (tok == 0) - break; - } - - scanner_finish(yyscanner); -} - -/* - * comp_location: comparator for qsorting LocationLen structs by location - */ -static int -comp_location(const void *a, const void *b) -{ - int l = ((const LocationLen *) a)->location; - int r = ((const LocationLen *) b)->location; - - return pg_cmp_s32(l, r); -} diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c index 31f97151977..c27f1c12ac7 100644 --- a/src/backend/nodes/queryjumblefuncs.c +++ b/src/backend/nodes/queryjumblefuncs.c @@ -40,10 +40,12 @@ #include "access/transam.h" #include "catalog/pg_proc.h" #include "common/hashfn.h" +#include "common/int.h" #include "miscadmin.h" #include "nodes/nodeFuncs.h" #include "nodes/queryjumble.h" #include "utils/lsyscache.h" +#include "parser/scanner.h" #include "parser/scansup.h" #define JUMBLE_SIZE 1024 /* query serialization buffer size */ @@ -77,6 +79,262 @@ static void _jumbleVariableSetStmt(JumbleState *jstate, Node *node); static void _jumbleRangeTblEntry_eref(JumbleState *jstate, RangeTblEntry *rte, Alias *expr); +static int comp_location(const void *a, const void *b); + +/* + * comp_location: comparator for qsorting LocationLen structs by location + */ +static int +comp_location(const void *a, const void *b) +{ + int l = ((const LocationLen *) a)->location; + int r = ((const LocationLen *) b)->location; + + return pg_cmp_s32(l, r); +} + + /* + * Generate a normalized version of the query string that will be used to + * represent all similar queries. + * + * If query_loc > 0, then "query" has been advanced by that much compared to + * the original string start, so we need to translate the provided locations + * to compensate. (This lets us avoid re-scanning statements before the one + * of interest, so it's worth doing.) + * + * *query_len_p contains the input string length, and is updated with the + * result string length on exit. The resulting string might be longer or + * shorter depending on what happens with replacement of constants. + * + * Returns a palloc'd string. + */ +char * +generate_normalized_query(JumbleState *jstate, const char *query, + int query_loc, int *query_len_p) +{ + char *norm_query; + int query_len = *query_len_p; + int norm_query_buflen, /* Space allowed for norm_query */ + len_to_wrt, /* Length (in bytes) to write */ + quer_loc = 0, /* Source query byte location */ + n_quer_loc = 0, /* Normalized query byte location */ + last_off = 0, /* Offset from start for previous tok */ + last_tok_len = 0; /* Length (in bytes) of that tok */ + int num_constants_replaced = 0; + + /* + * Get constants' lengths (core system only gives us locations). Note + * this also ensures the items are sorted by location. + */ + fill_in_constant_lengths(jstate, query, query_loc); + + /* + * Allow for $n symbols to be longer than the constants they replace. + * Constants must take at least one byte in text form, while a $n symbol + * certainly isn't more than 11 bytes, even if n reaches INT_MAX. We + * could refine that limit based on the max value of n for the current + * query, but it hardly seems worth any extra effort to do so. + */ + norm_query_buflen = query_len + jstate->clocations_count * 10; + + /* Allocate result buffer */ + norm_query = palloc(norm_query_buflen + 1); + + for (int i = 0; i < jstate->clocations_count; i++) + { + int off, /* Offset from start for cur tok */ + tok_len; /* Length (in bytes) of that tok */ + + /* + * If we have an external param at this location, but no lists are + * being squashed across the query, then we skip here; this will make + * us print the characters found in the original query that represent + * the parameter in the next iteration (or after the loop is done), + * which is a bit odd but seems to work okay in most cases. + */ + if (jstate->clocations[i].extern_param && !jstate->has_squashed_lists) + continue; + + off = jstate->clocations[i].location; + + /* Adjust recorded location if we're dealing with partial string */ + off -= query_loc; + + tok_len = jstate->clocations[i].length; + + if (tok_len < 0) + continue; /* ignore any duplicates */ + + /* Copy next chunk (what precedes the next constant) */ + len_to_wrt = off - last_off; + len_to_wrt -= last_tok_len; + Assert(len_to_wrt >= 0); + memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt); + n_quer_loc += len_to_wrt; + + /* + * And insert a param symbol in place of the constant token; and, if + * we have a squashable list, insert a placeholder comment starting + * from the list's second value. + */ + n_quer_loc += sprintf(norm_query + n_quer_loc, "$%d%s", + num_constants_replaced + 1 + jstate->highest_extern_param_id, + jstate->clocations[i].squashed ? " /*, ... */" : ""); + num_constants_replaced++; + + /* move forward */ + quer_loc = off + tok_len; + last_off = off; + last_tok_len = tok_len; + } + + /* + * We've copied up until the last ignorable constant. Copy over the + * remaining bytes of the original query string. + */ + len_to_wrt = query_len - quer_loc; + + Assert(len_to_wrt >= 0); + memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt); + n_quer_loc += len_to_wrt; + + Assert(n_quer_loc <= norm_query_buflen); + norm_query[n_quer_loc] = '\0'; + + *query_len_p = n_quer_loc; + return norm_query; +} + +/* + * Given a valid SQL string and an array of constant-location records, + * fill in the textual lengths of those constants. + * + * The constants may use any allowed constant syntax, such as float literals, + * bit-strings, single-quoted strings and dollar-quoted strings. This is + * accomplished by using the public API for the core scanner. + * + * It is the caller's job to ensure that the string is a valid SQL statement + * with constants at the indicated locations. Since in practice the string + * has already been parsed, and the locations that the caller provides will + * have originated from within the authoritative parser, this should not be + * a problem. + * + * Duplicate constant pointers are possible, and will have their lengths + * marked as '-1', so that they are later ignored. + * + * If query_loc > 0, then "query" has been advanced by that much compared to + * the original string start, so we need to translate the provided locations + * to compensate. (This lets us avoid re-scanning statements before the one + * of interest, so it's worth doing.) + * + * N.B. There is an assumption that a '-' character at a Const location begins + * a negative numeric constant. This precludes there ever being another + * reason for a constant to start with a '-'. + */ +void +fill_in_constant_lengths(JumbleState *jstate, const char *query, + int query_loc) +{ + LocationLen *locs; + core_yyscan_t yyscanner; + core_yy_extra_type yyextra; + core_YYSTYPE yylval; + YYLTYPE yylloc; + int last_loc = -1; + int i; + + /* + * Sort the records by location so that we can process them in order while + * scanning the query text. + */ + if (jstate->clocations_count > 1) + qsort(jstate->clocations, jstate->clocations_count, + sizeof(LocationLen), comp_location); + locs = jstate->clocations; + + /* initialize the flex scanner --- should match raw_parser() */ + yyscanner = scanner_init(query, + &yyextra, + &ScanKeywords, + ScanKeywordTokens); + + /* we don't want to re-emit any escape string warnings */ + yyextra.escape_string_warning = false; + + /* Search for each constant, in sequence */ + for (i = 0; i < jstate->clocations_count; i++) + { + int loc = locs[i].location; + int tok; + bool squashed = locs[i].squashed; + + /* Adjust recorded location if we're dealing with partial string */ + loc -= query_loc; + + Assert(loc >= 0); + + if (loc <= last_loc) + { + locs[i].length = -1; + continue; /* Duplicate constant, ignore */ + } + + /* We have a valid location, so let's save it */ + last_loc = loc; + + if (squashed) + continue; /* squashable list, ignore */ + + /* Lex tokens until we find the desired constant */ + for (;;) + { + tok = core_yylex(&yylval, &yylloc, yyscanner); + + /* We should not hit end-of-string, but if we do, behave sanely */ + if (tok == 0) + break; /* out of inner for-loop */ + + /* + * We should find the token position exactly, but if we somehow + * run past it, work with that. + */ + if (yylloc >= loc) + { + if (query[loc] == '-') + { + /* + * It's a negative value - this is the one and only case + * where we replace more than a single token. + * + * Do not compensate for the core system's special-case + * adjustment of location to that of the leading '-' + * operator in the event of a negative constant. It is + * also useful for our purposes to start from the minus + * symbol. In this way, queries like "select * from foo + * where bar = 1" and "select * from foo where bar = -2" + * will have identical normalized query strings. + */ + tok = core_yylex(&yylval, &yylloc, yyscanner); + if (tok == 0) + break; /* out of inner for-loop */ + } + + /* + * We now rely on the assumption that flex has placed a zero + * byte after the text of the current token in scanbuf. + */ + locs[i].length = strlen(yyextra.scanbuf + loc); + break; /* out of inner for-loop */ + } + } + + /* If we hit end-of-string, give up, leaving remaining lengths -1 */ + if (tok == 0) + break; + } + + scanner_finish(yyscanner); +} /* * Given a possibly multi-statement source string, confine our attention to the diff --git a/src/include/nodes/queryjumble.h b/src/include/nodes/queryjumble.h index dcb36dcb44f..8024a0883f7 100644 --- a/src/include/nodes/queryjumble.h +++ b/src/include/nodes/queryjumble.h @@ -91,6 +91,10 @@ extern PGDLLIMPORT int compute_query_id; extern const char *CleanQuerytext(const char *query, int *location, int *len); +extern char *generate_normalized_query(JumbleState *jstate, const char *query, + int query_loc, int *query_len_p); +extern void fill_in_constant_lengths(JumbleState *jstate, const char *query, + int query_loc); extern JumbleState *JumbleQuery(Query *query); extern void EnableQueryId(void); -- 2.43.0