From 36e226c368d2eb37c41124f52ef819bc626fd5a8 Mon Sep 17 00:00:00 2001
From: David Rowley
Date: Thu, 23 May 2024 10:53:23 +1200
Subject: [PATCH v2 2/3] Use SIMD processing for escape_json()

---
 src/backend/utils/adt/json.c       | 72 +++++++++++++++++++++++++++++-
 src/test/regress/expected/json.out | 44 ++++++++++++++++++
 src/test/regress/sql/json.sql      |  8 ++++
 3 files changed, 122 insertions(+), 2 deletions(-)

diff --git a/src/backend/utils/adt/json.c b/src/backend/utils/adt/json.c
index 7934cf62fb..a266f60ff3 100644
--- a/src/backend/utils/adt/json.c
+++ b/src/backend/utils/adt/json.c
@@ -19,6 +19,7 @@
 #include "funcapi.h"
 #include "libpq/pqformat.h"
 #include "miscadmin.h"
+#include "port/simd.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
 #include "utils/date.h"
@@ -1597,11 +1598,78 @@ escape_json_cstring(StringInfo buf, const char *str)
 void
 escape_json(StringInfo buf, const char *str, int len)
 {
+	int			i = 0;
+	int			copypos = 0;
+	int			vlen;
+
+	Assert(len >= 0);
+
+	/*
+	 * Figure out how many bytes to process using SIMD.  Round 'len' down to
+	 * the previous multiple of sizeof(Vector8), assuming that's a power-of-2.
+	 * This is invariant for the whole string, so compute it only once.
+	 */
+	vlen = len & (int) (~(sizeof(Vector8) - 1));
+
 	appendStringInfoCharMacro(buf, '"');
 
-	for (int i = 0; i < len; i++)
-		escape_json_char(buf, str[i]);
+	for (;;)
+	{
+		Vector8		chunk;
+
+		/*
+		 * To speed this up try searching sizeof(Vector8) bytes at once for
+		 * special characters that we need to escape.  When we find one, we
+		 * fall out of this first loop and copy the parts we've vector
+		 * searched before processing the special-char vector byte-by-byte.
+		 * Once we're done with that, come back and try doing vector searching
+		 * again.  We'll also process the tail end of the string byte-by-byte.
+		 */
+		for (; i < vlen; i += sizeof(Vector8))
+		{
+			vector8_load(&chunk, (const uint8 *) &str[i]);
+
+			/*
+			 * Break on anything less than ' ' or if we find a '"' or '\\'.
+			 * Those need special handling.  That's done in the per-byte loop.
+			 */
+			if (vector8_has_le(chunk, (unsigned char) 0x1F) ||
+				vector8_has(chunk, (unsigned char) '"') ||
+				vector8_has(chunk, (unsigned char) '\\'))
+				break;
+		}
+
+		/*
+		 * Write to the destination up to the point that we've vector
+		 * searched so far.  Do this only when switching into per-byte mode
+		 * rather than once every sizeof(Vector8) bytes.
+		 */
+		if (copypos < i)
+		{
+			appendBinaryStringInfo(buf, &str[copypos], i - copypos);
+			copypos = i;
+		}
+
+		/*
+		 * Per-byte loop for Vector8s containing special chars and for
+		 * processing the tail of the string.
+		 */
+		for (int b = 0; b < sizeof(Vector8); b++)
+		{
+			/* check if we've finished */
+			if (i == len)
+				goto done;
+
+			Assert(i < len);
+
+			escape_json_char(buf, str[i++]);
+		}
+
+		copypos = i;
+		/* We're not done yet.  Try the SIMD search again */
+	}
 
+done:
 	appendStringInfoCharMacro(buf, '"');
 }
 
diff --git a/src/test/regress/expected/json.out b/src/test/regress/expected/json.out
index aa29bc597b..bfcc26c531 100644
--- a/src/test/regress/expected/json.out
+++ b/src/test/regress/expected/json.out
@@ -55,6 +55,50 @@ SELECT ('"'||repeat('.', 12)||'abc\n"')::json; -- OK, legal escapes
  "............abc\n"
 (1 row)
 
+-- Stress testing of JSON escape code
+CREATE TABLE json_escape (very_long_column_name_to_test_json_escape text);
+INSERT INTO json_escape SELECT repeat('a', a) FROM generate_series(0,33) a;
+-- Test various lengths of strings to validate SIMD processing to escape
+-- special chars in the JSON.
+SELECT row_to_json(j)::jsonb FROM json_escape j; + row_to_json +------------------------------------------------------------------------------------ + {"very_long_column_name_to_test_json_escape": ""} + {"very_long_column_name_to_test_json_escape": "a"} + {"very_long_column_name_to_test_json_escape": "aa"} + {"very_long_column_name_to_test_json_escape": "aaa"} + {"very_long_column_name_to_test_json_escape": "aaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaaaaaa"} + 
{"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"} + {"very_long_column_name_to_test_json_escape": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"} +(34 rows) + -- see json_encoding test for input with unicode escapes -- Numbers. SELECT '1'::json; -- OK diff --git a/src/test/regress/sql/json.sql b/src/test/regress/sql/json.sql index ec57dfe707..0e7ca2f5af 100644 --- a/src/test/regress/sql/json.sql +++ b/src/test/regress/sql/json.sql @@ -12,6 +12,14 @@ SELECT '"\v"'::json; -- ERROR, not a valid JSON escape SELECT ('"'||repeat('.', 12)||'abc"')::json; -- OK SELECT ('"'||repeat('.', 12)||'abc\n"')::json; -- OK, legal escapes +-- Stress testing of JSON escape code +CREATE TABLE json_escape (very_long_column_name_to_test_json_escape text); +INSERT INTO json_escape SELECT repeat('a', a) FROM generate_series(0,33) a; + +-- Test various lengths of strings to validate SIMD processing to escape +-- special chars in the JSON. +SELECT row_to_json(j)::jsonb FROM json_escape j; + -- see json_encoding test for input with unicode escapes -- Numbers. -- 2.34.1