From 7573859b6f66b4ed370725f33077361c1cb81cb7 Mon Sep 17 00:00:00 2001 From: Jacob Champion Date: Mon, 8 Apr 2024 15:31:17 -0700 Subject: [PATCH v2] json_lex_string: don't overread on bad UTF8 Inputs to pg_parse_json[_incremental] are not guaranteed to be null-terminated, so pg_encoding_mblen_bounded (which uses strnlen) can walk off the end of the buffer. Check against the end pointer instead. pg_encoding_mblen_bounded() no longer has any callers and has been removed. TODO: - Do we really want to print incomplete UTF-8 sequences as-is once we know they're bad? --- src/common/jsonapi.c | 4 ++-- src/common/wchar.c | 13 +------------ src/include/mb/pg_wchar.h | 1 - src/test/modules/test_json_parser/t/002_inline.pl | 8 ++++++++ 4 files changed, 11 insertions(+), 15 deletions(-) diff --git a/src/common/jsonapi.c b/src/common/jsonapi.c index fc0cb36974..26e1f43ed3 100644 --- a/src/common/jsonapi.c +++ b/src/common/jsonapi.c @@ -1689,8 +1689,8 @@ json_lex_string(JsonLexContext *lex) } while (0) #define FAIL_AT_CHAR_END(code) \ do { \ - lex->token_terminator = \ - s + pg_encoding_mblen_bounded(lex->input_encoding, s); \ + char *term = s + pg_encoding_mblen(lex->input_encoding, s); \ + lex->token_terminator = (term <= end) ? term : end; \ return code; \ } while (0) diff --git a/src/common/wchar.c b/src/common/wchar.c index 76b7dfdfcb..97e9b61dba 100644 --- a/src/common/wchar.c +++ b/src/common/wchar.c @@ -2062,8 +2062,7 @@ const pg_wchar_tbl pg_wchar_table[] = { * * Caution: when dealing with text that is not certainly valid in the * specified encoding, the result may exceed the actual remaining - * string length. Callers that are not prepared to deal with that - * should use pg_encoding_mblen_bounded() instead. + * string length. 
*/ int pg_encoding_mblen(int encoding, const char *mbstr) @@ -2073,16 +2072,6 @@ pg_encoding_mblen(int encoding, const char *mbstr) pg_wchar_table[PG_SQL_ASCII].mblen((const unsigned char *) mbstr)); } -/* - * Returns the byte length of a multibyte character; but not more than - * the distance to end of string. - */ -int -pg_encoding_mblen_bounded(int encoding, const char *mbstr) -{ - return strnlen(mbstr, pg_encoding_mblen(encoding, mbstr)); -} - /* * Returns the display length of a multibyte character. */ diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 249cd18a35..ac65bfcbef 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -663,7 +663,6 @@ extern int pg_valid_server_encoding_id(int encoding); * earlier in this file are also available from libpgcommon. */ extern int pg_encoding_mblen(int encoding, const char *mbstr); -extern int pg_encoding_mblen_bounded(int encoding, const char *mbstr); extern int pg_encoding_dsplen(int encoding, const char *mbstr); extern int pg_encoding_verifymbchar(int encoding, const char *mbstr, int len); extern int pg_encoding_verifymbstr(int encoding, const char *mbstr, int len); diff --git a/src/test/modules/test_json_parser/t/002_inline.pl b/src/test/modules/test_json_parser/t/002_inline.pl index f83cec03f8..60bb930e92 100644 --- a/src/test/modules/test_json_parser/t/002_inline.pl +++ b/src/test/modules/test_json_parser/t/002_inline.pl @@ -128,5 +128,13 @@ test( "incorrect escape count", '"\\\\\\\\\\\\\\"', error => qr/Token ""\\\\\\\\\\\\\\"" is invalid/); +test( + "incomplete UTF-8 sequence", + # Three bytes: double-quote, backslash, 0xF5 + "\"\\\x{F5}", + # Both invalid-token and invalid-escape are possible, because for smaller + # chunk sizes the incremental parser will skip the string parsing when it + # can't find an ending quote. + error => qr/(Token|Escape sequence) ""?\\\x{F5}" is invalid/); done_testing(); -- 2.34.1