Small patch to improve safety of utf8_to_unicode().

Started by Jeff Davisabout 1 month ago5 messages
#1Jeff Davis
pgsql@j-davis.com
1 attachment(s)

Attached.

Attachments:

v1-0001-Make-utf8_to_unicode-safer.patchtext/x-patch; charset=UTF-8; name=v1-0001-Make-utf8_to_unicode-safer.patchDownload
From 4777d1af75bc9e570e7db954b6642916fa1420bc Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Thu, 4 Dec 2025 19:37:32 -0800
Subject: [PATCH v1] Make utf8_to_unicode() safer.

---
 contrib/fuzzystrmatch/daitch_mokotoff.c   |  3 ++-
 src/backend/utils/adt/pg_locale_builtin.c |  5 +++--
 src/backend/utils/adt/varlena.c           | 26 +++++++++++------------
 src/common/saslprep.c                     |  4 +++-
 src/common/unicode/case_test.c            |  5 +++--
 src/common/unicode_case.c                 | 12 +++++++----
 src/common/wchar.c                        |  3 ++-
 src/include/mb/pg_wchar.h                 | 14 +++++-------
 8 files changed, 38 insertions(+), 34 deletions(-)

diff --git a/contrib/fuzzystrmatch/daitch_mokotoff.c b/contrib/fuzzystrmatch/daitch_mokotoff.c
index 07f895ae2bf..47bd2814460 100644
--- a/contrib/fuzzystrmatch/daitch_mokotoff.c
+++ b/contrib/fuzzystrmatch/daitch_mokotoff.c
@@ -401,7 +401,8 @@ read_char(const unsigned char *str, int *ix)
 
 	/* Decode UTF-8 character to ISO 10646 code point. */
 	str += *ix;
-	c = utf8_to_unicode(str);
+	/* Assume byte sequence has not been broken. */
+	c = utf8_to_unicode(str, MAX_MULTIBYTE_CHAR_LEN);
 
 	/* Advance *ix, but (for safety) not if we've reached end of string. */
 	if (c)
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 0d4c754a267..5c1358e5347 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -59,12 +59,13 @@ static size_t
 initcap_wbnext(void *state)
 {
 	struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
+	unsigned char *str = (unsigned char *) wbstate->str;
 
 	while (wbstate->offset < wbstate->len &&
 		   wbstate->str[wbstate->offset] != '\0')
 	{
-		char32_t	u = utf8_to_unicode((unsigned char *) wbstate->str +
-										wbstate->offset);
+		char32_t	u = utf8_to_unicode(str + wbstate->offset,
+										wbstate->len - wbstate->offset);
 		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix);
 
 		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index baa5b44ea8d..d684370900d 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -5423,19 +5423,17 @@ Datum
 unicode_assigned(PG_FUNCTION_ARGS)
 {
 	text	   *input = PG_GETARG_TEXT_PP(0);
-	unsigned char *p;
-	int			size;
+	unsigned char *p = (unsigned char *) VARDATA_ANY(input);
+	unsigned char *p_end = p + VARSIZE_ANY_EXHDR(input);
 
 	if (GetDatabaseEncoding() != PG_UTF8)
 		ereport(ERROR,
 				(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
 
 	/* convert to char32_t */
-	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
-	p = (unsigned char *) VARDATA_ANY(input);
-	for (int i = 0; i < size; i++)
+	while (p < p_end)
 	{
-		char32_t	uchar = utf8_to_unicode(p);
+		char32_t	uchar = utf8_to_unicode(p, p_end - p);
 		int			category = unicode_category(uchar);
 
 		if (category == PG_U_UNASSIGNED)
@@ -5456,7 +5454,8 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
 	int			size;
 	char32_t   *input_chars;
 	char32_t   *output_chars;
-	unsigned char *p;
+	unsigned char *p = (unsigned char *) VARDATA_ANY(input);
+	unsigned char *p_end = p + VARSIZE_ANY_EXHDR(input);
 	text	   *result;
 	int			i;
 
@@ -5465,14 +5464,13 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
 	/* convert to char32_t */
 	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
 	input_chars = palloc((size + 1) * sizeof(char32_t));
-	p = (unsigned char *) VARDATA_ANY(input);
 	for (i = 0; i < size; i++)
 	{
-		input_chars[i] = utf8_to_unicode(p);
+		input_chars[i] = utf8_to_unicode(p, p_end - p);
 		p += pg_utf_mblen(p);
 	}
 	input_chars[i] = (char32_t) '\0';
-	Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
+	Assert(p == p_end);
 
 	/* action */
 	output_chars = unicode_normalize(form, input_chars);
@@ -5522,7 +5520,8 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 	int			size;
 	char32_t   *input_chars;
 	char32_t   *output_chars;
-	unsigned char *p;
+	unsigned char *p = (unsigned char *) VARDATA_ANY(input);
+	unsigned char *p_end = p + VARSIZE_ANY_EXHDR(input);
 	int			i;
 	UnicodeNormalizationQC quickcheck;
 	int			output_size;
@@ -5533,14 +5532,13 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 	/* convert to char32_t */
 	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
 	input_chars = palloc((size + 1) * sizeof(char32_t));
-	p = (unsigned char *) VARDATA_ANY(input);
 	for (i = 0; i < size; i++)
 	{
-		input_chars[i] = utf8_to_unicode(p);
+		input_chars[i] = utf8_to_unicode(p, p_end - p);
 		p += pg_utf_mblen(p);
 	}
 	input_chars[i] = (char32_t) '\0';
-	Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
+	Assert(p == p_end);
 
 	/* quick check (see UAX #15) */
 	quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
diff --git a/src/common/saslprep.c b/src/common/saslprep.c
index 101e8d65a4d..083702654b0 100644
--- a/src/common/saslprep.c
+++ b/src/common/saslprep.c
@@ -1055,6 +1055,7 @@ pg_saslprep(const char *input, char **output)
 	int			i;
 	bool		contains_RandALCat;
 	unsigned char *p;
+	unsigned char *p_end;
 	char32_t   *wp;
 
 	/* Ensure we return *output as NULL on failure */
@@ -1088,9 +1089,10 @@ pg_saslprep(const char *input, char **output)
 		goto oom;
 
 	p = (unsigned char *) input;
+	p_end = p + strlen(input);
 	for (i = 0; i < input_size; i++)
 	{
-		input_chars[i] = utf8_to_unicode(p);
+		input_chars[i] = utf8_to_unicode(p, p_end - p);
 		p += pg_utf_mblen(p);
 	}
 	input_chars[i] = (char32_t) '\0';
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index 00d4f85e5a5..e63ce2fbeb9 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -51,12 +51,13 @@ static size_t
 initcap_wbnext(void *state)
 {
 	struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
+	unsigned char *str = (unsigned char *) wbstate->str;
 
 	while (wbstate->offset < wbstate->len &&
 		   wbstate->str[wbstate->offset] != '\0')
 	{
-		char32_t	u = utf8_to_unicode((unsigned char *) wbstate->str +
-										wbstate->offset);
+		char32_t	u = utf8_to_unicode(str + wbstate->offset,
+										wbstate->len - wbstate->offset);
 		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix);
 
 		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index e5e494db43c..a760094104c 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -223,15 +223,19 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 	Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
 		   (str_casekind != CaseTitle && !wbnext && !wbstate));
 
+	if (srclen < 0)
+		srclen = strlen(src);
+
 	if (str_casekind == CaseTitle)
 	{
 		boundary = wbnext(wbstate);
 		Assert(boundary == 0);	/* start of text is always a boundary */
 	}
 
-	while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
+	while (srcoff < srclen)
 	{
-		char32_t	u1 = utf8_to_unicode((unsigned char *) src + srcoff);
+		char32_t	u1 = utf8_to_unicode((unsigned char *) src + srcoff,
+										 srclen - srcoff);
 		int			u1len = unicode_utf8len(u1);
 		char32_t	simple = 0;
 		const char32_t *special = NULL;
@@ -320,7 +324,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 	{
 		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
 		{
-			char32_t	curr = utf8_to_unicode(str + i);
+			char32_t	curr = utf8_to_unicode(str + i, len - i);
 
 			if (pg_u_prop_case_ignorable(curr))
 				continue;
@@ -344,7 +348,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 	{
 		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
 		{
-			char32_t	curr = utf8_to_unicode(str + i);
+			char32_t	curr = utf8_to_unicode(str + i, len - i);
 
 			if (pg_u_prop_case_ignorable(curr))
 				continue;
diff --git a/src/common/wchar.c b/src/common/wchar.c
index a4bc29921de..c113cadf815 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -661,7 +661,8 @@ ucs_wcwidth(pg_wchar ucs)
 static int
 pg_utf_dsplen(const unsigned char *s)
 {
-	return ucs_wcwidth(utf8_to_unicode(s));
+	/* trust that input is not a truncated byte sequence */
+	return ucs_wcwidth(utf8_to_unicode(s, MAX_MULTIBYTE_CHAR_LEN));
 }
 
 /*
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 4d84bdc81e4..6dc0fff332f 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -558,22 +558,20 @@ surrogate_pair_to_codepoint(char16_t first, char16_t second)
 /*
  * Convert a UTF-8 character to a Unicode code point.
  * This is a one-character version of pg_utf2wchar_with_len.
- *
- * No error checks here, c must point to a long-enough string.
  */
 static inline char32_t
-utf8_to_unicode(const unsigned char *c)
+utf8_to_unicode(const unsigned char *c, size_t len)
 {
-	if ((*c & 0x80) == 0)
+	if ((*c & 0x80) == 0 && len >= 1)
 		return (char32_t) c[0];
-	else if ((*c & 0xe0) == 0xc0)
+	else if ((*c & 0xe0) == 0xc0 && len >= 2)
 		return (char32_t) (((c[0] & 0x1f) << 6) |
 						   (c[1] & 0x3f));
-	else if ((*c & 0xf0) == 0xe0)
+	else if ((*c & 0xf0) == 0xe0 && len >= 3)
 		return (char32_t) (((c[0] & 0x0f) << 12) |
 						   ((c[1] & 0x3f) << 6) |
 						   (c[2] & 0x3f));
-	else if ((*c & 0xf8) == 0xf0)
+	else if ((*c & 0xf8) == 0xf0 && len >= 4)
 		return (char32_t) (((c[0] & 0x07) << 18) |
 						   ((c[1] & 0x3f) << 12) |
 						   ((c[2] & 0x3f) << 6) |
@@ -676,8 +674,6 @@ extern int	pg_valid_server_encoding(const char *name);
 extern bool is_encoding_supported_by_icu(int encoding);
 extern const char *get_encoding_name_for_icu(int encoding);
 
-extern unsigned char *unicode_to_utf8(char32_t c, unsigned char *utf8string);
-extern char32_t utf8_to_unicode(const unsigned char *c);
 extern bool pg_utf8_islegal(const unsigned char *source, int length);
 extern int	pg_utf_mblen(const unsigned char *s);
 extern int	pg_mule_mblen(const unsigned char *s);
-- 
2.43.0

#2Chao Li
li.evan.chao@gmail.com
In reply to: Jeff Davis (#1)
Re: Small patch to improve safety of utf8_to_unicode().

On Dec 13, 2025, at 07:24, Jeff Davis <pgsql@j-davis.com> wrote:

Attached.

<v1-0001-Make-utf8_to_unicode-safer.patch>

This patch adds a length check to utf8_to_unicode(), I think which is where “safety” comes from. Can you please add an a little bit more to the commit message instead of only saying “improve safety”. It also deleted two redundant function declarations from pg_wchar.h, which may also worth a quick note in the commit message.

The code changes all look good to me. Only nitpicks are:

1
```
diff --git a/contrib/fuzzystrmatch/daitch_mokotoff.c b/contrib/fuzzystrmatch/daitch_mokotoff.c
index 07f895ae2bf..47bd2814460 100644
--- a/contrib/fuzzystrmatch/daitch_mokotoff.c
+++ b/contrib/fuzzystrmatch/daitch_mokotoff.c
@@ -401,7 +401,8 @@ read_char(const unsigned char *str, int *ix)
 	/* Decode UTF-8 character to ISO 10646 code point. */
 	str += *ix;
-	c = utf8_to_unicode(str);
+	/* Assume byte sequence has not been broken. */
+	c = utf8_to_unicode(str, MAX_MULTIBYTE_CHAR_LEN);
```

Here we need an empty line above the new comment.

2
```
diff --git a/src/common/wchar.c b/src/common/wchar.c
index a4bc29921de..c113cadf815 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -661,7 +661,8 @@ ucs_wcwidth(pg_wchar ucs)
 static int
 pg_utf_dsplen(const unsigned char *s)
 {
-	return ucs_wcwidth(utf8_to_unicode(s));
+	/* trust that input is not a truncated byte sequence */
+	return ucs_wcwidth(utf8_to_unicode(s, MAX_MULTIBYTE_CHAR_LEN));
 }
```

For the new comment, as a code reader, I wonder why we “trust” that? To me, it more feels like because of lacking length information, we have to trust. I would like this comment to be enhanced a little bit with more information.

Best regards,
--
Chao Li (Evan)
HighGo Software Co., Ltd.
https://www.highgo.com/

#3Jeff Davis
pgsql@j-davis.com
In reply to: Chao Li (#2)
1 attachment(s)
Re: Small patch to improve safety of utf8_to_unicode().

On Sun, 2025-12-14 at 07:22 +0800, Chao Li wrote:

This patch adds a length check to utf8_to_unicode(), I think which is
where “safety” comes from. Can you please add an a little bit more to
the commit message instead of only saying “improve safety”.

Right: it does not read past pg_mblen(), nor past the supplied length.

We could go further and check for valid continuation bytes, but the
other routines don't do that.

It also deleted two redundant function declarations from pg_wchar.h,
which may also worth a quick note in the commit message.

I committed that as an independent change and removed it from this
patch.

+	/* Assume byte sequence has not been broken. */
+	c = utf8_to_unicode(str, MAX_MULTIBYTE_CHAR_LEN);
```

Here we need an empty line above the new comment.

Done, and I expanded the comment to explain why it's safe.

 pg_utf_dsplen(const unsigned char *s)
 {
-	return ucs_wcwidth(utf8_to_unicode(s));
+	/* trust that input is not a truncated byte sequence */
+	return ucs_wcwidth(utf8_to_unicode(s,
MAX_MULTIBYTE_CHAR_LEN));
 }
```

For the new comment, as a code reader, I wonder why we “trust” that?

We could use strlen(), but I was concerned that it might be used for
string fragments that aren't NUL-terminated, because it's intended for
a single character. A caller might reasonably assume that it wouldn't
read past pg_mblen().

So I changed the comment slightly to just say that it requires the
input is a valid UTF-8 sequence. Let me know if you have another
suggestion.

Regards,
Jeff Davis

Attachments:

v2-0001-Make-utf8_to_unicode-safer.patchtext/x-patch; charset=UTF-8; name=v2-0001-Make-utf8_to_unicode-safer.patchDownload
From 94a80385eec23dbb85cc601063cac04e8d845a52 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Thu, 4 Dec 2025 19:37:32 -0800
Subject: [PATCH v2] Make utf8_to_unicode() safer.

Require the input string length so that it doesn't read past the end
of a string if given an invalid (or truncated) byte sequence.

Discussion: https://postgr.es/m/c355354e6c3f4a7aafb047361b73db247260fca0.camel@j-davis.com
---
 contrib/fuzzystrmatch/daitch_mokotoff.c   |  8 ++++++-
 src/backend/utils/adt/pg_locale_builtin.c |  5 +++--
 src/backend/utils/adt/varlena.c           | 26 +++++++++++------------
 src/common/saslprep.c                     |  4 +++-
 src/common/unicode/case_test.c            |  5 +++--
 src/common/unicode_case.c                 | 12 +++++++----
 src/common/wchar.c                        |  3 ++-
 src/include/mb/pg_wchar.h                 | 12 +++++------
 8 files changed, 43 insertions(+), 32 deletions(-)

diff --git a/contrib/fuzzystrmatch/daitch_mokotoff.c b/contrib/fuzzystrmatch/daitch_mokotoff.c
index 07f895ae2bf..103a29d89d4 100644
--- a/contrib/fuzzystrmatch/daitch_mokotoff.c
+++ b/contrib/fuzzystrmatch/daitch_mokotoff.c
@@ -401,7 +401,13 @@ read_char(const unsigned char *str, int *ix)
 
 	/* Decode UTF-8 character to ISO 10646 code point. */
 	str += *ix;
-	c = utf8_to_unicode(str);
+
+	/*
+	 * We assume the string does not contain truncated byte sequences because
+	 * it came from pg_server_to_any(..., PG_UTF8), so we can safely use
+	 * MAX_MULTIBYTE_CHAR_LEN.
+	 */
+	c = utf8_to_unicode(str, MAX_MULTIBYTE_CHAR_LEN);
 
 	/* Advance *ix, but (for safety) not if we've reached end of string. */
 	if (c)
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 0c2920112bb..28fc55278b0 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -59,12 +59,13 @@ static size_t
 initcap_wbnext(void *state)
 {
 	struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
+	unsigned char *str = (unsigned char *) wbstate->str;
 
 	while (wbstate->offset < wbstate->len &&
 		   wbstate->str[wbstate->offset] != '\0')
 	{
-		char32_t	u = utf8_to_unicode((unsigned char *) wbstate->str +
-										wbstate->offset);
+		char32_t	u = utf8_to_unicode(str + wbstate->offset,
+										wbstate->len - wbstate->offset);
 		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix);
 
 		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index baa5b44ea8d..d684370900d 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -5423,19 +5423,17 @@ Datum
 unicode_assigned(PG_FUNCTION_ARGS)
 {
 	text	   *input = PG_GETARG_TEXT_PP(0);
-	unsigned char *p;
-	int			size;
+	unsigned char *p = (unsigned char *) VARDATA_ANY(input);
+	unsigned char *p_end = p + VARSIZE_ANY_EXHDR(input);
 
 	if (GetDatabaseEncoding() != PG_UTF8)
 		ereport(ERROR,
 				(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
 
 	/* convert to char32_t */
-	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
-	p = (unsigned char *) VARDATA_ANY(input);
-	for (int i = 0; i < size; i++)
+	while (p < p_end)
 	{
-		char32_t	uchar = utf8_to_unicode(p);
+		char32_t	uchar = utf8_to_unicode(p, p_end - p);
 		int			category = unicode_category(uchar);
 
 		if (category == PG_U_UNASSIGNED)
@@ -5456,7 +5454,8 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
 	int			size;
 	char32_t   *input_chars;
 	char32_t   *output_chars;
-	unsigned char *p;
+	unsigned char *p = (unsigned char *) VARDATA_ANY(input);
+	unsigned char *p_end = p + VARSIZE_ANY_EXHDR(input);
 	text	   *result;
 	int			i;
 
@@ -5465,14 +5464,13 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
 	/* convert to char32_t */
 	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
 	input_chars = palloc((size + 1) * sizeof(char32_t));
-	p = (unsigned char *) VARDATA_ANY(input);
 	for (i = 0; i < size; i++)
 	{
-		input_chars[i] = utf8_to_unicode(p);
+		input_chars[i] = utf8_to_unicode(p, p_end - p);
 		p += pg_utf_mblen(p);
 	}
 	input_chars[i] = (char32_t) '\0';
-	Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
+	Assert(p == p_end);
 
 	/* action */
 	output_chars = unicode_normalize(form, input_chars);
@@ -5522,7 +5520,8 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 	int			size;
 	char32_t   *input_chars;
 	char32_t   *output_chars;
-	unsigned char *p;
+	unsigned char *p = (unsigned char *) VARDATA_ANY(input);
+	unsigned char *p_end = p + VARSIZE_ANY_EXHDR(input);
 	int			i;
 	UnicodeNormalizationQC quickcheck;
 	int			output_size;
@@ -5533,14 +5532,13 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
 	/* convert to char32_t */
 	size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
 	input_chars = palloc((size + 1) * sizeof(char32_t));
-	p = (unsigned char *) VARDATA_ANY(input);
 	for (i = 0; i < size; i++)
 	{
-		input_chars[i] = utf8_to_unicode(p);
+		input_chars[i] = utf8_to_unicode(p, p_end - p);
 		p += pg_utf_mblen(p);
 	}
 	input_chars[i] = (char32_t) '\0';
-	Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
+	Assert(p == p_end);
 
 	/* quick check (see UAX #15) */
 	quickcheck = unicode_is_normalized_quickcheck(form, input_chars);
diff --git a/src/common/saslprep.c b/src/common/saslprep.c
index 101e8d65a4d..083702654b0 100644
--- a/src/common/saslprep.c
+++ b/src/common/saslprep.c
@@ -1055,6 +1055,7 @@ pg_saslprep(const char *input, char **output)
 	int			i;
 	bool		contains_RandALCat;
 	unsigned char *p;
+	unsigned char *p_end;
 	char32_t   *wp;
 
 	/* Ensure we return *output as NULL on failure */
@@ -1088,9 +1089,10 @@ pg_saslprep(const char *input, char **output)
 		goto oom;
 
 	p = (unsigned char *) input;
+	p_end = p + strlen(input);
 	for (i = 0; i < input_size; i++)
 	{
-		input_chars[i] = utf8_to_unicode(p);
+		input_chars[i] = utf8_to_unicode(p, p_end - p);
 		p += pg_utf_mblen(p);
 	}
 	input_chars[i] = (char32_t) '\0';
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index 00d4f85e5a5..e63ce2fbeb9 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -51,12 +51,13 @@ static size_t
 initcap_wbnext(void *state)
 {
 	struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state;
+	unsigned char *str = (unsigned char *) wbstate->str;
 
 	while (wbstate->offset < wbstate->len &&
 		   wbstate->str[wbstate->offset] != '\0')
 	{
-		char32_t	u = utf8_to_unicode((unsigned char *) wbstate->str +
-										wbstate->offset);
+		char32_t	u = utf8_to_unicode(str + wbstate->offset,
+										wbstate->len - wbstate->offset);
 		bool		curr_alnum = pg_u_isalnum(u, wbstate->posix);
 
 		if (!wbstate->init || curr_alnum != wbstate->prev_alnum)
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index e5e494db43c..a760094104c 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -223,15 +223,19 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 	Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
 		   (str_casekind != CaseTitle && !wbnext && !wbstate));
 
+	if (srclen < 0)
+		srclen = strlen(src);
+
 	if (str_casekind == CaseTitle)
 	{
 		boundary = wbnext(wbstate);
 		Assert(boundary == 0);	/* start of text is always a boundary */
 	}
 
-	while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
+	while (srcoff < srclen)
 	{
-		char32_t	u1 = utf8_to_unicode((unsigned char *) src + srcoff);
+		char32_t	u1 = utf8_to_unicode((unsigned char *) src + srcoff,
+										 srclen - srcoff);
 		int			u1len = unicode_utf8len(u1);
 		char32_t	simple = 0;
 		const char32_t *special = NULL;
@@ -320,7 +324,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 	{
 		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
 		{
-			char32_t	curr = utf8_to_unicode(str + i);
+			char32_t	curr = utf8_to_unicode(str + i, len - i);
 
 			if (pg_u_prop_case_ignorable(curr))
 				continue;
@@ -344,7 +348,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
 	{
 		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
 		{
-			char32_t	curr = utf8_to_unicode(str + i);
+			char32_t	curr = utf8_to_unicode(str + i, len - i);
 
 			if (pg_u_prop_case_ignorable(curr))
 				continue;
diff --git a/src/common/wchar.c b/src/common/wchar.c
index a4bc29921de..a104234ae92 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -661,7 +661,8 @@ ucs_wcwidth(pg_wchar ucs)
 static int
 pg_utf_dsplen(const unsigned char *s)
 {
-	return ucs_wcwidth(utf8_to_unicode(s));
+	/* input must be a valid UTF-8 sequence */
+	return ucs_wcwidth(utf8_to_unicode(s, MAX_MULTIBYTE_CHAR_LEN));
 }
 
 /*
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 5df67ceea87..6dc0fff332f 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -558,22 +558,20 @@ surrogate_pair_to_codepoint(char16_t first, char16_t second)
 /*
  * Convert a UTF-8 character to a Unicode code point.
  * This is a one-character version of pg_utf2wchar_with_len.
- *
- * No error checks here, c must point to a long-enough string.
  */
 static inline char32_t
-utf8_to_unicode(const unsigned char *c)
+utf8_to_unicode(const unsigned char *c, size_t len)
 {
-	if ((*c & 0x80) == 0)
+	if ((*c & 0x80) == 0 && len >= 1)
 		return (char32_t) c[0];
-	else if ((*c & 0xe0) == 0xc0)
+	else if ((*c & 0xe0) == 0xc0 && len >= 2)
 		return (char32_t) (((c[0] & 0x1f) << 6) |
 						   (c[1] & 0x3f));
-	else if ((*c & 0xf0) == 0xe0)
+	else if ((*c & 0xf0) == 0xe0 && len >= 3)
 		return (char32_t) (((c[0] & 0x0f) << 12) |
 						   ((c[1] & 0x3f) << 6) |
 						   (c[2] & 0x3f));
-	else if ((*c & 0xf8) == 0xf0)
+	else if ((*c & 0xf8) == 0xf0 && len >= 4)
 		return (char32_t) (((c[0] & 0x07) << 18) |
 						   ((c[1] & 0x3f) << 12) |
 						   ((c[2] & 0x3f) << 6) |
-- 
2.43.0

#4Chao Li
li.evan.chao@gmail.com
In reply to: Jeff Davis (#3)
Re: Small patch to improve safety of utf8_to_unicode().

On Dec 16, 2025, at 04:23, Jeff Davis <pgsql@j-davis.com> wrote:

On Sun, 2025-12-14 at 07:22 +0800, Chao Li wrote:

This patch adds a length check to utf8_to_unicode(), I think which is
where “safety” comes from. Can you please add an a little bit more to
the commit message instead of only saying “improve safety”.

Right: it does not read past pg_mblen(), nor past the supplied length.

We could go further and check for valid continuation bytes, but the
other routines don't do that.

It also deleted two redundant function declarations from pg_wchar.h,
which may also worth a quick note in the commit message.

I committed that as an independent change and removed it from this
patch.

+ /* Assume byte sequence has not been broken. */
+ c = utf8_to_unicode(str, MAX_MULTIBYTE_CHAR_LEN);
```

Here we need an empty line above the new comment.

Done, and I expanded the comment to explain why it's safe.

pg_utf_dsplen(const unsigned char *s)
{
- return ucs_wcwidth(utf8_to_unicode(s));
+ /* trust that input is not a truncated byte sequence */
+ return ucs_wcwidth(utf8_to_unicode(s,
MAX_MULTIBYTE_CHAR_LEN));
}
```

For the new comment, as a code reader, I wonder why we “trust” that?

We could use strlen(), but I was concerned that it might be used for
string fragments that aren't NUL-terminated, because it's intended for
a single character. A caller might reasonably assume that it wouldn't
read past pg_mblen().

So I changed the comment slightly to just say that it requires the
input is a valid UTF-8 sequence. Let me know if you have another
suggestion.

Regards,
Jeff Davis

<v2-0001-Make-utf8_to_unicode-safer.patch>

V2 LGTM.

Best regards,
--
Chao Li (Evan)
HighGo Software Co., Ltd.
https://www.highgo.com/

#5Jeff Davis
pgsql@j-davis.com
In reply to: Chao Li (#4)
Re: Small patch to improve safety of utf8_to_unicode().

On Tue, 2025-12-16 at 07:34 +0800, Chao Li wrote:

<v2-0001-Make-utf8_to_unicode-safer.patch>

V2 LGTM.

On second thought, if we're going to change something here, we should
probably have a more flexible API for both utf8_to_unicode() and
unicode_to_utf8().

Looking at the callers, I think we want to have signatures something
like:

/* returns number of bytes consumed, or -1 */
static inline ssize_t
utf8_to_unicode(char32_t *cp, const unsigned char *src, size_t srclen)
{
...
}

/* returns number of bytes written, or -1 */
static inline ssize_t
unicode_to_utf8(unsigned char *dst, size_t dstsize, char32_t cp)
{
...
}

That would make both APIs safer, and the caller wouldn't need to call
unicode_utf8len() or pg_utf8_mblen() separately.

We could also do more validation, but of course then the callers would
need to do something if they encounter a failure. We could also try to
catch NUL terminators in the middle of a sequence, which might be
useful.

Regards,
Jeff Davis