From 935ccea6da82d0150f65672825c8f5e6f86bca89 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sat, 15 Feb 2025 12:58:22 -0500
Subject: [PATCH v4] Make escaping functions retain trailing bytes of an
 invalid character.

Instead of dropping the trailing byte(s) of an invalid or incomplete
multibyte character, replace only the first byte with a known-invalid
sequence, and process the rest normally.  This seems less likely to
confuse incautious callers than the behavior adopted in 5dc1e42b4.

Author: Andres Freund <andres@anarazel.de>
Reviewed-by: Jeff Davis <pgsql@j-davis.com>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Discussion: https://postgr.es/m/20250215012712.45@rfd.leadboat.com
Backpatch-through: 13
---
 src/fe_utils/string_utils.c    | 91 +++++++++++++---------------------
 src/interfaces/libpq/fe-exec.c | 57 ++++++++-------------
 2 files changed, 55 insertions(+), 93 deletions(-)

diff --git a/src/fe_utils/string_utils.c b/src/fe_utils/string_utils.c
index b6a7b19708..bdf35db614 100644
--- a/src/fe_utils/string_utils.c
+++ b/src/fe_utils/string_utils.c
@@ -180,40 +180,25 @@ fmtIdEnc(const char *rawid, int encoding)
 			/* Slow path for possible multibyte characters */
 			charlen = pg_encoding_mblen(encoding, cp);
 
-			if (remaining < charlen)
-			{
-				/*
-				 * If the character is longer than the available input,
-				 * replace the string with an invalid sequence. The invalid
-				 * sequence ensures that the escaped string will trigger an
-				 * error on the server-side, even if we can't directly report
-				 * an error here.
-				 */
-				enlargePQExpBuffer(id_return, 2);
-				pg_encoding_set_invalid(encoding,
-										id_return->data + id_return->len);
-				id_return->len += 2;
-				id_return->data[id_return->len] = '\0';
-
-				/* there's no more input data, so we can stop */
-				break;
-			}
-			else if (pg_encoding_verifymbchar(encoding, cp, charlen) == -1)
+			if (remaining < charlen ||
+				pg_encoding_verifymbchar(encoding, cp, charlen) == -1)
 			{
 				/*
 				 * Multibyte character is invalid.  It's important to verify
-				 * that as invalid multi-byte characters could e.g. be used to
+				 * that as invalid multibyte characters could e.g. be used to
 				 * "skip" over quote characters, e.g. when parsing
 				 * character-by-character.
 				 *
-				 * Replace the bytes corresponding to the invalid character
-				 * with an invalid sequence, for the same reason as above.
+				 * Replace the character's first byte with an invalid
+				 * sequence. The invalid sequence ensures that the escaped
+				 * string will trigger an error on the server-side, even if we
+				 * can't directly report an error here.
 				 *
 				 * It would be a bit faster to verify the whole string the
 				 * first time we encounter a set highbit, but this way we can
-				 * replace just the invalid characters, which probably makes
-				 * it easier for users to find the invalidly encoded portion
-				 * of a larger string.
+				 * replace just the invalid data, which probably makes it
+				 * easier for users to find the invalidly encoded portion of a
+				 * larger string.
 				 */
 				enlargePQExpBuffer(id_return, 2);
 				pg_encoding_set_invalid(encoding,
@@ -222,11 +207,13 @@ fmtIdEnc(const char *rawid, int encoding)
 				id_return->data[id_return->len] = '\0';
 
 				/*
-				 * Copy the rest of the string after the invalid multi-byte
-				 * character.
+				 * Handle the following bytes as if this byte didn't exist.
+				 * That's safer in case the subsequent bytes contain
+				 * characters that are significant for the caller (e.g. '>' in
+				 * html).
 				 */
-				remaining -= charlen;
-				cp += charlen;
+				remaining--;
+				cp++;
 			}
 			else
 			{
@@ -395,49 +382,39 @@ appendStringLiteral(PQExpBuffer buf, const char *str,
 		/* Slow path for possible multibyte characters */
 		charlen = PQmblen(source, encoding);
 
-		if (remaining < charlen)
-		{
-			/*
-			 * If the character is longer than the available input, replace
-			 * the string with an invalid sequence. The invalid sequence
-			 * ensures that the escaped string will trigger an error on the
-			 * server-side, even if we can't directly report an error here.
-			 *
-			 * We know there's enough space for the invalid sequence because
-			 * the "target" buffer is 2 * length + 2 long, and at worst we're
-			 * replacing a single input byte with two invalid bytes.
-			 */
-			pg_encoding_set_invalid(encoding, target);
-			target += 2;
-
-			/* there's no more valid input data, so we can stop */
-			break;
-		}
-		else if (pg_encoding_verifymbchar(encoding, source, charlen) == -1)
+		if (remaining < charlen ||
+			pg_encoding_verifymbchar(encoding, source, charlen) == -1)
 		{
 			/*
 			 * Multibyte character is invalid.  It's important to verify that
-			 * as invalid multi-byte characters could e.g. be used to "skip"
+			 * as invalid multibyte characters could e.g. be used to "skip"
 			 * over quote characters, e.g. when parsing
 			 * character-by-character.
 			 *
-			 * Replace the bytes corresponding to the invalid character with
-			 * an invalid sequence, for the same reason as above.
+			 * Replace the character's first byte with an invalid sequence.
+			 * The invalid sequence ensures that the escaped string will
+			 * trigger an error on the server-side, even if we can't directly
+			 * report an error here.
+			 *
+			 * We know there's enough space for the invalid sequence because
+			 * the "target" buffer is 2 * length + 2 long, and at worst we're
+			 * replacing a single input byte with two invalid bytes.
 			 *
 			 * It would be a bit faster to verify the whole string the first
 			 * time we encounter a set highbit, but this way we can replace
-			 * just the invalid characters, which probably makes it easier for
-			 * users to find the invalidly encoded portion of a larger string.
+			 * just the invalid data, which probably makes it easier for users
+			 * to find the invalidly encoded portion of a larger string.
 			 */
 			pg_encoding_set_invalid(encoding, target);
 			target += 2;
-			remaining -= charlen;
 
 			/*
-			 * Copy the rest of the string after the invalid multi-byte
-			 * character.
+			 * Handle the following bytes as if this byte didn't exist. That's
+			 * safer in case the subsequent bytes contain important characters
+			 * for the caller (e.g. '>' in html).
 			 */
-			source += charlen;
+			source++;
+			remaining--;
 		}
 		else
 		{
diff --git a/src/interfaces/libpq/fe-exec.c b/src/interfaces/libpq/fe-exec.c
index 120d4d032e..69554cd603 100644
--- a/src/interfaces/libpq/fe-exec.c
+++ b/src/interfaces/libpq/fe-exec.c
@@ -4102,50 +4102,34 @@ PQescapeStringInternal(PGconn *conn,
 		/* Slow path for possible multibyte characters */
 		charlen = pg_encoding_mblen(encoding, source);
 
-		if (remaining < charlen)
+		if (remaining < charlen ||
+			pg_encoding_verifymbchar(encoding, source, charlen) == -1)
 		{
 			/*
-			 * If the character is longer than the available input, report an
-			 * error if possible, and replace the string with an invalid
-			 * sequence. The invalid sequence ensures that the escaped string
-			 * will trigger an error on the server-side, even if we can't
-			 * directly report an error here.
+			 * Multibyte character is invalid.  It's important to verify that
+			 * as invalid multibyte characters could e.g. be used to "skip"
+			 * over quote characters, e.g. when parsing
+			 * character-by-character.
+			 *
+			 * Report an error if possible, and replace the character's first
+			 * byte with an invalid sequence. The invalid sequence ensures
+			 * that the escaped string will trigger an error on the
+			 * server-side, even if we can't directly report an error here.
 			 *
 			 * This isn't *that* crucial when we can report an error to the
-			 * caller, but if we can't, the caller will use this string
-			 * unmodified and it needs to be safe for parsing.
+			 * caller; but if we can't or the caller ignores it, the caller
+			 * will use this string unmodified and it needs to be safe for
+			 * parsing.
 			 *
 			 * We know there's enough space for the invalid sequence because
 			 * the "to" buffer needs to be at least 2 * length + 1 long, and
 			 * at worst we're replacing a single input byte with two invalid
 			 * bytes.
-			 */
-			if (error)
-				*error = 1;
-			if (conn)
-				libpq_append_conn_error(conn, "incomplete multibyte character");
-
-			pg_encoding_set_invalid(encoding, target);
-			target += 2;
-
-			/* there's no more input data, so we can stop */
-			break;
-		}
-		else if (pg_encoding_verifymbchar(encoding, source, charlen) == -1)
-		{
-			/*
-			 * Multibyte character is invalid.  It's important to verify that
-			 * as invalid multi-byte characters could e.g. be used to "skip"
-			 * over quote characters, e.g. when parsing
-			 * character-by-character.
-			 *
-			 * Replace the bytes corresponding to the invalid character with
-			 * an invalid sequence, for the same reason as above.
 			 *
 			 * It would be a bit faster to verify the whole string the first
 			 * time we encounter a set highbit, but this way we can replace
-			 * just the invalid characters, which probably makes it easier for
-			 * users to find the invalidly encoded portion of a larger string.
+			 * just the invalid data, which probably makes it easier for users
+			 * to find the invalidly encoded portion of a larger string.
 			 */
 			if (error)
 				*error = 1;
@@ -4154,13 +4138,14 @@ PQescapeStringInternal(PGconn *conn,
 
 			pg_encoding_set_invalid(encoding, target);
 			target += 2;
-			remaining -= charlen;
 
 			/*
-			 * Copy the rest of the string after the invalid multi-byte
-			 * character.
+			 * Handle the following bytes as if this byte didn't exist. That's
+			 * safer in case the subsequent bytes contain important characters
+			 * for the caller (e.g. '>' in html).
 			 */
-			source += charlen;
+			source++;
+			remaining--;
 		}
 		else
 		{
-- 
2.43.5