From e4bcfb637018737ae3e0c7761b3b62a49b7ae407 Mon Sep 17 00:00:00 2001
From: Stepan Neretin <s.neretin@postgrespro.ru>
Date: Fri, 9 May 2025 17:19:42 +0700
Subject: [PATCH v2 2/2] Refactor byteain() to avoid double scanning and
 simplify logic

This patch reworks the escaped input handling in byteain() by replacing
manual buffer management with a StringInfo-based single-pass parse.
It combines ideas from a previous proposal by Steven Niu with additional
improvements to structure and readability.

Also adds regression tests covering edge cases for both hex and escaped
formats.

Includes input from discussion with Kirill Reshke on pgsql-hackers.
---
 contrib/btree_gin/expected/bytea.out | 92 ++++++++++++++++++++++++++++
 contrib/btree_gin/sql/bytea.sql      | 37 +++++++++++
 src/backend/utils/adt/varlena.c      | 63 ++++++++-----------
 3 files changed, 155 insertions(+), 37 deletions(-)

diff --git a/contrib/btree_gin/expected/bytea.out b/contrib/btree_gin/expected/bytea.out
index b0ed7a53450..d4ad2878775 100644
--- a/contrib/btree_gin/expected/bytea.out
+++ b/contrib/btree_gin/expected/bytea.out
@@ -44,3 +44,95 @@ SELECT * FROM test_bytea WHERE i>'abc'::bytea ORDER BY i;
  xyz
 (2 rows)
 
+-- Simple ASCII strings
+SELECT encode(bytea(E'a'), 'hex');            -- 61
+ encode 
+--------
+ 61
+(1 row)
+
+SELECT encode(bytea(E'ab'), 'hex');           -- 6162
+ encode 
+--------
+ 6162
+(1 row)
+
+-- Octal escapes
+SELECT encode(bytea(E'\\000'), 'hex');        -- 00
+ encode 
+--------
+ 00
+(1 row)
+
+SELECT encode(bytea(E'\\001'), 'hex');        -- 01
+ encode 
+--------
+ 01
+(1 row)
+
+SELECT encode(bytea(E'\\001\\002\\003'), 'hex');  -- 010203
+ encode 
+--------
+ 010203
+(1 row)
+
+-- Mixed literal and escapes
+SELECT encode(bytea(E'a\\000b\\134c'), 'hex'); -- 6100625c63
+   encode   
+------------
+ 6100625c63
+(1 row)
+
+-- Backslash literal
+SELECT encode(bytea(E'\\\\'), 'hex');         -- 5c
+ encode 
+--------
+ 5c
+(1 row)
+
+-- Empty input
+SELECT encode(bytea(E''), 'hex');             -- (empty string)
+ encode 
+--------
+ 
+(1 row)
+
+-- Hex format
+SELECT encode(bytea(E'\\x6869'), 'escape');   -- hi
+ encode 
+--------
+ hi
+(1 row)
+
+-- ===== Invalid bytea input tests =====
+-- Invalid octal escapes (less than 3 digits or out of range)
+SELECT bytea(E'\\77');     -- ERROR
+ERROR:  invalid input syntax for type bytea
+LINE 1: SELECT bytea(E'\\77');
+                     ^
+SELECT bytea(E'\\4');      -- ERROR
+ERROR:  invalid input syntax for type bytea
+LINE 1: SELECT bytea(E'\\4');
+                     ^
+SELECT bytea(E'\\08');     -- ERROR
+ERROR:  invalid input syntax for type bytea
+LINE 1: SELECT bytea(E'\\08');
+                     ^
+SELECT bytea(E'\\999');    -- ERROR
+ERROR:  invalid input syntax for type bytea
+LINE 1: SELECT bytea(E'\\999');
+                     ^
+-- Invalid hex format
+SELECT bytea(E'\\x1');     -- ERROR
+ERROR:  invalid hexadecimal data: odd number of digits
+LINE 1: SELECT bytea(E'\\x1');
+                     ^
+SELECT bytea(E'\\xZZ');    -- ERROR
+ERROR:  invalid hexadecimal digit: "Z"
+LINE 1: SELECT bytea(E'\\xZZ');
+                     ^
+-- Incomplete escape sequence
+SELECT bytea(E'abc\\');    -- ERROR
+ERROR:  invalid input syntax for type bytea
+LINE 1: SELECT bytea(E'abc\\');
+                     ^
diff --git a/contrib/btree_gin/sql/bytea.sql b/contrib/btree_gin/sql/bytea.sql
index 5f3eb11b169..cb8ee8eb2aa 100644
--- a/contrib/btree_gin/sql/bytea.sql
+++ b/contrib/btree_gin/sql/bytea.sql
@@ -15,3 +15,40 @@ SELECT * FROM test_bytea WHERE i<='abc'::bytea ORDER BY i;
 SELECT * FROM test_bytea WHERE i='abc'::bytea ORDER BY i;
 SELECT * FROM test_bytea WHERE i>='abc'::bytea ORDER BY i;
 SELECT * FROM test_bytea WHERE i>'abc'::bytea ORDER BY i;
+
+
+-- Simple ASCII strings
+SELECT encode(bytea(E'a'), 'hex');            -- 61
+SELECT encode(bytea(E'ab'), 'hex');           -- 6162
+
+-- Octal escapes
+SELECT encode(bytea(E'\\000'), 'hex');        -- 00
+SELECT encode(bytea(E'\\001'), 'hex');        -- 01
+SELECT encode(bytea(E'\\001\\002\\003'), 'hex');  -- 010203
+
+-- Mixed literal and escapes
+SELECT encode(bytea(E'a\\000b\\134c'), 'hex'); -- 6100625c63
+
+-- Backslash literal
+SELECT encode(bytea(E'\\\\'), 'hex');         -- 5c
+
+-- Empty input
+SELECT encode(bytea(E''), 'hex');             -- (empty string)
+
+-- Hex format
+SELECT encode(bytea(E'\\x6869'), 'escape');   -- hi
+
+-- ===== Invalid bytea input tests =====
+
+-- Invalid octal escapes (less than 3 digits or out of range)
+SELECT bytea(E'\\77');     -- ERROR
+SELECT bytea(E'\\4');      -- ERROR
+SELECT bytea(E'\\08');     -- ERROR
+SELECT bytea(E'\\999');    -- ERROR
+
+-- Invalid hex format
+SELECT bytea(E'\\x1');     -- ERROR
+SELECT bytea(E'\\xZZ');    -- ERROR
+
+-- Incomplete escape sequence
+SELECT bytea(E'abc\\');    -- ERROR
\ No newline at end of file
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index f1f1efba053..f84fd1dc644 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -296,70 +296,59 @@ text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
 Datum
 byteain(PG_FUNCTION_ARGS)
 {
-	char	   *inputText = PG_GETARG_CSTRING(0);
-	Node	   *escontext = fcinfo->context;
-	char	   *tp;
-	char	   *rp;
-	int			bc;
-	size_t	   input_len;
-	bytea	   *result;
+	char *inputText = PG_GETARG_CSTRING(0);
+	Node *escontext = fcinfo->context;
 
-	/* Recognize hex input */
+	/* Hex format */
 	if (inputText[0] == '\\' && inputText[1] == 'x')
 	{
-		size_t		len = strlen(inputText);
-
-		bc = (len - 2) / 2 + VARHDRSZ;	/* maximum possible length */
-		result = palloc(bc);
-		bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result),
-							 escontext);
-		SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
-
+		size_t len = strlen(inputText);
+		int bc = (len - 2) / 2 + VARHDRSZ;
+		bytea *result = palloc(bc);
+		bc = hex_decode_safe(inputText + 2, len - 2, VARDATA(result), escontext);
+		SET_VARSIZE(result, bc + VARHDRSZ);
 		PG_RETURN_BYTEA_P(result);
 	}
 
-	/* Handle traditional escaped style in a single pass */
-	input_len = strlen(inputText);
-	result = palloc(input_len + VARHDRSZ);  /* Allocate max possible size */
-	rp = VARDATA(result);
-	tp = inputText;
+	/* Escaped format */
+	StringInfoData buf;
+	initStringInfo(&buf);
+	char *tp = inputText;
 
-	while (*tp != '\0')
+	while (*tp)
 	{
-		if (tp[0] != '\\')
+		if (*tp != '\\')
 		{
-			*rp++ = *tp++;
+			appendStringInfoChar(&buf, *tp++);
 			continue;
 		}
 
 		if (tp[1] == '\\')
 		{
-			*rp++ = '\\';
+			appendStringInfoChar(&buf, '\\');
 			tp += 2;
 		}
-		else if ((tp[1] >= '0' && tp[1] <= '3') && 
-			 (tp[2] >= '0' && tp[2] <= '7') && 
-			 (tp[3] >= '0' && tp[3] <= '7'))
+		else if ((tp[1] >= '0' && tp[1] <= '3') &&
+				 (tp[2] >= '0' && tp[2] <= '7') &&
+				 (tp[3] >= '0' && tp[3] <= '7'))
 		{
-			bc = VAL(tp[1]);
-			bc <<= 3;
-			bc += VAL(tp[2]);
-			bc <<= 3;
-			*rp++ = bc + VAL(tp[3]);
-
+			int byte_val = VAL(tp[1]);
+			byte_val = (byte_val << 3) + VAL(tp[2]);
+			byte_val = (byte_val << 3) + VAL(tp[3]);
+			appendStringInfoChar(&buf, byte_val);
 			tp += 4;
 		}
 		else
 		{
-			/* Invalid escape sequence: report error */
 			ereturn(escontext, (Datum) 0,
 					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 					 errmsg("invalid input syntax for type %s", "bytea")));
 		}
 	}
 
-	/* Set the actual size of the bytea */
-	SET_VARSIZE(result, (rp - VARDATA(result)) + VARHDRSZ);
+	bytea *result = palloc(buf.len + VARHDRSZ);
+	SET_VARSIZE(result, buf.len + VARHDRSZ);
+	memcpy(VARDATA(result), buf.data, buf.len);
 
 	PG_RETURN_BYTEA_P(result);
 }
-- 
2.43.0

