From f0c56f1f95ab35dca3d7ff65f783289a6b225146 Mon Sep 17 00:00:00 2001
From: Laurenz Albe <laurenz.albe@cybertec.at>
Date: Fri, 23 May 2025 11:35:32 +0200
Subject: [PATCH v2] Fix SIMILAR TO regex translation for character classes

The code that translates SIMILAR TO pattern matching expressions
to regular expressions didn't consider that brackets can be nested,
as in [[:alpha:]%], and replaced placeholders like _ and % where
it shouldn't.

Fix by tracking the nesting level of brackets while considering that
in expressions like []] and [^]] the first closing bracket is a
regular character.

Author: Laurenz Albe <laurenz.albe@cybertec.at>
Reviewed-by: Michael Paquier <michael@paquier.xyz>
Discussion: https://postgr.es/m/16ab039d1af455652bdf4173402ddda145f2c73b.camel@cybertec.at
Backpatch-through: 13
---
 src/backend/utils/adt/regexp.c        | 36 ++++++++++++++++++++++-----
 src/test/regress/expected/strings.out | 12 +++++++++
 src/test/regress/sql/strings.sql      |  3 +++
 3 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index edee1f7880b..9d6f7675c64 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -773,7 +773,8 @@ similar_escape_internal(text *pat_text, text *esc_text)
 	int			plen,
 				elen;
 	bool		afterescape = false;
-	bool		incharclass = false;
+	int			charclass_depth = 0;	/* square bracket nesting level */
+	int			charclass_start = 0;
 	int			nquotes = 0;
 
 	p = VARDATA_ANY(pat_text);
@@ -904,7 +905,7 @@ similar_escape_internal(text *pat_text, text *esc_text)
 		/* fast path */
 		if (afterescape)
 		{
-			if (pchar == '"' && !incharclass)	/* escape-double-quote? */
+			if (pchar == '"' && charclass_depth < 1)	/* escape-double-quote? */
 			{
 				/* emit appropriate part separator, per notes above */
 				if (nquotes == 0)
@@ -953,18 +954,41 @@ similar_escape_internal(text *pat_text, text *esc_text)
 			/* SQL escape character; do not send to output */
 			afterescape = true;
 		}
-		else if (incharclass)
+		else if (charclass_depth > 0)
 		{
 			if (pchar == '\\')
 				*r++ = '\\';
 			*r++ = pchar;
-			if (pchar == ']')
-				incharclass = false;
+
+			/*
+			 * Ignore a closing bracket at the start of a character class.
+			 * Such a bracket is taken literally rather than closing the
+			 * class. "charclass_start" is 1 right at the beginning of a class
+			 * and 2 after an initial caret.
+			 */
+			if (pchar == '[')
+				charclass_depth++;
+			else if (pchar == ']' && charclass_start > 2)
+				charclass_depth--;
+
+			/*
+			 * If there is a caret right after the opening bracket, it negates
+			 * the character class, but a following closing bracket should
+			 * still be treated as a normal character.  However, that holds
+			 * only for the first caret, so only the values 1 and 2 mean that
+			 * closing brackets should be taken literally.
+			 */
+			if (pchar == '^')
+				charclass_start++;
+			else
+				charclass_start = 3;	/* definitely past the start */
 		}
 		else if (pchar == '[')
 		{
+			/* start of a character class */
 			*r++ = pchar;
-			incharclass = true;
+			charclass_depth++;
+			charclass_start = 1;
 		}
 		else if (pchar == '%')
 		{
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index 174f0a68331..ce230961b5c 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -614,6 +614,18 @@ SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
 SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
 ERROR:  invalid escape string
 HINT:  Escape string must be empty or one character.
+-- ".", "_", "%", "$", "(" and "^" should be left alone in character classes
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT (SELECT '') SIMILAR TO '.[.[:alnum:]_]_[]%]%[^]$]$[^^]^[(](p)';
+                                        QUERY PLAN                                         
+-------------------------------------------------------------------------------------------
+ Result
+   Output: ((InitPlan 1).col1 ~ '^(?:\.[.[:alnum:]_].[]%].*[^]$]\$[^^]\^[(](?:p))$'::text)
+   InitPlan 1
+     ->  Result
+           Output: ''::text
+(5 rows)
+
 -- Test backslash escapes in regexp_replace's replacement string
 SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
  regexp_replace 
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index f7b325baadf..94b9aa48e90 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -196,6 +196,9 @@ SELECT 'abcd\efg' SIMILAR TO '_bcd\%' ESCAPE '' AS true;
 -- these behaviors are per spec, though:
 SELECT 'abcdefg' SIMILAR TO '_bcd%' ESCAPE NULL AS null;
 SELECT 'abcdefg' SIMILAR TO '_bcd#%' ESCAPE '##' AS error;
+-- ".", "_", "%", "$", "(" and "^" should be left alone in character classes
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT (SELECT '') SIMILAR TO '.[.[:alnum:]_]_[]%]%[^]$]$[^^]^[(](p)';
 
 -- Test backslash escapes in regexp_replace's replacement string
 SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3');
-- 
2.49.0

