From f4c4e2bb049f96e70e350bd0f0e9cecb755f8687 Mon Sep 17 00:00:00 2001
From: David Geier <geidav.pg@gmail.com>
Date: Fri, 23 Jan 2026 15:39:06 +0100
Subject: [PATCH v5 2/2] Use correct collation for finding word boundaries

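pg_trgm finds all words in the input string and creates trigrams for
them. Word characters are alphanumeric characters, and what qualifies
as an alphanumeric character depends on the collation. Previously,
pg_trgm always used the database default collation to find word
boundaries. Now the specified collation is used instead.

To make this possible, the t_is<class>_with_len() functions in
ts_locale.c are replaced by t_is<class>_with_len_collation(), which
take the collation OID as an additional argument. The _cstr and
_unbounded variants pass DEFAULT_COLLATION_OID.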
---
 .../pg_trgm/expected/pg_trgm_collation.out    | 13 ++++++++++++
 contrib/pg_trgm/sql/pg_trgm_collation.sql     |  5 +++++
 contrib/pg_trgm/trgm.h                        |  6 +++---
 contrib/pg_trgm/trgm_op.c                     | 20 +++++++++----------
 contrib/pg_trgm/trgm_regexp.c                 |  2 +-
 src/backend/tsearch/ts_locale.c               |  8 ++++----
 src/include/tsearch/ts_locale.h               |  3 ++-
 7 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/contrib/pg_trgm/expected/pg_trgm_collation.out b/contrib/pg_trgm/expected/pg_trgm_collation.out
index 472ce867665..0cc53edd821 100644
--- a/contrib/pg_trgm/expected/pg_trgm_collation.out
+++ b/contrib/pg_trgm/expected/pg_trgm_collation.out
@@ -41,3 +41,16 @@ SELECT similarity('ıstanbul' COLLATE "C", 'ISTANBUL' COLLATE "C");
  0.54545456
 (1 row)
 
+-- Test that word boundary identification uses specified collation
+SELECT show_trgm('helloıtestIworldİcode' COLLATE "tr-x-icu");
+                                                           show_trgm                                                           
+-------------------------------------------------------------------------------------------------------------------------------
+ {0x8fc0a2,0x93dfbf,0x1bf43c,"  h"," he",0x22d44f,0x4398ff,cod,"de ",dic,ell,est,hel,ico,ldi,llo,ode,orl,0x71b8f5,rld,tes,wor}
+(1 row)
+
+SELECT show_trgm('helloıtestIworldİcode' COLLATE "C");
+                                                  show_trgm                                                  
+-------------------------------------------------------------------------------------------------------------
+ {"  c","  h","  t"," co"," he"," te",cod,"de ",ell,est,hel,iwo,"ld ",llo,"lo ",ode,orl,rld,sti,tes,tiw,wor}
+(1 row)
+
diff --git a/contrib/pg_trgm/sql/pg_trgm_collation.sql b/contrib/pg_trgm/sql/pg_trgm_collation.sql
index afb3973a8b3..e1a5c7c5fa8 100644
--- a/contrib/pg_trgm/sql/pg_trgm_collation.sql
+++ b/contrib/pg_trgm/sql/pg_trgm_collation.sql
@@ -22,3 +22,8 @@ SELECT show_trgm('ISTANBUL' COLLATE "C");
 SELECT similarity('ıstanbul' COLLATE "tr-x-icu", 'ISTANBUL' COLLATE "tr-x-icu");
 SELECT similarity('ıstanbul' COLLATE "C", 'ISTANBUL' COLLATE "C");
 
+-- Test that word boundary identification uses specified collation
+
+SELECT show_trgm('helloıtestIworldİcode' COLLATE "tr-x-icu");
+SELECT show_trgm('helloıtestIworldİcode' COLLATE "C");
+
diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h
index b6911e91458..3c4db129e20 100644
--- a/contrib/pg_trgm/trgm.h
+++ b/contrib/pg_trgm/trgm.h
@@ -47,9 +47,9 @@ typedef char trgm[3];
 } while(0)
 extern int	(*CMPTRGM) (const void *a, const void *b);
 
-#define ISWORDCHR(c, len)	(t_isalnum_with_len(c, len))
-#define ISPRINTABLECHAR(a)	( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') )
-#define ISPRINTABLETRGM(t)	( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) )
+#define ISWORDCHR(c, len, collation)	(t_isalnum_with_len_collation(c, len, collation))
+#define ISPRINTABLECHAR(a)				( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') )
+#define ISPRINTABLETRGM(t)				( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) )
 
 #define ISESCAPECHAR(x) (*(x) == '\\')	/* Wildcard escape character */
 #define ISWILDCARDCHAR(x) (*(x) == '_' || *(x) == '%')	/* Wildcard
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
index 5f2defb57f2..ca30cb7c363 100644
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -292,7 +292,7 @@ comp_trgm(const void *a, const void *b)
  * endword points to the character after word
  */
 static char *
-find_word(char *str, int lenstr, char **endword)
+find_word(char *str, int lenstr, char **endword, Oid collation)
 {
 	char	   *beginword = str;
 	const char *endstr = str + lenstr;
@@ -301,7 +301,7 @@ find_word(char *str, int lenstr, char **endword)
 	{
 		int			clen = pg_mblen_range(beginword, endstr);
 
-		if (ISWORDCHR(beginword, clen))
+		if (ISWORDCHR(beginword, clen, collation))
 			break;
 		beginword += clen;
 	}
@@ -314,7 +314,7 @@ find_word(char *str, int lenstr, char **endword)
 	{
 		int			clen = pg_mblen_range(*endword, endstr);
 
-		if (!ISWORDCHR(*endword, clen))
+		if (!ISWORDCHR(*endword, clen, collation))
 			break;
 		*endword += clen;
 	}
@@ -490,7 +490,7 @@ generate_trgm_only(growable_trgm_array *dst, char *str, int slen, Oid collation,
 	}
 
 	eword = str;
-	while ((bword = find_word(eword, slen - (eword - str), &eword)) != NULL)
+	while ((bword = find_word(eword, slen - (eword - str), &eword, collation)) != NULL)
 	{
 		int			oldlen;
 
@@ -907,7 +907,7 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
  */
 static const char *
 get_wildcard_part(const char *str, int lenstr,
-				  char *buf, int *bytelen)
+				  char *buf, int *bytelen, Oid collation)
 {
 	const char *beginword = str;
 	const char *endword;
@@ -930,7 +930,7 @@ get_wildcard_part(const char *str, int lenstr,
 
 		if (in_escape)
 		{
-			if (ISWORDCHR(beginword, clen))
+			if (ISWORDCHR(beginword, clen, collation))
 				break;
 			in_escape = false;
 			in_leading_wildcard_meta = false;
@@ -941,7 +941,7 @@ get_wildcard_part(const char *str, int lenstr,
 				in_escape = true;
 			else if (ISWILDCARDCHAR(beginword))
 				in_leading_wildcard_meta = true;
-			else if (ISWORDCHR(beginword, clen))
+			else if (ISWORDCHR(beginword, clen, collation))
 				break;
 			else
 				in_leading_wildcard_meta = false;
@@ -979,7 +979,7 @@ get_wildcard_part(const char *str, int lenstr,
 		clen = pg_mblen_range(endword, endstr);
 		if (in_escape)
 		{
-			if (ISWORDCHR(endword, clen))
+			if (ISWORDCHR(endword, clen, collation))
 			{
 				memcpy(s, endword, clen);
 				s += clen;
@@ -1006,7 +1006,7 @@ get_wildcard_part(const char *str, int lenstr,
 				in_trailing_wildcard_meta = true;
 				break;
 			}
-			else if (ISWORDCHR(endword, clen))
+			else if (ISWORDCHR(endword, clen, collation))
 			{
 				memcpy(s, endword, clen);
 				s += clen;
@@ -1070,7 +1070,7 @@ generate_wildcard_trgm(const char *str, int slen, Oid collation)
 	 */
 	eword = str;
 	while ((eword = get_wildcard_part(eword, slen - (eword - str),
-									  buf, &bytelen)) != NULL)
+									  buf, &bytelen, collation)) != NULL)
 	{
 		char	   *word;
 
diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c
index 2f190df2f65..57f7b12c3d9 100644
--- a/contrib/pg_trgm/trgm_regexp.c
+++ b/contrib/pg_trgm/trgm_regexp.c
@@ -811,7 +811,7 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA, Oid collation)
 
 			if (!clen)
 				continue;		/* ok to ignore it altogether */
-			if (ISWORDCHR(c.bytes, clen))
+			if (ISWORDCHR(c.bytes, clen, collation))
 				colorInfo->wordChars[colorInfo->wordCharsCount++] = c;
 			else
 				colorInfo->containsNonWord = true;
diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c
index df02ffb12fd..6f331e054a2 100644
--- a/src/backend/tsearch/ts_locale.c
+++ b/src/backend/tsearch/ts_locale.c
@@ -26,27 +26,27 @@ static void tsearch_readline_callback(void *arg);
 #define GENERATE_T_ISCLASS_DEF(character_class) \
 /* mblen shall be that of the first character */ \
 int \
-t_is##character_class##_with_len(const char *ptr, int mblen) \
+t_is##character_class##_with_len_collation(const char *ptr, int mblen, Oid collation) \
 { \
 	pg_wchar	wstr[WC_BUF_LEN]; \
 	int			wlen pg_attribute_unused(); \
 	wlen = pg_mb2wchar_with_len(ptr, wstr, mblen); \
 	Assert(wlen <= 1); \
 	/* pass single character, or NUL if empty */ \
-	return pg_isw##character_class(wstr[0], pg_database_locale()); \
+	return pg_isw##character_class(wstr[0], pg_newlocale_from_collation(collation)); \
 } \
 \
 /* ptr shall point to a NUL-terminated string */ \
 int \
 t_is##character_class##_cstr(const char *ptr) \
 { \
-	return t_is##character_class##_with_len(ptr, pg_mblen_cstr(ptr)); \
+	return t_is##character_class##_with_len_collation(ptr, pg_mblen_cstr(ptr), DEFAULT_COLLATION_OID); \
 } \
 /* ptr shall point to a string with pre-validated encoding */ \
 int \
 t_is##character_class##_unbounded(const char *ptr) \
 { \
-	return t_is##character_class##_with_len(ptr, pg_mblen_unbounded(ptr)); \
+	return t_is##character_class##_with_len_collation(ptr, pg_mblen_unbounded(ptr), DEFAULT_COLLATION_OID); \
 } \
 /* historical name for _unbounded */ \
 int \
diff --git a/src/include/tsearch/ts_locale.h b/src/include/tsearch/ts_locale.h
index 6e2d67ee4a5..7ad7042d523 100644
--- a/src/include/tsearch/ts_locale.h
+++ b/src/include/tsearch/ts_locale.h
@@ -18,6 +18,7 @@
 
 #include "lib/stringinfo.h"
 #include "mb/pg_wchar.h"
+#include "catalog/pg_collation.h"
 #include "utils/pg_locale.h"
 
 /* working state for tsearch_readline (should be a local var in caller) */
@@ -56,7 +57,7 @@ ts_copychar_cstr(void *dest, const void *src)
 #define COPYCHAR ts_copychar_cstr
 
 #define GENERATE_T_ISCLASS_DECL(character_class) \
-extern int	t_is##character_class##_with_len(const char *ptr, int len); \
+extern int	t_is##character_class##_with_len_collation(const char *ptr, int len, Oid collation); \
 extern int	t_is##character_class##_cstr(const char *ptr); \
 extern int	t_is##character_class##_unbounded(const char *ptr); \
 \
-- 
2.51.0