From 433105e8be0ce4834c35e79d68b926ff84dc1d54 Mon Sep 17 00:00:00 2001
From: David Geier <geidav.pg@gmail.com>
Date: Fri, 23 Jan 2026 15:39:06 +0100
Subject: [PATCH v2 2/2] Use correct collation for finding word boundaries

pg_trgm finds all words in the input string and creates trigrams for
them. Word characters are alphanumeric characters, and what qualifies
as an alphanumeric character depends on the collation. Previously,
pg_trgm always used the database default locale to find word
boundaries. Now the collation specified for the input is used instead.
---
 .../pg_trgm/expected/pg_trgm_collation.out    | 13 ++++++++++++
 contrib/pg_trgm/sql/pg_trgm_collation.sql     |  5 +++++
 contrib/pg_trgm/trgm.h                        |  6 +++---
 contrib/pg_trgm/trgm_op.c                     | 21 ++++++++++---------
 contrib/pg_trgm/trgm_regexp.c                 |  2 +-
 src/backend/tsearch/ts_locale.c               |  4 ++--
 src/include/tsearch/ts_locale.h               |  9 +++++++-
 7 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/contrib/pg_trgm/expected/pg_trgm_collation.out b/contrib/pg_trgm/expected/pg_trgm_collation.out
index ad7cb822e99..c3a380cb620 100644
--- a/contrib/pg_trgm/expected/pg_trgm_collation.out
+++ b/contrib/pg_trgm/expected/pg_trgm_collation.out
@@ -33,3 +33,16 @@ SELECT similarity('ıstanbul' COLLATE "C", 'ISTANBUL' COLLATE "C");
    0.545455
 (1 row)
 
+-- Test that word boundary identification uses specified collation
+SELECT show_trgm('helloıtestIworldİcode' COLLATE "turkish");
+                                                           show_trgm                                                           
+-------------------------------------------------------------------------------------------------------------------------------
+ {0x8fc0a2,0x93dfbf,0x1bf43c,"  h"," he",0x22d44f,0x4398ff,cod,"de ",dic,ell,est,hel,ico,ldi,llo,ode,orl,0x71b8f5,rld,tes,wor}
+(1 row)
+
+SELECT show_trgm('helloıtestIworldİcode' COLLATE "C");
+                                                  show_trgm                                                  
+-------------------------------------------------------------------------------------------------------------
+ {"  c","  h","  t"," co"," he"," te",cod,"de ",ell,est,hel,iwo,"ld ",llo,"lo ",ode,orl,rld,sti,tes,tiw,wor}
+(1 row)
+
diff --git a/contrib/pg_trgm/sql/pg_trgm_collation.sql b/contrib/pg_trgm/sql/pg_trgm_collation.sql
index b0630fdb46a..a1cf9b32720 100644
--- a/contrib/pg_trgm/sql/pg_trgm_collation.sql
+++ b/contrib/pg_trgm/sql/pg_trgm_collation.sql
@@ -12,3 +12,8 @@ SELECT show_trgm('ISTANBUL' COLLATE "C");
 
 SELECT similarity('ıstanbul' COLLATE "turkish", 'ISTANBUL' COLLATE "turkish");
 SELECT similarity('ıstanbul' COLLATE "C", 'ISTANBUL' COLLATE "C");
+
+-- Test that word boundary identification uses specified collation
+
+SELECT show_trgm('helloıtestIworldİcode' COLLATE "turkish");
+SELECT show_trgm('helloıtestIworldİcode' COLLATE "C");
diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h
index 147eefaa3c6..6f0e0a20789 100644
--- a/contrib/pg_trgm/trgm.h
+++ b/contrib/pg_trgm/trgm.h
@@ -47,9 +47,9 @@ typedef char trgm[3];
 } while(0)
 extern int	(*CMPTRGM) (const void *a, const void *b);
 
-#define ISWORDCHR(c)	(t_isalnum(c))
-#define ISPRINTABLECHAR(a)	( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') )
-#define ISPRINTABLETRGM(t)	( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) )
+#define ISWORDCHR(c, collation)	(t_isalnum_collation(c, collation))
+#define ISPRINTABLECHAR(a)		( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') )
+#define ISPRINTABLETRGM(t)		( ISPRINTABLECHAR( ((char*)(t)) ) && ISPRINTABLECHAR( ((char*)(t))+1 ) && ISPRINTABLECHAR( ((char*)(t))+2 ) )
 
 #define ISESCAPECHAR(x) (*(x) == '\\')	/* Wildcard escape character */
 #define ISWILDCARDCHAR(x) (*(x) == '_' || *(x) == '%')	/* Wildcard
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
index d4c75caeff9..399e6a26f34 100644
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -220,11 +220,11 @@ comp_trgm(const void *a, const void *b)
  * endword points to the character after word
  */
 static char *
-find_word(char *str, int lenstr, char **endword, int *charlen)
+find_word(char *str, int lenstr, char **endword, int *charlen, Oid collation)
 {
 	char	   *beginword = str;
 
-	while (beginword - str < lenstr && !ISWORDCHR(beginword))
+	while (beginword - str < lenstr && !ISWORDCHR(beginword, collation))
 		beginword += pg_mblen(beginword);
 
 	if (beginword - str >= lenstr)
@@ -232,7 +232,7 @@ find_word(char *str, int lenstr, char **endword, int *charlen)
 
 	*endword = beginword;
 	*charlen = 0;
-	while (*endword - str < lenstr && ISWORDCHR(*endword))
+	while (*endword - str < lenstr && ISWORDCHR(*endword, collation))
 	{
 		*endword += pg_mblen(*endword);
 		(*charlen)++;
@@ -349,7 +349,7 @@ generate_trgm_only(trgm *trg, char *str, int slen, Oid collation, TrgmBound *bou
 	}
 
 	eword = str;
-	while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen)) != NULL)
+	while ((bword = find_word(eword, slen - (eword - str), &eword, &charlen, collation)) != NULL)
 	{
 #ifdef IGNORECASE
 		bword = str_tolower(bword, eword - bword, collation);
@@ -771,7 +771,8 @@ calc_word_similarity(char *str1, int slen1, char *str2, int slen2,
  */
 static const char *
 get_wildcard_part(const char *str, int lenstr,
-				  char *buf, int *bytelen, int *charlen)
+				  char *buf, int *bytelen, int *charlen,
+				  Oid collation)
 {
 	const char *beginword = str;
 	const char *endword;
@@ -791,7 +792,7 @@ get_wildcard_part(const char *str, int lenstr,
 	{
 		if (in_escape)
 		{
-			if (ISWORDCHR(beginword))
+			if (ISWORDCHR(beginword, collation))
 				break;
 			in_escape = false;
 			in_leading_wildcard_meta = false;
@@ -802,7 +803,7 @@ get_wildcard_part(const char *str, int lenstr,
 				in_escape = true;
 			else if (ISWILDCARDCHAR(beginword))
 				in_leading_wildcard_meta = true;
-			else if (ISWORDCHR(beginword))
+			else if (ISWORDCHR(beginword, collation))
 				break;
 			else
 				in_leading_wildcard_meta = false;
@@ -845,7 +846,7 @@ get_wildcard_part(const char *str, int lenstr,
 		clen = pg_mblen(endword);
 		if (in_escape)
 		{
-			if (ISWORDCHR(endword))
+			if (ISWORDCHR(endword, collation))
 			{
 				memcpy(s, endword, clen);
 				(*charlen)++;
@@ -873,7 +874,7 @@ get_wildcard_part(const char *str, int lenstr,
 				in_trailing_wildcard_meta = true;
 				break;
 			}
-			else if (ISWORDCHR(endword))
+			else if (ISWORDCHR(endword, collation))
 			{
 				memcpy(s, endword, clen);
 				(*charlen)++;
@@ -945,7 +946,7 @@ generate_wildcard_trgm(const char *str, int slen, Oid collation)
 	 */
 	eword = str;
 	while ((eword = get_wildcard_part(eword, slen - (eword - str),
-									  buf, &bytelen, &charlen)) != NULL)
+									  buf, &bytelen, &charlen, collation)) != NULL)
 	{
 #ifdef IGNORECASE
 		buf2 = str_tolower(buf, bytelen, collation);
diff --git a/contrib/pg_trgm/trgm_regexp.c b/contrib/pg_trgm/trgm_regexp.c
index cdd04fa01ad..a89dbe1880b 100644
--- a/contrib/pg_trgm/trgm_regexp.c
+++ b/contrib/pg_trgm/trgm_regexp.c
@@ -810,7 +810,7 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA, Oid collation)
 
 			if (!convertPgWchar(chars[j], &c, collation))
 				continue;		/* ok to ignore it altogether */
-			if (ISWORDCHR(c.bytes))
+			if (ISWORDCHR(c.bytes, collation))
 				colorInfo->wordChars[colorInfo->wordCharsCount++] = c;
 			else
 				colorInfo->containsNonWord = true;
diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c
index 1e98f321957..3caad2e5c2a 100644
--- a/src/backend/tsearch/ts_locale.c
+++ b/src/backend/tsearch/ts_locale.c
@@ -37,7 +37,7 @@ t_isalpha(const char *ptr)
 }
 
 int
-t_isalnum(const char *ptr)
+t_isalnum_collation(const char *ptr, Oid collation)
 {
 	pg_wchar	wstr[WC_BUF_LEN];
 	int			wlen pg_attribute_unused();
@@ -46,7 +46,7 @@ t_isalnum(const char *ptr)
 	Assert(wlen <= 1);
 
 	/* pass single character, or NUL if empty */
-	return pg_iswalnum(wstr[0], pg_database_locale());
+	return pg_iswalnum(wstr[0], pg_newlocale_from_collation(collation));
 }
 
 
diff --git a/src/include/tsearch/ts_locale.h b/src/include/tsearch/ts_locale.h
index cea417a91b5..b1a2a45ed9d 100644
--- a/src/include/tsearch/ts_locale.h
+++ b/src/include/tsearch/ts_locale.h
@@ -18,6 +18,7 @@
 
+#include "catalog/pg_collation.h"
 #include "lib/stringinfo.h"
 #include "mb/pg_wchar.h"
 #include "utils/pg_locale.h"
 
 /* working state for tsearch_readline (should be a local var in caller) */
@@ -40,7 +41,13 @@ typedef struct
 #define COPYCHAR(d,s)	memcpy(d, s, pg_mblen(s))
 
 extern int	t_isalpha(const char *ptr);
-extern int	t_isalnum(const char *ptr);
+extern int	t_isalnum_collation(const char *ptr, Oid collation);
+
+static inline int
+t_isalnum(const char *ptr)	/* wrapper for callers that pass no collation */
+{
+	return t_isalnum_collation(ptr, DEFAULT_COLLATION_OID); /* default collation */
+}
 
 extern bool tsearch_readline_begin(tsearch_readline_state *stp,
 								   const char *filename);
-- 
2.51.0

