From 4f8fa0fcc3efe4297aca58ee28f047c47a576d84 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Mon, 6 Oct 2025 13:05:17 -0700
Subject: [PATCH v1 5/6] tsearch: use database default collation for parsing.

Previously, tsearch used the database's CTYPE setting, which only
matches the database default collation if the locale provider is libc.

Note that tsearch types (tsvector and tsquery) are not collatable
types. The locale affects parsing the original text, which is a lossy
process, so a COLLATE clause on the already-parsed value would not
make sense.
---
 src/backend/tsearch/ts_locale.c   | 40 ++++++-----------
 src/backend/tsearch/wparser_def.c | 71 ++++++-------------------------
 2 files changed, 27 insertions(+), 84 deletions(-)

diff --git a/src/backend/tsearch/ts_locale.c b/src/backend/tsearch/ts_locale.c
index 4801fe90089..9db13b72f99 100644
--- a/src/backend/tsearch/ts_locale.c
+++ b/src/backend/tsearch/ts_locale.c
@@ -20,45 +20,33 @@
 static void tsearch_readline_callback(void *arg);
 
 
-/*
- * The reason these functions use a 3-wchar_t output buffer, not 2 as you
- * might expect, is that on Windows "wchar_t" is 16 bits and what we'll be
- * getting from char2wchar() is UTF16 not UTF32.  A single input character
- * may therefore produce a surrogate pair rather than just one wchar_t;
- * we also need room for a trailing null.  When we do get a surrogate pair,
- * we pass just the first code to iswdigit() etc, so that these functions will
- * always return false for characters outside the Basic Multilingual Plane.
- */
-#define WC_BUF_LEN  3
+/* space for a single character plus a trailing NUL */
+#define WC_BUF_LEN  2
 
 int
 t_isalpha(const char *ptr)
 {
-	int			clen = pg_mblen(ptr);
-	wchar_t		character[WC_BUF_LEN];
-	locale_t	mylocale = 0;	/* TODO */
+	pg_wchar	wstr[WC_BUF_LEN];
+	int			wlen pg_attribute_unused();	/* only read by the Assert */
 
-	if (clen == 1 || database_ctype_is_c)
-		return isalpha(TOUCHAR(ptr));
+	wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr));
+	Assert(wlen <= 1);
 
-	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
-
-	return iswalpha((wint_t) character[0]);
+	/* classify wstr[0]: the one decoded character, or NUL if ptr was empty */
+	return pg_wc_isalpha(wstr[0], pg_database_locale());
 }
 
 int
 t_isalnum(const char *ptr)
 {
-	int			clen = pg_mblen(ptr);
-	wchar_t		character[WC_BUF_LEN];
-	locale_t	mylocale = 0;	/* TODO */
-
-	if (clen == 1 || database_ctype_is_c)
-		return isalnum(TOUCHAR(ptr));
+	pg_wchar	wstr[WC_BUF_LEN];
+	int			wlen pg_attribute_unused();	/* only read by the Assert */
 
-	char2wchar(character, WC_BUF_LEN, ptr, clen, mylocale);
+	wlen = pg_mb2wchar_with_len(ptr, wstr, pg_mblen(ptr));
+	Assert(wlen <= 1);
 
-	return iswalnum((wint_t) character[0]);
+	/* classify wstr[0]: the one decoded character, or NUL if ptr was empty */
+	return pg_wc_isalnum(wstr[0], pg_database_locale());
 }
 
 
diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c
index e2dd3da3aa3..e9129040422 100644
--- a/src/backend/tsearch/wparser_def.c
+++ b/src/backend/tsearch/wparser_def.c
@@ -243,9 +243,7 @@ typedef struct TParser
 	/* string and position information */
 	char	   *str;			/* multibyte string */
 	int			lenstr;			/* length of mbstring */
-	wchar_t    *wstr;			/* wide character string */
-	pg_wchar   *pgwstr;			/* wide character string for C-locale */
+	pg_wchar   *pgwstr;			/* wide character string (pg_wchar) */
-	bool		usewide;
 
 	/* State of parse */
 	int			charmaxlen;
@@ -293,33 +291,8 @@ TParserInit(char *str, int len)
 	prs->charmaxlen = pg_database_encoding_max_length();
 	prs->str = str;
 	prs->lenstr = len;
-
-	/*
-	 * Use wide char code only when max encoding length > 1.
-	 */
-	if (prs->charmaxlen > 1)
-	{
-		locale_t	mylocale = 0;	/* TODO */
-
-		prs->usewide = true;
-		if (database_ctype_is_c)
-		{
-			/*
-			 * char2wchar doesn't work for C-locale and sizeof(pg_wchar) could
-			 * be different from sizeof(wchar_t)
-			 */
-			prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
-			pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
-		}
-		else
-		{
-			prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
-			char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr,
-					   mylocale);
-		}
-	}
-	else
-		prs->usewide = false;
+	prs->pgwstr = (pg_wchar *) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
+	pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
 
 	prs->state = newTParserPosition(NULL);
 	prs->state->state = TPS_Base;
@@ -350,12 +323,9 @@ TParserCopyInit(const TParser *orig)
 	prs->charmaxlen = orig->charmaxlen;
 	prs->str = orig->str + orig->state->posbyte;
 	prs->lenstr = orig->lenstr - orig->state->posbyte;
-	prs->usewide = orig->usewide;
 
 	if (orig->pgwstr)
 		prs->pgwstr = orig->pgwstr + orig->state->poschar;
-	if (orig->wstr)
-		prs->wstr = orig->wstr + orig->state->poschar;
 
 	prs->state = newTParserPosition(NULL);
 	prs->state->state = TPS_Base;
@@ -379,8 +349,6 @@ TParserClose(TParser *prs)
 		prs->state = ptr;
 	}
 
-	if (prs->wstr)
-		pfree(prs->wstr);
 	if (prs->pgwstr)
 		pfree(prs->pgwstr);
 
@@ -412,13 +380,9 @@ TParserCopyClose(TParser *prs)
 
 
 /*
- * Character-type support functions, equivalent to is* macros, but
- * working with any possible encodings and locales. Notes:
- *	- with multibyte encoding and C-locale isw* function may fail
- *	  or give wrong result.
- *	- multibyte encoding and C-locale often are used for
- *	  Asian languages.
- *	- if locale is C then we use pgwstr instead of wstr.
+ * Character-type support functions using the database default locale. If the
+ * encoding is multibyte, the locale is C, and the character is non-ASCII, the
+ * result is given by the 'nonascii' macro argument.
  */
 
 #define p_iswhat(type, nonascii)											\
@@ -426,19 +390,13 @@ TParserCopyClose(TParser *prs)
 static int																	\
 p_is##type(TParser *prs)													\
 {																			\
+	pg_locale_t locale = pg_database_locale();								\
+	pg_wchar	wc;															\
 	Assert(prs->state);														\
-	if (prs->usewide)														\
-	{																		\
-		if (prs->pgwstr)													\
-		{																	\
-			unsigned int c = *(prs->pgwstr + prs->state->poschar);			\
-			if (c > 0x7f)													\
-				return nonascii;											\
-			return is##type(c);												\
-		}																	\
-		return isw##type(*(prs->wstr + prs->state->poschar));				\
-	}																		\
-	return is##type(*(unsigned char *) (prs->str + prs->state->posbyte));	\
+	wc = prs->pgwstr[prs->state->poschar];									\
+	if (prs->charmaxlen > 1 && locale->ctype_is_c && wc > 0x7f)				\
+		return nonascii;													\
+	return pg_wc_is##type(wc, locale);										\
 }																			\
 																			\
 static int																	\
@@ -703,7 +661,7 @@ p_isspecial(TParser *prs)
 	 * Check that only in utf encoding, because other encodings aren't
 	 * supported by postgres or even exists.
 	 */
-	if (GetDatabaseEncoding() == PG_UTF8 && prs->usewide)
+	if (GetDatabaseEncoding() == PG_UTF8)
 	{
 		static const pg_wchar strange_letter[] = {
 			/*
@@ -944,10 +902,7 @@ p_isspecial(TParser *prs)
 				   *StopMiddle;
 		pg_wchar	c;
 
-		if (prs->pgwstr)
-			c = *(prs->pgwstr + prs->state->poschar);
-		else
-			c = (pg_wchar) *(prs->wstr + prs->state->poschar);
+		c = *(prs->pgwstr + prs->state->poschar);
 
 		while (StopLow < StopHigh)
 		{
-- 
2.43.0