Use CASEFOLD() internally rather than LOWER()

Started by Jeff Davisabout 6 hours ago1 messages

pgsql@j-davis.com

about 6 hours ago

4 attachment(s)

There are a number of internal callers of LOWER(), and conceptually
those should all be using CASEFOLD(). Patches attached.

I'm not sure if we want the citext patch -- it would require REINDEX of
all existing citext indexes after upgrade, and there's already a
documented tip ("Consider using nondeterministic collations...), so
perhaps it's a legacy extension anyway.

It would be nice to make the tsearch change this release, as there are
already changes that could require a reindex.

I didn't change pg_trgm yet, because I think that we have to change the
regex machinery to be aware of more than two case variants first (and
potentially increasing string lengths, too).

Regards,
Jeff Davis

Attachments:

v1-0002-citext-use-CASEFOLD-rather-than-LOWER.patchtext/x-patch; charset=UTF-8; name=v1-0002-citext-use-CASEFOLD-rather-than-LOWER.patchDownload

From 8526ae1884e7d36ab9d0da06a1fd16b9ea0f7206 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Mon, 12 Jan 2026 09:04:03 -0800
Subject: [PATCH v1 2/4] citext: use CASEFOLD() rather than LOWER().

CASEFOLD() is better for case-insensitive matching in edge cases.

Existing citext indexes require REINDEX.
---
 contrib/citext/citext.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/contrib/citext/citext.c b/contrib/citext/citext.c
index a15ce5db829..02cad8c4ac3 100644
--- a/contrib/citext/citext.c
+++ b/contrib/citext/citext.c
@@ -43,15 +43,15 @@ citextcmp(text *left, text *right, Oid collid)
 	int32		result;
 
 	/*
-	 * We must do our str_tolower calls with DEFAULT_COLLATION_OID, not the
+	 * We must do our str_casefold calls with DEFAULT_COLLATION_OID, not the
 	 * input collation as you might expect.  This is so that the behavior of
 	 * citext's equality and hashing functions is not collation-dependent.  We
 	 * should change this once the core infrastructure is able to cope with
 	 * collation-dependent equality and hashing functions.
 	 */
 
-	lcstr = str_tolower(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
-	rcstr = str_tolower(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
+	lcstr = str_casefold(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
+	rcstr = str_casefold(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
 
 	result = varstr_cmp(lcstr, strlen(lcstr),
 						rcstr, strlen(rcstr),
@@ -77,8 +77,8 @@ internal_citext_pattern_cmp(text *left, text *right, Oid collid)
 				rlen;
 	int32		result;
 
-	lcstr = str_tolower(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
-	rcstr = str_tolower(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
+	lcstr = str_casefold(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
+	rcstr = str_casefold(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
 
 	llen = strlen(lcstr);
 	rlen = strlen(rcstr);
@@ -147,7 +147,7 @@ citext_hash(PG_FUNCTION_ARGS)
 	char	   *str;
 	Datum		result;
 
-	str = str_tolower(VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt), DEFAULT_COLLATION_OID);
+	str = str_casefold(VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt), DEFAULT_COLLATION_OID);
 	result = hash_any((unsigned char *) str, strlen(str));
 	pfree(str);
 
@@ -167,7 +167,7 @@ citext_hash_extended(PG_FUNCTION_ARGS)
 	char	   *str;
 	Datum		result;
 
-	str = str_tolower(VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt), DEFAULT_COLLATION_OID);
+	str = str_casefold(VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt), DEFAULT_COLLATION_OID);
 	result = hash_any_extended((unsigned char *) str, strlen(str), seed);
 	pfree(str);
 
@@ -196,8 +196,8 @@ citext_eq(PG_FUNCTION_ARGS)
 
 	/* We can't compare lengths in advance of downcasing ... */
 
-	lcstr = str_tolower(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
-	rcstr = str_tolower(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
+	lcstr = str_casefold(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
+	rcstr = str_casefold(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
 
 	/*
 	 * Since we only care about equality or not-equality, we can avoid all the
@@ -226,8 +226,8 @@ citext_ne(PG_FUNCTION_ARGS)
 
 	/* We can't compare lengths in advance of downcasing ... */
 
-	lcstr = str_tolower(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
-	rcstr = str_tolower(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
+	lcstr = str_casefold(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
+	rcstr = str_casefold(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
 
 	/*
 	 * Since we only care about equality or not-equality, we can avoid all the
-- 
2.43.0

v1-0001-ILIKE-use-CASEFOLD-rather-than-LOWER.patchtext/x-patch; charset=UTF-8; name=v1-0001-ILIKE-use-CASEFOLD-rather-than-LOWER.patchDownload

From 94b77294ac95901f07f1e2a571fad483a7409639 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Mon, 12 Jan 2026 08:58:43 -0800
Subject: [PATCH v1 1/4] ILIKE: use CASEFOLD() rather than LOWER().

For non-C locales, we casefold the entire string before performing
pattern matching with ILIKE. Previously, casefolding was done with the
LOWER() function; now that a proper CASEFOLD() function exists, use
that instead.

CASEFOLD() is better than LOWER() for case-insensitive comparisons in
builtin and ICU locales. For instance, CASEFOLD() transforms a GREEK
SMALL LETTER FINAL SIGMA (U+03C2) into GREEK SMALL LETTER SIGMA
(U+03C3) so that the two characters match in a case-insensitive
comparison; whereas LOWER() does not transform it because it's already
lowercase, so they will not match.
---
 src/backend/utils/adt/like.c               |  8 +++-----
 src/test/regress/expected/collate.utf8.out | 24 ++++++++++++++++++++++
 src/test/regress/sql/collate.utf8.sql      |  6 ++++++
 3 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c
index 2143d8658e8..b04c6cc6661 100644
--- a/src/backend/utils/adt/like.c
+++ b/src/backend/utils/adt/like.c
@@ -190,10 +190,8 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation)
 				 errmsg("nondeterministic collations are not supported for ILIKE")));
 
 	/*
-	 * For efficiency reasons, in the C locale we don't call lower() on the
+	 * For efficiency reasons, in the C locale we don't call casefold() on the
 	 * pattern and text, but instead lowercase each character lazily.
-	 *
-	 * XXX: use casefolding instead?
 	 */
 
 	if (locale->ctype_is_c)
@@ -206,11 +204,11 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation)
 	}
 	else
 	{
-		pat = DatumGetTextPP(DirectFunctionCall1Coll(lower, collation,
+		pat = DatumGetTextPP(DirectFunctionCall1Coll(casefold, collation,
 													 PointerGetDatum(pat)));
 		p = VARDATA_ANY(pat);
 		plen = VARSIZE_ANY_EXHDR(pat);
-		str = DatumGetTextPP(DirectFunctionCall1Coll(lower, collation,
+		str = DatumGetTextPP(DirectFunctionCall1Coll(casefold, collation,
 													 PointerGetDatum(str)));
 		s = VARDATA_ANY(str);
 		slen = VARSIZE_ANY_EXHDR(str);
diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out
index 0c3ab5c89b2..3d4292611e2 100644
--- a/src/test/regress/expected/collate.utf8.out
+++ b/src/test/regress/expected/collate.utf8.out
@@ -169,6 +169,18 @@ select casefold('AbCd 123 #$% ıiIİ ẞ ß Ǆǅǆ Σσς' collate PG_C_UTF8);
  abcd 123 #$% ıiiİ ß ß ǆǆǆ σσσ
 (1 row)
 
+SELECT U&'ς' ILIKE U&'σ' COLLATE PG_C_UTF8;
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT U&'straße' ILIKE U&'STRASSE' COLLATE PG_C_UTF8;
+ ?column? 
+----------
+ f
+(1 row)
+
 --
 -- Test PG_UNICODE_FAST
 --
@@ -338,3 +350,15 @@ select casefold('AbCd 123 #$% ıiIİ ẞ ß Ǆǅǆ Σσς' collate PG_UNICODE_FA
  abcd 123 #$% ıiii̇ ss ss ǆǆǆ σσσ
 (1 row)
 
+SELECT U&'ς' ILIKE U&'σ' COLLATE PG_UNICODE_FAST;
+ ?column? 
+----------
+ t
+(1 row)
+
+SELECT U&'straße' ILIKE U&'STRASSE' COLLATE PG_UNICODE_FAST;
+ ?column? 
+----------
+ t
+(1 row)
+
diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql
index d6d14220ab3..4a5e519cf07 100644
--- a/src/test/regress/sql/collate.utf8.sql
+++ b/src/test/regress/sql/collate.utf8.sql
@@ -85,6 +85,9 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed
 -- case folding
 select casefold('AbCd 123 #$% ıiIİ ẞ ß Ǆǅǆ Σσς' collate PG_C_UTF8);
 
+SELECT U&'ς' ILIKE U&'σ' COLLATE PG_C_UTF8;
+SELECT U&'straße' ILIKE U&'STRASSE' COLLATE PG_C_UTF8;
+
 --
 -- Test PG_UNICODE_FAST
 --
@@ -148,3 +151,6 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases re
 
 -- case folding
 select casefold('AbCd 123 #$% ıiIİ ẞ ß Ǆǅǆ Σσς' collate PG_UNICODE_FAST);
+
+SELECT U&'ς' ILIKE U&'σ' COLLATE PG_UNICODE_FAST;
+SELECT U&'straße' ILIKE U&'STRASSE' COLLATE PG_UNICODE_FAST;
-- 
2.43.0

v1-0003-dict_xsyn-use-CASEFOLD-rather-than-LOWER.patchtext/x-patch; charset=UTF-8; name=v1-0003-dict_xsyn-use-CASEFOLD-rather-than-LOWER.patchDownload

From 19c2d1d413d7842e6c90c237fab33fe0f93caf82 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Mon, 12 Jan 2026 09:11:56 -0800
Subject: [PATCH v1 3/4] dict_xsyn: use CASEFOLD() rather than LOWER().

CASEFOLD is better for case-insensitive matching in edge cases.
---
 contrib/dict_xsyn/dict_xsyn.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/contrib/dict_xsyn/dict_xsyn.c b/contrib/dict_xsyn/dict_xsyn.c
index 5c4917ce1fc..613527fd392 100644
--- a/contrib/dict_xsyn/dict_xsyn.c
+++ b/contrib/dict_xsyn/dict_xsyn.c
@@ -98,7 +98,7 @@ read_dictionary(DictSyn *d, const char *filename)
 		if (*line == '\0')
 			continue;
 
-		value = str_tolower(line, strlen(line), DEFAULT_COLLATION_OID);
+		value = str_casefold(line, strlen(line), DEFAULT_COLLATION_OID);
 		pfree(line);
 
 		pos = value;
@@ -215,7 +215,7 @@ dxsyn_lexize(PG_FUNCTION_ARGS)
 	{
 		char	   *temp = pnstrdup(in, length);
 
-		word.key = str_tolower(temp, length, DEFAULT_COLLATION_OID);
+		word.key = str_casefold(temp, length, DEFAULT_COLLATION_OID);
 		pfree(temp);
 		word.value = NULL;
 	}
-- 
2.43.0

v1-0004-tsearch-use-CASEFOLD-rather-than-LOWER.patchtext/x-patch; charset=UTF-8; name=v1-0004-tsearch-use-CASEFOLD-rather-than-LOWER.patchDownload

From 7dfcc58e48baf9aa9a8fa6f41c0c94b1d6d16bae Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Mon, 12 Jan 2026 09:24:16 -0800
Subject: [PATCH v1 4/4] tsearch: use CASEFOLD() rather than LOWER().

CASEFOLD() is better for case-insensitive matching in edge cases.
---
 src/backend/snowball/dict_snowball.c | 4 ++--
 src/backend/tsearch/dict_ispell.c    | 4 ++--
 src/backend/tsearch/dict_simple.c    | 4 ++--
 src/backend/tsearch/dict_synonym.c   | 6 +++---
 src/backend/tsearch/spell.c          | 6 +++---
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/backend/snowball/dict_snowball.c b/src/backend/snowball/dict_snowball.c
index 182bd156995..cb2d3061953 100644
--- a/src/backend/snowball/dict_snowball.c
+++ b/src/backend/snowball/dict_snowball.c
@@ -251,7 +251,7 @@ dsnowball_init(PG_FUNCTION_ARGS)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 						 errmsg("multiple StopWords parameters")));
-			readstoplist(defGetString(defel), &d->stoplist, str_tolower);
+			readstoplist(defGetString(defel), &d->stoplist, str_casefold);
 			stoploaded = true;
 		}
 		else if (strcmp(defel->defname, "language") == 0)
@@ -287,7 +287,7 @@ dsnowball_lexize(PG_FUNCTION_ARGS)
 	DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
 	char	   *in = (char *) PG_GETARG_POINTER(1);
 	int32		len = PG_GETARG_INT32(2);
-	char	   *txt = str_tolower(in, len, DEFAULT_COLLATION_OID);
+	char	   *txt = str_casefold(in, len, DEFAULT_COLLATION_OID);
 	TSLexeme   *res = palloc0_array(TSLexeme, 2);
 
 	/*
diff --git a/src/backend/tsearch/dict_ispell.c b/src/backend/tsearch/dict_ispell.c
index ad5c26ebccb..bdcfc836e80 100644
--- a/src/backend/tsearch/dict_ispell.c
+++ b/src/backend/tsearch/dict_ispell.c
@@ -79,7 +79,7 @@ dispell_init(PG_FUNCTION_ARGS)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 						 errmsg("multiple StopWords parameters")));
-			readstoplist(defGetString(defel), &(d->stoplist), str_tolower);
+			readstoplist(defGetString(defel), &(d->stoplist), str_casefold);
 			stoploaded = true;
 		}
 		else
@@ -128,7 +128,7 @@ dispell_lexize(PG_FUNCTION_ARGS)
 	if (len <= 0)
 		PG_RETURN_POINTER(NULL);
 
-	txt = str_tolower(in, len, DEFAULT_COLLATION_OID);
+	txt = str_casefold(in, len, DEFAULT_COLLATION_OID);
 	res = NINormalizeWord(&(d->obj), txt);
 
 	if (res == NULL)
diff --git a/src/backend/tsearch/dict_simple.c b/src/backend/tsearch/dict_simple.c
index 44d945b2be8..52df5251e20 100644
--- a/src/backend/tsearch/dict_simple.c
+++ b/src/backend/tsearch/dict_simple.c
@@ -48,7 +48,7 @@ dsimple_init(PG_FUNCTION_ARGS)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 						 errmsg("multiple StopWords parameters")));
-			readstoplist(defGetString(defel), &d->stoplist, str_tolower);
+			readstoplist(defGetString(defel), &d->stoplist, str_casefold);
 			stoploaded = true;
 		}
 		else if (strcmp(defel->defname, "accept") == 0)
@@ -81,7 +81,7 @@ dsimple_lexize(PG_FUNCTION_ARGS)
 	char	   *txt;
 	TSLexeme   *res;
 
-	txt = str_tolower(in, len, DEFAULT_COLLATION_OID);
+	txt = str_casefold(in, len, DEFAULT_COLLATION_OID);
 
 	if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
 	{
diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c
index 6dee28ae525..b5ff8c23cab 100644
--- a/src/backend/tsearch/dict_synonym.c
+++ b/src/backend/tsearch/dict_synonym.c
@@ -185,8 +185,8 @@ dsynonym_init(PG_FUNCTION_ARGS)
 		}
 		else
 		{
-			d->syn[cur].in = str_tolower(starti, strlen(starti), DEFAULT_COLLATION_OID);
-			d->syn[cur].out = str_tolower(starto, strlen(starto), DEFAULT_COLLATION_OID);
+			d->syn[cur].in = str_casefold(starti, strlen(starti), DEFAULT_COLLATION_OID);
+			d->syn[cur].out = str_casefold(starto, strlen(starto), DEFAULT_COLLATION_OID);
 		}
 
 		d->syn[cur].outlen = strlen(starto);
@@ -226,7 +226,7 @@ dsynonym_lexize(PG_FUNCTION_ARGS)
 	if (d->case_sensitive)
 		key.in = pnstrdup(in, len);
 	else
-		key.in = str_tolower(in, len, DEFAULT_COLLATION_OID);
+		key.in = str_casefold(in, len, DEFAULT_COLLATION_OID);
 
 	key.out = NULL;
 
diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c
index e3436dbddd2..e946b88f38b 100644
--- a/src/backend/tsearch/spell.c
+++ b/src/backend/tsearch/spell.c
@@ -170,7 +170,7 @@ cpstrdup(IspellDict *Conf, const char *str)
 
 
 /*
- * Apply str_tolower(), producing a temporary result (in the buildCxt).
+ * Apply str_casefold(), producing a temporary result (in the buildCxt).
  */
 static char *
 lowerstr_ctx(IspellDict *Conf, const char *src)
@@ -179,7 +179,7 @@ lowerstr_ctx(IspellDict *Conf, const char *src)
 	char	   *dst;
 
 	saveCtx = MemoryContextSwitchTo(Conf->buildCxt);
-	dst = str_tolower(src, strlen(src), DEFAULT_COLLATION_OID);
+	dst = str_casefold(src, strlen(src), DEFAULT_COLLATION_OID);
 	MemoryContextSwitchTo(saveCtx);
 
 	return dst;
@@ -1453,7 +1453,7 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
 
 	while ((recoded = tsearch_readline(&trst)) != NULL)
 	{
-		pstr = str_tolower(recoded, strlen(recoded), DEFAULT_COLLATION_OID);
+		pstr = str_casefold(recoded, strlen(recoded), DEFAULT_COLLATION_OID);
 
 		/* Skip comments and empty lines */
 		if (*pstr == '#' || *pstr == '\n')
-- 
2.43.0