Use CASEFOLD() internally rather than LOWER()
Started by Jeff Davis about 6 hours ago (1 message)
There are a number of internal callers of LOWER(), and conceptually
those should all be using CASEFOLD(). Patches attached.
I'm not sure if we want the citext patch -- it would require REINDEX of
all existing citext indexes after upgrade, and there's already a
documented tip ("Consider using nondeterministic collations..."), so
perhaps it's a legacy extension anyway.
It would be nice to make the tsearch change this release, as there are
already changes that could require a reindex.
I didn't change pg_trgm yet, because I think that we first have to change
the regex machinery to be aware of more than two case variants (and
of potentially increasing string lengths, too).
Regards,
Jeff Davis
Attachments:
v1-0002-citext-use-CASEFOLD-rather-than-LOWER.patch (text/x-patch; charset=UTF-8; name=v1-0002-citext-use-CASEFOLD-rather-than-LOWER.patch) [Download]
From 8526ae1884e7d36ab9d0da06a1fd16b9ea0f7206 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Mon, 12 Jan 2026 09:04:03 -0800
Subject: [PATCH v1 2/4] citext: use CASEFOLD() rather than LOWER().
CASEFOLD() is better for case-insensitive matching in edge cases.
Existing citext indexes require REINDEX.
---
contrib/citext/citext.c | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/contrib/citext/citext.c b/contrib/citext/citext.c
index a15ce5db829..02cad8c4ac3 100644
--- a/contrib/citext/citext.c
+++ b/contrib/citext/citext.c
@@ -43,15 +43,15 @@ citextcmp(text *left, text *right, Oid collid)
int32 result;
/*
- * We must do our str_tolower calls with DEFAULT_COLLATION_OID, not the
+ * We must do our str_casefold calls with DEFAULT_COLLATION_OID, not the
* input collation as you might expect. This is so that the behavior of
* citext's equality and hashing functions is not collation-dependent. We
* should change this once the core infrastructure is able to cope with
* collation-dependent equality and hashing functions.
*/
- lcstr = str_tolower(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
- rcstr = str_tolower(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
+ lcstr = str_casefold(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
+ rcstr = str_casefold(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
result = varstr_cmp(lcstr, strlen(lcstr),
rcstr, strlen(rcstr),
@@ -77,8 +77,8 @@ internal_citext_pattern_cmp(text *left, text *right, Oid collid)
rlen;
int32 result;
- lcstr = str_tolower(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
- rcstr = str_tolower(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
+ lcstr = str_casefold(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
+ rcstr = str_casefold(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
llen = strlen(lcstr);
rlen = strlen(rcstr);
@@ -147,7 +147,7 @@ citext_hash(PG_FUNCTION_ARGS)
char *str;
Datum result;
- str = str_tolower(VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt), DEFAULT_COLLATION_OID);
+ str = str_casefold(VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt), DEFAULT_COLLATION_OID);
result = hash_any((unsigned char *) str, strlen(str));
pfree(str);
@@ -167,7 +167,7 @@ citext_hash_extended(PG_FUNCTION_ARGS)
char *str;
Datum result;
- str = str_tolower(VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt), DEFAULT_COLLATION_OID);
+ str = str_casefold(VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt), DEFAULT_COLLATION_OID);
result = hash_any_extended((unsigned char *) str, strlen(str), seed);
pfree(str);
@@ -196,8 +196,8 @@ citext_eq(PG_FUNCTION_ARGS)
/* We can't compare lengths in advance of downcasing ... */
- lcstr = str_tolower(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
- rcstr = str_tolower(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
+ lcstr = str_casefold(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
+ rcstr = str_casefold(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
/*
* Since we only care about equality or not-equality, we can avoid all the
@@ -226,8 +226,8 @@ citext_ne(PG_FUNCTION_ARGS)
/* We can't compare lengths in advance of downcasing ... */
- lcstr = str_tolower(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
- rcstr = str_tolower(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
+ lcstr = str_casefold(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
+ rcstr = str_casefold(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
/*
* Since we only care about equality or not-equality, we can avoid all the
--
2.43.0
v1-0001-ILIKE-use-CASEFOLD-rather-than-LOWER.patch (text/x-patch; charset=UTF-8; name=v1-0001-ILIKE-use-CASEFOLD-rather-than-LOWER.patch) [Download]
From 94b77294ac95901f07f1e2a571fad483a7409639 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Mon, 12 Jan 2026 08:58:43 -0800
Subject: [PATCH v1 1/4] ILIKE: use CASEFOLD() rather than LOWER().
For non-C locales, we casefold the entire string before performing
pattern matching with ILIKE. Previously, casefolding was done with the
LOWER() function; now that a proper CASEFOLD() function exists, use
that instead.
CASEFOLD() is better than LOWER() for case-insensitive comparisons in
builtin and ICU locales. For instance, CASEFOLD() transforms a GREEK
SMALL LETTER FINAL SIGMA (U+03C2) into GREEK SMALL LETTER SIGMA
(U+03C3) so that the two characters match in a case-insensitive
comparison; whereas LOWER() does not transform it because it's already
lowercase, so they will not match.
---
src/backend/utils/adt/like.c | 8 +++-----
src/test/regress/expected/collate.utf8.out | 24 ++++++++++++++++++++++
src/test/regress/sql/collate.utf8.sql | 6 ++++++
3 files changed, 33 insertions(+), 5 deletions(-)
diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c
index 2143d8658e8..b04c6cc6661 100644
--- a/src/backend/utils/adt/like.c
+++ b/src/backend/utils/adt/like.c
@@ -190,10 +190,8 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation)
errmsg("nondeterministic collations are not supported for ILIKE")));
/*
- * For efficiency reasons, in the C locale we don't call lower() on the
+ * For efficiency reasons, in the C locale we don't call casefold() on the
* pattern and text, but instead lowercase each character lazily.
- *
- * XXX: use casefolding instead?
*/
if (locale->ctype_is_c)
@@ -206,11 +204,11 @@ Generic_Text_IC_like(text *str, text *pat, Oid collation)
}
else
{
- pat = DatumGetTextPP(DirectFunctionCall1Coll(lower, collation,
+ pat = DatumGetTextPP(DirectFunctionCall1Coll(casefold, collation,
PointerGetDatum(pat)));
p = VARDATA_ANY(pat);
plen = VARSIZE_ANY_EXHDR(pat);
- str = DatumGetTextPP(DirectFunctionCall1Coll(lower, collation,
+ str = DatumGetTextPP(DirectFunctionCall1Coll(casefold, collation,
PointerGetDatum(str)));
s = VARDATA_ANY(str);
slen = VARSIZE_ANY_EXHDR(str);
diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out
index 0c3ab5c89b2..3d4292611e2 100644
--- a/src/test/regress/expected/collate.utf8.out
+++ b/src/test/regress/expected/collate.utf8.out
@@ -169,6 +169,18 @@ select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_C_UTF8);
abcd 123 #$% ıiiİ ß ß dždždž σσσ
(1 row)
+SELECT U&'ς' ILIKE U&'σ' COLLATE PG_C_UTF8;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT U&'straße' ILIKE U&'STRASSE' COLLATE PG_C_UTF8;
+ ?column?
+----------
+ f
+(1 row)
+
--
-- Test PG_UNICODE_FAST
--
@@ -338,3 +350,15 @@ select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_UNICODE_FA
abcd 123 #$% ıiii̇ ss ss dždždž σσσ
(1 row)
+SELECT U&'ς' ILIKE U&'σ' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
+SELECT U&'straße' ILIKE U&'STRASSE' COLLATE PG_UNICODE_FAST;
+ ?column?
+----------
+ t
+(1 row)
+
diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql
index d6d14220ab3..4a5e519cf07 100644
--- a/src/test/regress/sql/collate.utf8.sql
+++ b/src/test/regress/sql/collate.utf8.sql
@@ -85,6 +85,9 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed
-- case folding
select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_C_UTF8);
+SELECT U&'ς' ILIKE U&'σ' COLLATE PG_C_UTF8;
+SELECT U&'straße' ILIKE U&'STRASSE' COLLATE PG_C_UTF8;
+
--
-- Test PG_UNICODE_FAST
--
@@ -148,3 +151,6 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases re
-- case folding
select casefold('AbCd 123 #$% ıiIİ ẞ ß DŽDždž Σσς' collate PG_UNICODE_FAST);
+
+SELECT U&'ς' ILIKE U&'σ' COLLATE PG_UNICODE_FAST;
+SELECT U&'straße' ILIKE U&'STRASSE' COLLATE PG_UNICODE_FAST;
--
2.43.0
v1-0003-dict_xsyn-use-CASEFOLD-rather-than-LOWER.patch (text/x-patch; charset=UTF-8; name=v1-0003-dict_xsyn-use-CASEFOLD-rather-than-LOWER.patch) [Download]
From 19c2d1d413d7842e6c90c237fab33fe0f93caf82 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Mon, 12 Jan 2026 09:11:56 -0800
Subject: [PATCH v1 3/4] dict_xsyn: use CASEFOLD() rather than LOWER().
CASEFOLD is better for case-insensitive matching in edge cases.
---
contrib/dict_xsyn/dict_xsyn.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/contrib/dict_xsyn/dict_xsyn.c b/contrib/dict_xsyn/dict_xsyn.c
index 5c4917ce1fc..613527fd392 100644
--- a/contrib/dict_xsyn/dict_xsyn.c
+++ b/contrib/dict_xsyn/dict_xsyn.c
@@ -98,7 +98,7 @@ read_dictionary(DictSyn *d, const char *filename)
if (*line == '\0')
continue;
- value = str_tolower(line, strlen(line), DEFAULT_COLLATION_OID);
+ value = str_casefold(line, strlen(line), DEFAULT_COLLATION_OID);
pfree(line);
pos = value;
@@ -215,7 +215,7 @@ dxsyn_lexize(PG_FUNCTION_ARGS)
{
char *temp = pnstrdup(in, length);
- word.key = str_tolower(temp, length, DEFAULT_COLLATION_OID);
+ word.key = str_casefold(temp, length, DEFAULT_COLLATION_OID);
pfree(temp);
word.value = NULL;
}
--
2.43.0
v1-0004-tsearch-use-CASEFOLD-rather-than-LOWER.patch (text/x-patch; charset=UTF-8; name=v1-0004-tsearch-use-CASEFOLD-rather-than-LOWER.patch) [Download]
From 7dfcc58e48baf9aa9a8fa6f41c0c94b1d6d16bae Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Mon, 12 Jan 2026 09:24:16 -0800
Subject: [PATCH v1 4/4] tsearch: use CASEFOLD() rather than LOWER().
CASEFOLD() is better for case-insensitive matching in edge cases.
---
src/backend/snowball/dict_snowball.c | 4 ++--
src/backend/tsearch/dict_ispell.c | 4 ++--
src/backend/tsearch/dict_simple.c | 4 ++--
src/backend/tsearch/dict_synonym.c | 6 +++---
src/backend/tsearch/spell.c | 6 +++---
5 files changed, 12 insertions(+), 12 deletions(-)
diff --git a/src/backend/snowball/dict_snowball.c b/src/backend/snowball/dict_snowball.c
index 182bd156995..cb2d3061953 100644
--- a/src/backend/snowball/dict_snowball.c
+++ b/src/backend/snowball/dict_snowball.c
@@ -251,7 +251,7 @@ dsnowball_init(PG_FUNCTION_ARGS)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("multiple StopWords parameters")));
- readstoplist(defGetString(defel), &d->stoplist, str_tolower);
+ readstoplist(defGetString(defel), &d->stoplist, str_casefold);
stoploaded = true;
}
else if (strcmp(defel->defname, "language") == 0)
@@ -287,7 +287,7 @@ dsnowball_lexize(PG_FUNCTION_ARGS)
DictSnowball *d = (DictSnowball *) PG_GETARG_POINTER(0);
char *in = (char *) PG_GETARG_POINTER(1);
int32 len = PG_GETARG_INT32(2);
- char *txt = str_tolower(in, len, DEFAULT_COLLATION_OID);
+ char *txt = str_casefold(in, len, DEFAULT_COLLATION_OID);
TSLexeme *res = palloc0_array(TSLexeme, 2);
/*
diff --git a/src/backend/tsearch/dict_ispell.c b/src/backend/tsearch/dict_ispell.c
index ad5c26ebccb..bdcfc836e80 100644
--- a/src/backend/tsearch/dict_ispell.c
+++ b/src/backend/tsearch/dict_ispell.c
@@ -79,7 +79,7 @@ dispell_init(PG_FUNCTION_ARGS)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("multiple StopWords parameters")));
- readstoplist(defGetString(defel), &(d->stoplist), str_tolower);
+ readstoplist(defGetString(defel), &(d->stoplist), str_casefold);
stoploaded = true;
}
else
@@ -128,7 +128,7 @@ dispell_lexize(PG_FUNCTION_ARGS)
if (len <= 0)
PG_RETURN_POINTER(NULL);
- txt = str_tolower(in, len, DEFAULT_COLLATION_OID);
+ txt = str_casefold(in, len, DEFAULT_COLLATION_OID);
res = NINormalizeWord(&(d->obj), txt);
if (res == NULL)
diff --git a/src/backend/tsearch/dict_simple.c b/src/backend/tsearch/dict_simple.c
index 44d945b2be8..52df5251e20 100644
--- a/src/backend/tsearch/dict_simple.c
+++ b/src/backend/tsearch/dict_simple.c
@@ -48,7 +48,7 @@ dsimple_init(PG_FUNCTION_ARGS)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("multiple StopWords parameters")));
- readstoplist(defGetString(defel), &d->stoplist, str_tolower);
+ readstoplist(defGetString(defel), &d->stoplist, str_casefold);
stoploaded = true;
}
else if (strcmp(defel->defname, "accept") == 0)
@@ -81,7 +81,7 @@ dsimple_lexize(PG_FUNCTION_ARGS)
char *txt;
TSLexeme *res;
- txt = str_tolower(in, len, DEFAULT_COLLATION_OID);
+ txt = str_casefold(in, len, DEFAULT_COLLATION_OID);
if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
{
diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c
index 6dee28ae525..b5ff8c23cab 100644
--- a/src/backend/tsearch/dict_synonym.c
+++ b/src/backend/tsearch/dict_synonym.c
@@ -185,8 +185,8 @@ dsynonym_init(PG_FUNCTION_ARGS)
}
else
{
- d->syn[cur].in = str_tolower(starti, strlen(starti), DEFAULT_COLLATION_OID);
- d->syn[cur].out = str_tolower(starto, strlen(starto), DEFAULT_COLLATION_OID);
+ d->syn[cur].in = str_casefold(starti, strlen(starti), DEFAULT_COLLATION_OID);
+ d->syn[cur].out = str_casefold(starto, strlen(starto), DEFAULT_COLLATION_OID);
}
d->syn[cur].outlen = strlen(starto);
@@ -226,7 +226,7 @@ dsynonym_lexize(PG_FUNCTION_ARGS)
if (d->case_sensitive)
key.in = pnstrdup(in, len);
else
- key.in = str_tolower(in, len, DEFAULT_COLLATION_OID);
+ key.in = str_casefold(in, len, DEFAULT_COLLATION_OID);
key.out = NULL;
diff --git a/src/backend/tsearch/spell.c b/src/backend/tsearch/spell.c
index e3436dbddd2..e946b88f38b 100644
--- a/src/backend/tsearch/spell.c
+++ b/src/backend/tsearch/spell.c
@@ -170,7 +170,7 @@ cpstrdup(IspellDict *Conf, const char *str)
/*
- * Apply str_tolower(), producing a temporary result (in the buildCxt).
+ * Apply str_casefold(), producing a temporary result (in the buildCxt).
*/
static char *
lowerstr_ctx(IspellDict *Conf, const char *src)
@@ -179,7 +179,7 @@ lowerstr_ctx(IspellDict *Conf, const char *src)
char *dst;
saveCtx = MemoryContextSwitchTo(Conf->buildCxt);
- dst = str_tolower(src, strlen(src), DEFAULT_COLLATION_OID);
+ dst = str_casefold(src, strlen(src), DEFAULT_COLLATION_OID);
MemoryContextSwitchTo(saveCtx);
return dst;
@@ -1453,7 +1453,7 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
while ((recoded = tsearch_readline(&trst)) != NULL)
{
- pstr = str_tolower(recoded, strlen(recoded), DEFAULT_COLLATION_OID);
+ pstr = str_casefold(recoded, strlen(recoded), DEFAULT_COLLATION_OID);
/* Skip comments and empty lines */
if (*pstr == '#' || *pstr == '\n')
--
2.43.0