From a9d6dddb3bfff78f191e29d8f2be69332f88cd20 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Tue, 16 Sep 2025 12:02:03 -0700
Subject: [PATCH v1] Introduce case-insensitive PG_UNICODE_CI.

CATVERSION BUMP
---
 doc/src/sgml/charset.sgml                  |  31 ++++-
 doc/src/sgml/func/func-string.sgml         |  14 ++-
 doc/src/sgml/ref/create_collation.sgml     |   5 +-
 src/backend/commands/collationcmds.c       |   4 +-
 src/backend/commands/dbcommands.c          |   6 +
 src/backend/utils/adt/pg_locale.c          |   4 +
 src/backend/utils/adt/pg_locale_builtin.c  | 128 ++++++++++++++++++++-
 src/bin/initdb/initdb.c                    |   3 +
 src/common/unicode_case.c                  |   3 +
 src/include/catalog/pg_collation.dat       |   5 +
 src/include/common/unicode_case.h          |   3 +
 src/test/regress/expected/collate.utf8.out |  39 +++++++
 src/test/regress/sql/collate.utf8.sql      |  28 +++++
 13 files changed, 257 insertions(+), 16 deletions(-)

diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml
index 59b27c3c370..4805a51dd37 100644
--- a/doc/src/sgml/charset.sgml
+++ b/doc/src/sgml/charset.sgml
@@ -377,8 +377,9 @@ initdb --locale-provider=icu --icu-locale=en
      <listitem>
       <para>
        The <literal>builtin</literal> provider uses built-in operations. Only
-       the <literal>C</literal>, <literal>C.UTF-8</literal>, and
-       <literal>PG_UNICODE_FAST</literal> locales are supported for this
+       the <literal>C</literal>, <literal>C.UTF-8</literal>,
+       <literal>PG_UNICODE_FAST</literal>, and
+       <literal>PG_UNICODE_CI</literal> locales are supported for this
        provider.
       </para>
       <para>
@@ -400,6 +401,14 @@ initdb --locale-provider=icu --icu-locale=en
        regular expression character classes are based on the "Standard"
        semantics, and the case mapping is the "full" variant.
       </para>
+      <para>
+       The <literal>PG_UNICODE_CI</literal> locale is available only when the
+       database encoding is <literal>UTF-8</literal>, and the behavior is
+       based on Unicode. The collation is case-insensitive, based on Unicode
+       Default Caseless Matching. The regular expression character classes are
+       based on the "Standard" semantics, and the case mapping is the "full"
+       variant.
+      </para>
      </listitem>
     </varlistentry>
 
@@ -911,6 +920,24 @@ SELECT * FROM test1 ORDER BY a || b COLLATE "fr_FR";
       </listitem>
      </varlistentry>
 
+     <varlistentry>
+      <term><literal>pg_unicode_ci</literal></term>
+      <listitem>
+       <para>
+        This collation is case-insensitive, based on the Unicode code point
+        values after case-folding the input.  This is not a natural language
+        order.  For the functions <function>lower</function>,
+        <function>initcap</function>, and <function>upper</function> it uses
+        Unicode full case mapping.  For pattern matching (including regular
+        expressions), it uses the Standard variant of Unicode <ulink
+        url="https://www.unicode.org/reports/tr18/#Compatibility_Properties">Compatibility
+        Properties</ulink>.  Behavior is efficient and stable within a
+        <productname>Postgres</productname> major version.  It is only
+        available for encoding <literal>UTF8</literal>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry>
       <term><literal>pg_c_utf8</literal></term>
       <listitem>
diff --git a/doc/src/sgml/func/func-string.sgml b/doc/src/sgml/func/func-string.sgml
index 01cc94c234e..26cfddba199 100644
--- a/doc/src/sgml/func/func-string.sgml
+++ b/doc/src/sgml/func/func-string.sgml
@@ -695,12 +695,14 @@
        <para>
         Converts the first letter of each word to upper case (or title case
         if the letter is a digraph and locale is <literal>ICU</literal> or
-        <literal>builtin</literal> <literal>PG_UNICODE_FAST</literal>)
-        and the rest to lower case. When using the <literal>libc</literal> or
-        <literal>builtin</literal> locale provider, words are sequences of
-        alphanumeric characters separated by non-alphanumeric characters;
-        when using the ICU locale provider, words are separated according to
-        <ulink url="https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ustring_8h.html#a47602e2c2012d77ee91908b9bbfdc063">u_strToTitle ICU function</ulink>.
+        <literal>builtin</literal> <literal>PG_UNICODE_FAST</literal> or
+        <literal>PG_UNICODE_CI</literal>) and the rest to lower case. When
+        using the <literal>libc</literal> or <literal>builtin</literal> locale
+        provider, words are sequences of alphanumeric characters separated by
+        non-alphanumeric characters; when using the ICU locale provider, words
+        are separated according to <ulink
+        url="https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ustring_8h.html#a47602e2c2012d77ee91908b9bbfdc063">u_strToTitle
+        ICU function</ulink>.
        </para>
        <para>
         This function is primarily used for convenient
diff --git a/doc/src/sgml/ref/create_collation.sgml b/doc/src/sgml/ref/create_collation.sgml
index 4af1836ae30..f50a2aa3348 100644
--- a/doc/src/sgml/ref/create_collation.sgml
+++ b/doc/src/sgml/ref/create_collation.sgml
@@ -99,8 +99,9 @@ CREATE COLLATION [ IF NOT EXISTS ] <replaceable>name</replaceable> FROM <replace
       <para>
        If <replaceable>provider</replaceable> is <literal>builtin</literal>,
        then <replaceable>locale</replaceable> must be specified and set to
-       either <literal>C</literal>, <literal>C.UTF-8</literal> or
-       <literal>PG_UNICODE_FAST</literal>.
+       either <literal>C</literal>, <literal>C.UTF-8</literal>,
+       <literal>PG_UNICODE_FAST</literal>, or
+       <literal>PG_UNICODE_CI</literal>.
       </para>
      </listitem>
     </varlistentry>
diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c
index 8acbfbbeda0..444a3fc8b37 100644
--- a/src/backend/commands/collationcmds.c
+++ b/src/backend/commands/collationcmds.c
@@ -306,7 +306,9 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e
 		 * difference. So we can save writing the code for the other
 		 * providers.
 		 */
-		if (!collisdeterministic && collprovider != COLLPROVIDER_ICU)
+		if (!collisdeterministic &&
+			(collprovider == COLLPROVIDER_LIBC ||
+			 (collprovider == COLLPROVIDER_BUILTIN && strcmp(colllocale, "PG_UNICODE_CI") != 0)))
 			ereport(ERROR,
 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 					 errmsg("nondeterministic collations not supported with this provider")));
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 2793fd83771..fddc47b1973 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1140,6 +1140,12 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 					 errmsg("LOCALE or BUILTIN_LOCALE must be specified")));
 
 		dblocale = builtin_validate_locale(encoding, dblocale);
+
+		if (strcmp(dblocale, "PG_UNICODE_CI") == 0)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("cannot use locale \"%s\" for the database collation",
+							dblocale)));
 	}
 	else if (dblocprovider == COLLPROVIDER_ICU)
 	{
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 97c2ac1faf9..ddee424757b 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1443,6 +1443,8 @@ builtin_locale_encoding(const char *locale)
 		return PG_UTF8;
 	else if (strcmp(locale, "PG_UNICODE_FAST") == 0)
 		return PG_UTF8;
+	else if (strcmp(locale, "PG_UNICODE_CI") == 0)
+		return PG_UTF8;
 
 
 	ereport(ERROR,
@@ -1470,6 +1472,8 @@ builtin_validate_locale(int encoding, const char *locale)
 		canonical_name = "C.UTF-8";
 	else if (strcmp(locale, "PG_UNICODE_FAST") == 0)
 		canonical_name = "PG_UNICODE_FAST";
+	else if (strcmp(locale, "PG_UNICODE_CI") == 0)
+		canonical_name = "PG_UNICODE_CI";
 
 	if (!canonical_name)
 		ereport(ERROR,
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 0c9fbdb40f2..8018a3be437 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -15,6 +15,7 @@
 #include "catalog/pg_collation.h"
 #include "common/unicode_case.h"
 #include "common/unicode_category.h"
+#include "common/unicode_version.h"
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "utils/builtins.h"
@@ -25,6 +26,13 @@ extern pg_locale_t create_pg_locale_builtin(Oid collid,
 											MemoryContext context);
 extern char *get_collation_actual_version_builtin(const char *collcollate);
 
+static int strncoll_builtin_ci(const char *arg1, ssize_t len1,
+							   const char *arg2, ssize_t len2,
+							   pg_locale_t locale);
+static size_t strnxfrm_builtin_ci(char *dest, size_t destsize,
+								  const char *src, ssize_t srclen,
+								  pg_locale_t locale);
+
 struct WordBoundaryState
 {
 	const char *str;
@@ -182,6 +190,13 @@ wc_tolower_builtin(pg_wchar wc, pg_locale_t locale)
 	return unicode_lowercase_simple(wc);
 }
 
+static const struct collate_methods collate_methods_builtin_ci = {
+	.strncoll = strncoll_builtin_ci,
+	.strnxfrm = strnxfrm_builtin_ci,
+	.strnxfrm_prefix = strnxfrm_builtin_ci,
+	.strxfrm_is_safe = true,
+};
+
 static const struct ctype_methods ctype_methods_builtin = {
 	.strlower = strlower_builtin,
 	.strtitle = strtitle_builtin,
@@ -239,9 +254,12 @@ create_pg_locale_builtin(Oid collid, MemoryContext context)
 	result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
 
 	result->info.builtin.locale = MemoryContextStrdup(context, locstr);
-	result->info.builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0);
-	result->deterministic = true;
-	result->collate_is_c = true;
+	result->info.builtin.casemap_full = (strcmp(locstr, "PG_UNICODE_FAST") == 0) ||
+		(strcmp(locstr, "PG_UNICODE_CI") == 0);
+	result->deterministic = (strcmp(locstr, "PG_UNICODE_CI") != 0);
+	result->collate_is_c = (strcmp(locstr, "PG_UNICODE_CI") != 0);
+	if (!result->collate_is_c)
+		result->collate = &collate_methods_builtin_ci;
 	result->ctype_is_c = (strcmp(locstr, "C") == 0);
 	if (!result->ctype_is_c)
 		result->ctype = &ctype_methods_builtin;
@@ -253,8 +271,11 @@ char *
 get_collation_actual_version_builtin(const char *collcollate)
 {
 	/*
-	 * The only two supported locales (C and C.UTF-8) are both based on memcmp
-	 * and are not expected to change, but track the version anyway.
+	 * Locales C, C.UTF-8, and PG_UNICODE_FAST are based on memcmp and are not
+	 * expected to change, but track the version anyway.
+	 *
+	 * PG_UNICODE_CI has collation behavior dependent on the version of
+	 * Unicode, so use that for the collation version.
 	 *
 	 * Note that the character semantics may change for some locales, but the
 	 * collation version only tracks changes to sort order.
@@ -265,6 +286,8 @@ get_collation_actual_version_builtin(const char *collcollate)
 		return "1";
 	else if (strcmp(collcollate, "PG_UNICODE_FAST") == 0)
 		return "1";
+	else if (strcmp(collcollate, "PG_UNICODE_CI") == 0)
+		return PG_UNICODE_VERSION;
 	else
 		ereport(ERROR,
 				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
@@ -273,3 +296,98 @@ get_collation_actual_version_builtin(const char *collcollate)
 
 	return NULL;				/* keep compiler quiet */
 }
+
+/*
+ * strncoll_builtin_ci
+ *
+ * Compare two strings such that the result is equivalent to
+ * strcmp(CASEFOLD(arg1), CASEFOLD(arg2)).
+ */
+int
+strncoll_builtin_ci(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
+					pg_locale_t locale)
+{
+	char		 buf1[UNICODE_CASEMAP_BUFSZ];
+	char		 buf2[UNICODE_CASEMAP_BUFSZ];
+	size_t		 nbytes1 = 0; /* bytes stored in buf1 */
+	size_t		 nbytes2 = 0; /* bytes stored in buf2 */
+	const char	*p1	   = arg1;
+	const char	*p2	   = arg2;
+	ssize_t		 r1	   = (len1 >= 0) ? len1 : strlen(arg1);
+	ssize_t		 r2	   = (len2 >= 0) ? len2 : strlen(arg2);
+	const bool	 full  = locale->info.builtin.casemap_full;
+
+	/*
+	 * Folding the entire string at once could be wasteful. Fold the arguments
+	 * lazily into buf1 and buf2; comparing until we find a difference.
+	 *
+	 * We need to keep track of buffer contents across iterations, because
+	 * folding from the two inputs could result in different numbers of output
+	 * bytes (or even different numbers of codepoints). If the comparison is
+	 * still inconclusive, we need to keep the remaining bytes around for the
+	 * next iteration.
+	 */
+	while ((r1 > 0 || nbytes1 > 0) && (r2 > 0 || nbytes2 > 0))
+	{
+		int		ulen1 = 0;
+		int		ulen2 = 0;
+		size_t	nbytes_both;
+		int		result;
+
+		Assert(nbytes1 == 0 || nbytes2 == 0);
+
+		/* if a buffer is empty, fold one codepoint */
+		if (nbytes1 == 0)
+		{
+			ulen1 = pg_utf_mblen((unsigned char *)p1);
+			Assert(ulen1 <= r1);
+			nbytes1 = unicode_strfold(buf1, UNICODE_CASEMAP_BUFSZ, p1, ulen1,
+									  full);
+		}
+
+		if (nbytes2 == 0)
+		{
+			ulen2 = pg_utf_mblen((unsigned char *)p2);
+			Assert(ulen2 <= r2);
+			nbytes2 = unicode_strfold(buf2, UNICODE_CASEMAP_BUFSZ, p2, ulen2,
+									  full);
+		}
+
+		Assert(nbytes1 > 0 && nbytes1 <= UNICODE_CASEMAP_BUFSZ);
+		Assert(nbytes2 > 0 && nbytes2 <= UNICODE_CASEMAP_BUFSZ);
+
+		/* compare the corresponding bytes available in both buffers */
+		nbytes_both = Min(nbytes1, nbytes2);
+		result = memcmp(buf1, buf2, nbytes_both);
+
+		if (result != 0)
+			return result;
+
+		/* shift any remaining bytes in the buffers to the beginning */
+		nbytes1 -= nbytes_both;
+		nbytes2 -= nbytes_both;
+		memmove(buf1, buf1 + nbytes_both, nbytes1);
+		memmove(buf2, buf2 + nbytes_both, nbytes2);
+
+		p1 += ulen1;
+		r1 -= ulen1;
+		p2 += ulen2;
+		r2 -= ulen2;
+	}
+
+	if ((r1 == 0 && nbytes1 == 0) && !(r2 == 0 && nbytes2 == 0))
+		return -1; /* arg1 exhausted */
+	else if (!(r1 == 0 && nbytes1 == 0) && (r2 == 0 && nbytes2 == 0))
+		return 1; /* arg2 exhausted */
+	else
+		return 0; /* both inputs exhausted */
+}
+
+/* 'srclen' of -1 means the strings are NUL-terminated */
+size_t
+strnxfrm_builtin_ci(char *dest, size_t destsize, const char *src, ssize_t srclen,
+					pg_locale_t locale)
+{
+	return unicode_strfold(dest, destsize, src, srclen,
+						   locale->info.builtin.casemap_full);
+}
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 92fe2f531f7..35f0adf1103 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -2478,6 +2478,9 @@ setlocales(void)
 			canonname = "C.UTF-8";
 		else if (strcmp(datlocale, "PG_UNICODE_FAST") == 0)
 			canonname = "PG_UNICODE_FAST";
+		else if (strcmp(datlocale, "PG_UNICODE_CI") == 0)
+			pg_fatal("cannot use locale \"%s\" for the database collation",
+					 datlocale);
 		else
 			pg_fatal("invalid locale name \"%s\" for builtin provider",
 					 datlocale);
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index 073faf6a0d5..562b5d3a054 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -20,6 +20,9 @@
 #include "common/unicode_category.h"
 #include "mb/pg_wchar.h"
 
+StaticAssertDecl(UNICODE_CASEMAP_BUFSZ == MAX_CASE_EXPANSION * sizeof(pg_wchar),
+				 "UNICODE_CASEMAP_BUFSZ miscalculated");
+
 enum CaseMapResult
 {
 	CASEMAP_SELF,
diff --git a/src/include/catalog/pg_collation.dat b/src/include/catalog/pg_collation.dat
index 8cfd09f0314..a8e03188739 100644
--- a/src/include/catalog/pg_collation.dat
+++ b/src/include/catalog/pg_collation.dat
@@ -37,5 +37,10 @@
   descr => 'sorts by Unicode code point; Unicode character semantics',
   collname => 'pg_unicode_fast', collprovider => 'b', collencoding => '6',
   colllocale => 'PG_UNICODE_FAST', collversion => '1' },
+{ oid => '6434',
+  descr => 'sorts by Unicode code point; Unicode character semantics',
+  collname => 'pg_unicode_ci', collprovider => 'b', collencoding => '6',
+  collisdeterministic => 'f',
+  colllocale => 'PG_UNICODE_CI', collversion => '16.0' },
 
 ]
diff --git a/src/include/common/unicode_case.h b/src/include/common/unicode_case.h
index 41e2c1f4b33..6e74edd66f2 100644
--- a/src/include/common/unicode_case.h
+++ b/src/include/common/unicode_case.h
@@ -16,6 +16,9 @@
 
 #include "mb/pg_wchar.h"
 
+/* buffer size needed to map a single codepoint */
+#define UNICODE_CASEMAP_BUFSZ	12
+
 typedef size_t (*WordBoundaryNext) (void *wbstate);
 
 pg_wchar	unicode_lowercase_simple(pg_wchar code);
diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out
index 0c3ab5c89b2..b6a87e340fa 100644
--- a/src/test/regress/expected/collate.utf8.out
+++ b/src/test/regress/expected/collate.utf8.out
@@ -338,3 +338,42 @@ select casefold('AbCd 123 #$% ıiIİ ẞ ß Ǆǅǆ Σσς' collate PG_UNICODE_FA
  abcd 123 #$% ıiii̇ ss ss ǆǆǆ σσσ
 (1 row)
 
+--
+-- Test PG_UNICODE_CI
+--
+CREATE COLLATION regress_pg_unicode_ci (
+  provider = builtin, locale = 'PG_UNICODE_CI');
+DROP COLLATION regress_pg_unicode_ci;
+CREATE TABLE test_pg_unicode_ci (
+  t TEXT COLLATE PG_UNICODE_CI
+);
+INSERT INTO test_pg_unicode_ci VALUES
+       ('ABC'),
+       ('aBc'),
+       ('ABB'),
+       ('ẞß'),
+       ('sSSs'),
+       ('ςσΣ'),
+       ('σςΣ'),
+       ('ΣΣσ');
+SELECT DISTINCT t FROM test_pg_unicode_ci;
+  t  
+-----
+ ABB
+ ABC
+ ẞß
+ ςσΣ
+(4 rows)
+
+SELECT r1.t, r2.t FROM test_pg_unicode_ci r1, test_pg_unicode_ci r2
+ WHERE r1.t = r2.t COLLATE PG_UNICODE_CI AND
+       r1.t < r2.t COLLATE "C";
+  t   |  t  
+------+-----
+ ABC  | aBc
+ sSSs | ẞß
+ ςσΣ  | σςΣ
+ ΣΣσ  | ςσΣ
+ ΣΣσ  | σςΣ
+(5 rows)
+
diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql
index d6d14220ab3..d38b58ba838 100644
--- a/src/test/regress/sql/collate.utf8.sql
+++ b/src/test/regress/sql/collate.utf8.sql
@@ -148,3 +148,31 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_UNICODE_FAST; -- same as above with cases re
 
 -- case folding
 select casefold('AbCd 123 #$% ıiIİ ẞ ß Ǆǅǆ Σσς' collate PG_UNICODE_FAST);
+
+--
+-- Test PG_UNICODE_CI
+--
+
+CREATE COLLATION regress_pg_unicode_ci (
+  provider = builtin, locale = 'PG_UNICODE_CI');
+DROP COLLATION regress_pg_unicode_ci;
+
+CREATE TABLE test_pg_unicode_ci (
+  t TEXT COLLATE PG_UNICODE_CI
+);
+
+INSERT INTO test_pg_unicode_ci VALUES
+       ('ABC'),
+       ('aBc'),
+       ('ABB'),
+       ('ẞß'),
+       ('sSSs'),
+       ('ςσΣ'),
+       ('σςΣ'),
+       ('ΣΣσ');
+
+SELECT DISTINCT t FROM test_pg_unicode_ci;
+
+SELECT r1.t, r2.t FROM test_pg_unicode_ci r1, test_pg_unicode_ci r2
+ WHERE r1.t = r2.t COLLATE PG_UNICODE_CI AND
+       r1.t < r2.t COLLATE "C";
-- 
2.43.0

