From c9ed14fbbafe80e9466be20b6098734a35c52564 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Wed, 11 Dec 2024 23:46:43 -0800
Subject: [PATCH v1 3/3] Add SQL function FOLDCASE().

Useful for case-insensitive string comparison. Avoids some of the
edge-case problems with using LOWER() for that purpose.
---
 doc/src/sgml/func.sgml                     | 34 ++++++++++++
 src/backend/utils/adt/formatting.c         | 64 ++++++++++++++++++++++
 src/backend/utils/adt/oracle_compat.c      | 16 ++++++
 src/backend/utils/adt/pg_locale.c          | 24 ++++++++
 src/backend/utils/adt/pg_locale_builtin.c  |  9 +++
 src/backend/utils/adt/pg_locale_icu.c      | 52 ++++++++++++++++++
 src/include/catalog/pg_proc.dat            |  3 +
 src/include/utils/formatting.h             |  1 +
 src/include/utils/pg_locale.h              |  3 +
 src/test/regress/expected/collate.utf8.out |  7 +++
 src/test/regress/sql/collate.utf8.sql      |  3 +
 11 files changed, 216 insertions(+)

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 2c35252dc06..b958db5f96f 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -3109,6 +3109,40 @@ SELECT NOT(ROW(table.*) IS NOT NULL) FROM TABLE; -- detect at least one null in
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>foldcase</primary>
+        </indexterm>
+        <function>foldcase</function> ( <type>text</type> )
+        <returnvalue>text</returnvalue>
+       </para>
+       <para>
+        Performs case folding of the input string according to the
+        collation. Case folding is similar to case conversion, but the purpose
+        of folding is to facilitate case-insensitive comparison of strings,
+        whereas the purpose of case conversion is to convert to a particular
+        cased form.
+       </para>
+       <para>
+        Ordinarily, case folding simply converts to lowercase, but there are a
+        few notable exceptions. For instance, the character
+        <literal>Σ</literal> (U+03A3) has two lowercase forms:
+        <literal>σ</literal> (U+03C3) and <literal>ς</literal> (U+03C2); case
+        folding in the <literal>PG_C_UTF8</literal> collation maps all forms
+        of the character to <literal>σ</literal>.
+       </para>
+       <para>
+        Another benefit of case folding is that the results don't change for
+        existing characters in new versions of Unicode.
+       </para>
+       <para>
+        The <literal>libc</literal> provider doesn't support case folding, so
+        <function>foldcase</function> is identical to
+        <function>lower</function> for collations using that provider.
+       </para></entry>
+      </row>
+
       <row>
        <entry role="func_table_entry"><para role="func_signature">
         <indexterm>
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 30c06c8d099..4757c80a96f 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -1773,6 +1773,70 @@ str_initcap(const char *buff, size_t nbytes, Oid collid)
 	return result;
 }
 
+/*
+ * collation-aware, wide-character-aware case folding
+ *
+ * We pass the number of bytes so we can pass varlena and char* to this
+ * function.  The result is palloc'd and null-terminated; NULL buff gives NULL.
+ */
+char *
+str_foldcase(const char *buff, size_t nbytes, Oid collid)
+{
+	char	   *result;
+	pg_locale_t mylocale;
+
+	if (!buff)
+		return NULL;
+
+	if (!OidIsValid(collid))
+	{
+		/*
+		 * This typically means that the parser could not resolve a conflict
+		 * of implicit collations, so report it that way.
+		 */
+		ereport(ERROR,
+				(errcode(ERRCODE_INDETERMINATE_COLLATION),
+				 errmsg("could not determine which collation to use for %s function",
+						"foldcase()"),
+				 errhint("Use the COLLATE clause to set the collation explicitly.")));
+	}
+
+	mylocale = pg_newlocale_from_collation(collid);
+
+	/* C/POSIX collations use this path regardless of database encoding */
+	if (mylocale->ctype_is_c)
+	{
+		result = asc_tolower(buff, nbytes);
+	}
+	else
+	{
+		const char *src = buff;
+		size_t		srclen = nbytes;
+		size_t		dstsize;
+		char	   *dst;
+		size_t		needed;
+
+		/* first try buffer of equal size plus terminating NUL */
+		dstsize = srclen + 1;
+		dst = palloc(dstsize);
+
+		needed = pg_strfold(dst, dstsize, src, srclen, mylocale);
+		if (needed + 1 > dstsize)
+		{
+			/* folded text is longer than the input: grow buffer and retry */
+			dstsize = needed + 1;
+			dst = repalloc(dst, dstsize);
+			needed = pg_strfold(dst, dstsize, src, srclen, mylocale);
+			Assert(needed + 1 <= dstsize);
+		}
+
+		Assert(dst[needed] == '\0');
+		result = dst;
+	}
+
+	return result;
+}
+
 /*
  * ASCII-only lower function
  *
diff --git a/src/backend/utils/adt/oracle_compat.c b/src/backend/utils/adt/oracle_compat.c
index b126a7d460d..3296881b7a7 100644
--- a/src/backend/utils/adt/oracle_compat.c
+++ b/src/backend/utils/adt/oracle_compat.c
@@ -126,6 +126,22 @@ initcap(PG_FUNCTION_ARGS)
 	PG_RETURN_TEXT_P(result);
 }
 
+/* foldcase(text): fold case of the argument under the call's collation */
+Datum
+foldcase(PG_FUNCTION_ARGS)
+{
+	text	   *arg = PG_GETARG_TEXT_PP(0);
+	Oid			collid = PG_GET_COLLATION();
+	char	   *folded;
+	text	   *result;
+
+	folded = str_foldcase(VARDATA_ANY(arg), VARSIZE_ANY_EXHDR(arg), collid);
+	result = cstring_to_text(folded);
+	pfree(folded);
+
+	PG_RETURN_TEXT_P(result);
+}
+
 
 /********************************************************************
  *
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index d16f26f1705..d2775f8195f 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -122,6 +122,8 @@ extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src,
 							   ssize_t srclen, pg_locale_t locale);
 extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src,
 							   ssize_t srclen, pg_locale_t locale);
+extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src,
+							  ssize_t srclen, pg_locale_t locale);
 
 extern size_t strlower_icu(char *dst, size_t dstsize, const char *src,
 						   ssize_t srclen, pg_locale_t locale);
@@ -129,6 +131,8 @@ extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src,
 						   ssize_t srclen, pg_locale_t locale);
 extern size_t strupper_icu(char *dst, size_t dstsize, const char *src,
 						   ssize_t srclen, pg_locale_t locale);
+extern size_t strfold_icu(char *dst, size_t dstsize, const char *src,
+						  ssize_t srclen, pg_locale_t locale);
 
 extern size_t strlower_libc(char *dst, size_t dstsize, const char *src,
 							ssize_t srclen, pg_locale_t locale);
@@ -1546,6 +1550,26 @@ pg_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
 	return 0;					/* keep compiler quiet */
 }
 
+size_t
+pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
+		   pg_locale_t locale)
+{
+	if (locale->provider == COLLPROVIDER_BUILTIN)
+		return strfold_builtin(dst, dstsize, src, srclen, locale);
+#ifdef USE_ICU
+	if (locale->provider == COLLPROVIDER_ICU)
+		return strfold_icu(dst, dstsize, src, srclen, locale);
+#endif
+	/* libc gets no dedicated fold routine here; strlower stands in for it */
+	if (locale->provider == COLLPROVIDER_LIBC)
+		return strlower_libc(dst, dstsize, src, srclen, locale);
+
+	/* no provider matched (or ICU support not compiled in): shouldn't happen */
+	PGLOCALE_SUPPORT_ERROR(locale->provider);
+
+	return 0;					/* keep compiler quiet */
+}
+
 /*
  * pg_strcoll
  *
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index d3aa7bceacd..e41b01e7529 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -30,6 +30,8 @@ extern size_t strtitle_builtin(char *dst, size_t dstsize, const char *src,
 							   ssize_t srclen, pg_locale_t locale);
 extern size_t strupper_builtin(char *dst, size_t dstsize, const char *src,
 							   ssize_t srclen, pg_locale_t locale);
+extern size_t strfold_builtin(char *dst, size_t dstsize, const char *src,
+							  ssize_t srclen, pg_locale_t locale);
 
 
 struct WordBoundaryState
@@ -103,6 +105,13 @@ strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
 	return unicode_strupper(dest, destsize, src, srclen);
 }
 
+size_t
+strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
+				pg_locale_t locale)
+{
+	return unicode_strfold(dest, destsize, src, srclen);	/* locale not consulted */
+}
+
 pg_locale_t
 create_pg_locale_builtin(Oid collid, MemoryContext context)
 {
diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c
index f0a77a767e7..31050f370e7 100644
--- a/src/backend/utils/adt/pg_locale_icu.c
+++ b/src/backend/utils/adt/pg_locale_icu.c
@@ -54,6 +54,8 @@ extern size_t strtitle_icu(char *dst, size_t dstsize, const char *src,
 						   ssize_t srclen, pg_locale_t locale);
 extern size_t strupper_icu(char *dst, size_t dstsize, const char *src,
 						   ssize_t srclen, pg_locale_t locale);
+extern size_t strfold_icu(char *dst, size_t dstsize, const char *src,
+						  ssize_t srclen, pg_locale_t locale);
 
 #ifdef USE_ICU
 
@@ -107,6 +109,10 @@ static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
 									   const UChar *src, int32_t srcLength,
 									   const char *locale,
 									   UErrorCode *pErrorCode);
+static int32_t u_strFoldCase_default(UChar *dest, int32_t destCapacity,
+									 const UChar *src, int32_t srcLength,
+									 const char *locale,
+									 UErrorCode *pErrorCode);
 #endif
 
 pg_locale_t
@@ -406,6 +412,26 @@ strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
 	return result_len;
 }
 
+size_t
+strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
+			pg_locale_t locale)
+{
+	UChar	   *ubuf;			/* src as UChar, from icu_to_uchar */
+	UChar	   *folded;			/* output of icu_convert_case */
+	int32_t		ulen;
+	int32_t		folded_len;
+	size_t		nbytes;
+
+	/* convert to UChar, fold, then convert back into the caller's buffer */
+	ulen = icu_to_uchar(&ubuf, src, srclen);
+	folded_len = icu_convert_case(u_strFoldCase_default, locale,
+								  &folded, ubuf, ulen);
+	nbytes = icu_from_uchar(dest, destsize, folded, folded_len);
+	pfree(folded);
+	pfree(ubuf);
+	return nbytes;
+}
+
 /*
  * strncoll_icu
  *
@@ -635,6 +661,32 @@ u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
 						NULL, locale, pErrorCode);
 }
 
+static int32_t
+u_strFoldCase_default(UChar *dest, int32_t destCapacity,
+					  const UChar *src, int32_t srcLength,
+					  const char *locale,
+					  UErrorCode *pErrorCode)
+{
+	uint32		options = U_FOLD_CASE_DEFAULT;
+	char		lang[3];
+	UErrorCode	status = U_ZERO_ERROR;
+
+	uloc_getLanguage(locale, lang, sizeof(lang), &status);
+
+	/*
+	 * U_SUCCESS() would also accept U_STRING_NOT_TERMINATED_WARNING (lang[]
+	 * filled by a three-letter code, no NUL), so insist on U_ZERO_ERROR.
+	 * The option name is confusing, but it causes u_strFoldCase to use the
+	 * 'T' mappings, which are ignored for U_FOLD_CASE_DEFAULT.
+	 */
+	if (status == U_ZERO_ERROR &&
+		(strcmp(lang, "tr") == 0 || strcmp(lang, "az") == 0))
+		options = U_FOLD_CASE_EXCLUDE_SPECIAL_I;
+
+	return u_strFoldCase(dest, destCapacity, src, srcLength,
+						 options, pErrorCode);
+}
+
 /*
  * strncoll_icu_no_utf8
  *
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 9575524007f..9e5e01a9ad6 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -3623,6 +3623,9 @@
 { oid => '872', descr => 'capitalize each word',
   proname => 'initcap', prorettype => 'text', proargtypes => 'text',
   prosrc => 'initcap' },
+{ oid => '9569', descr => 'fold case',
+  proname => 'foldcase', prorettype => 'text', proargtypes => 'text',
+  prosrc => 'foldcase' },
 { oid => '873', descr => 'left-pad string to length',
   proname => 'lpad', prorettype => 'text', proargtypes => 'text int4 text',
   prosrc => 'lpad' },
diff --git a/src/include/utils/formatting.h b/src/include/utils/formatting.h
index cde030414ee..486bca61a56 100644
--- a/src/include/utils/formatting.h
+++ b/src/include/utils/formatting.h
@@ -21,6 +21,7 @@
 extern char *str_tolower(const char *buff, size_t nbytes, Oid collid);
 extern char *str_toupper(const char *buff, size_t nbytes, Oid collid);
 extern char *str_initcap(const char *buff, size_t nbytes, Oid collid);
+extern char *str_foldcase(const char *buff, size_t nbytes, Oid collid);
 
 extern char *asc_tolower(const char *buff, size_t nbytes);
 extern char *asc_toupper(const char *buff, size_t nbytes);
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 861df3ddd05..30a81fb4fb8 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -102,6 +102,9 @@ extern size_t pg_strtitle(char *dest, size_t destsize,
 extern size_t pg_strupper(char *dest, size_t destsize,
 						  const char *src, ssize_t srclen,
 						  pg_locale_t locale);
+extern size_t pg_strfold(char *dest, size_t destsize,
+						 const char *src, ssize_t srclen,
+						 pg_locale_t locale);
 extern int	pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale);
 extern int	pg_strncoll(const char *arg1, ssize_t len1,
 						const char *arg2, ssize_t len2, pg_locale_t locale);
diff --git a/src/test/regress/expected/collate.utf8.out b/src/test/regress/expected/collate.utf8.out
index 4558d2521a2..9d5d565eb39 100644
--- a/src/test/regress/expected/collate.utf8.out
+++ b/src/test/regress/expected/collate.utf8.out
@@ -160,3 +160,10 @@ SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed
  t
 (1 row)
 
+-- case folding
+select foldcase('AbCd 123 #$% ıiIİ ẞ ß Ǆǅǆ Σσς' collate pg_c_utf8);
+           foldcase            
+-------------------------------
+ abcd 123 #$% ıiiİ ß ß ǆǆǆ σσσ
+(1 row)
+
diff --git a/src/test/regress/sql/collate.utf8.sql b/src/test/regress/sql/collate.utf8.sql
index 87fe06ddf1b..9385e71e958 100644
--- a/src/test/regress/sql/collate.utf8.sql
+++ b/src/test/regress/sql/collate.utf8.sql
@@ -80,3 +80,6 @@ SELECT 'xAb' ~* '[W-Y]' COLLATE PG_C_UTF8;
 SELECT 'xAb' !~* '[c-d]' COLLATE PG_C_UTF8;
 SELECT 'Δ' ~* '[γ-λ]' COLLATE PG_C_UTF8;
 SELECT 'δ' ~* '[Γ-Λ]' COLLATE PG_C_UTF8; -- same as above with cases reversed
+
+-- case folding
+select foldcase('AbCd 123 #$% ıiIİ ẞ ß Ǆǅǆ Σσς' collate pg_c_utf8);
-- 
2.34.1

