Refactor: allow pg_strncoll(), etc., to accept -1 length for NUL-terminated cstrings.

Started by Jeff Davisover 1 year ago2 messages

pgsql@j-davis.com

over 1 year ago

1 attachment(s)

Like ICU, allow -1 length to mean that the input string is NUL-
terminated for pg_strncoll(), pg_strnxfrm(), and pg_strnxfrm_prefix().

This simplifies the API and code a bit.

Along with some other refactoring in this area, we are getting close to
the point where the collation provider can just be a table of methods,
which means we can add an extension hook to provide a different method
table. That still requires more work, I'm just mentioning it here for
context.

Regards,
Jeff Davis

Attachments:

v1-0001-Allow-length-1-for-NUL-terminated-input-to-pg_str.patchtext/x-patch; charset=UTF-8; name=v1-0001-Allow-length-1-for-NUL-terminated-input-to-pg_str.patchDownload

From 6f0c0a9e05039cd295c6c090b3d98d381244b35c Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Wed, 21 Aug 2024 10:59:28 -0700
Subject: [PATCH v1] Allow length=-1 for NUL-terminated input to pg_strncoll(),
 etc.

Like ICU, allow a length of -1 to be specified for NUL-terminated
arguments to pg_strncoll(), pg_strnxfrm(), and pg_strnxfrm_prefix().

Simplifies the code and comments.
---
 src/backend/utils/adt/pg_locale.c | 186 ++++++++++--------------------
 1 file changed, 64 insertions(+), 122 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 48b7e16d81b..26b0f4577f0 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1809,6 +1809,8 @@ get_collation_actual_version(char collprovider, const char *collcollate)
  *
  * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
  * invoke wcscoll_l().
+ *
+ * An input string length of -1 means that it's NUL-terminated.
  */
 #ifdef WIN32
 static int
@@ -1819,8 +1821,8 @@ pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
 	char	   *buf = sbuf;
 	char	   *a1p,
 			   *a2p;
-	int			a1len = len1 * 2 + 2;
-	int			a2len = len2 * 2 + 2;
+	int			a1len;
+	int			a2len;
 	int			r;
 	int			result;
 
@@ -1830,6 +1832,14 @@ pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
 	Assert(false);
 #endif
 
+	if (len1 == -1)
+		len1 = strlen(arg1);
+	if (len2 == -1)
+		len2 = strlen(arg2);
+
+	a1len = len1 * 2 + 2;
+	a2len = len2 * 2 + 2;
+
 	if (a1len + a2len > TEXTBUFLEN)
 		buf = palloc(a1len + a2len);
 
@@ -1876,40 +1886,10 @@ pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
 }
 #endif							/* WIN32 */
 
-/*
- * pg_strcoll_libc
- *
- * Call strcoll_l() or wcscoll_l() as appropriate for the given locale,
- * platform, and database encoding. If the locale is NULL, use the database
- * collation.
- *
- * Arguments must be encoded in the database encoding and nul-terminated.
- */
-static int
-pg_strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale)
-{
-	int			result;
-
-	Assert(locale->provider == COLLPROVIDER_LIBC);
-#ifdef WIN32
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		size_t		len1 = strlen(arg1);
-		size_t		len2 = strlen(arg2);
-
-		result = pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
-	}
-	else
-#endif							/* WIN32 */
-		result = strcoll_l(arg1, arg2, locale->info.lt);
-
-	return result;
-}
-
 /*
  * pg_strncoll_libc
  *
- * Nul-terminate the arguments and call pg_strcoll_libc().
+ * An input string length of -1 means that it's NUL-terminated.
  */
 static int
 pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
@@ -1917,10 +1897,10 @@ pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
 {
 	char		sbuf[TEXTBUFLEN];
 	char	   *buf = sbuf;
-	size_t		bufsize1 = len1 + 1;
-	size_t		bufsize2 = len2 + 1;
-	char	   *arg1n;
-	char	   *arg2n;
+	size_t		bufsize1 = (len1 == -1) ? 0 : len1 + 1;
+	size_t		bufsize2 = (len2 == -1) ? 0 : len2 + 1;
+	const char *arg1n;
+	const char *arg2n;
 	int			result;
 
 	Assert(locale->provider == COLLPROVIDER_LIBC);
@@ -1934,16 +1914,32 @@ pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
 	if (bufsize1 + bufsize2 > TEXTBUFLEN)
 		buf = palloc(bufsize1 + bufsize2);
 
-	arg1n = buf;
-	arg2n = buf + bufsize1;
+	/* nul-terminate arguments if necessary */
+	if (len1 == -1)
+	{
+		arg1n = arg1;
+	}
+	else
+	{
+		char *buf1 = buf;
+		memcpy(buf1, arg1, len1);
+		buf1[len1] = '\0';
+		arg1n = buf1;
+	}
 
-	/* nul-terminate arguments */
-	memcpy(arg1n, arg1, len1);
-	arg1n[len1] = '\0';
-	memcpy(arg2n, arg2, len2);
-	arg2n[len2] = '\0';
+	if (len2 == -1)
+	{
+		arg2n = arg2;
+	}
+	else
+	{
+		char *buf2 = buf + bufsize1;
+		memcpy(buf2, arg2, len2);
+		buf2[len2] = '\0';
+		arg2n = buf2;
+	}
 
-	result = pg_strcoll_libc(arg1n, arg2n, locale);
+	result = strcoll_l(arg1n, arg2n, locale->info.lt);
 
 	if (buf != sbuf)
 		pfree(buf);
@@ -2015,8 +2011,6 @@ pg_strncoll_icu_no_utf8(const char *arg1, int32_t len1,
  * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given
  * database encoding. An argument length of -1 means the string is
  * NUL-terminated.
- *
- * Arguments must be encoded in the database encoding.
  */
 static int
 pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2,
@@ -2054,15 +2048,7 @@ pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2,
 /*
  * pg_strcoll
  *
- * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll_l() or wcscoll_l() as
- * appropriate for the given locale, platform, and database encoding. If the
- * locale is not specified, use the database collation.
- *
- * Arguments must be encoded in the database encoding and nul-terminated.
- *
- * The caller is responsible for breaking ties if the collation is
- * deterministic; this maintains consistency with pg_strxfrm(), which cannot
- * easily account for deterministic collations.
+ * Like pg_strncoll for NUL-terminated input strings.
  */
 int
 pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
@@ -2070,7 +2056,7 @@ pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
 	int			result;
 
 	if (locale->provider == COLLPROVIDER_LIBC)
-		result = pg_strcoll_libc(arg1, arg2, locale);
+		result = pg_strncoll_libc(arg1, -1, arg2, -1, locale);
 #ifdef USE_ICU
 	else if (locale->provider == COLLPROVIDER_ICU)
 		result = pg_strncoll_icu(arg1, -1, arg2, -1, locale);
@@ -2089,11 +2075,8 @@ pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
  * appropriate for the given locale, platform, and database encoding. If the
  * locale is not specified, use the database collation.
  *
- * Arguments must be encoded in the database encoding.
- *
- * This function may need to nul-terminate the arguments for libc functions;
- * so if the caller already has nul-terminated strings, it should call
- * pg_strcoll() instead.
+ * The input strings must be encoded in the database encoding. If an input
+ * string is NUL-terminated, its length may be specified as -1.
  *
  * The caller is responsible for breaking ties if the collation is
  * deterministic; this maintains consistency with pg_strnxfrm(), which cannot
@@ -2119,14 +2102,6 @@ pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2,
 }
 
 
-static size_t
-pg_strxfrm_libc(char *dest, const char *src, size_t destsize,
-				pg_locale_t locale)
-{
-	Assert(locale->provider == COLLPROVIDER_LIBC);
-	return strxfrm_l(dest, src, destsize, locale->info.lt);
-}
-
 static size_t
 pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize,
 				 pg_locale_t locale)
@@ -2138,14 +2113,17 @@ pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize,
 
 	Assert(locale->provider == COLLPROVIDER_LIBC);
 
+	if (srclen == -1)
+		return strxfrm_l(dest, src, destsize, locale->info.lt);
+
 	if (bufsize > TEXTBUFLEN)
 		buf = palloc(bufsize);
 
-	/* nul-terminate arguments */
+	/* nul-terminate argument */
 	memcpy(buf, src, srclen);
 	buf[srclen] = '\0';
 
-	result = pg_strxfrm_libc(dest, buf, destsize, locale);
+	result = strxfrm_l(dest, buf, destsize, locale->info.lt);
 
 	if (buf != sbuf)
 		pfree(buf);
@@ -2326,20 +2304,7 @@ pg_strxfrm_enabled(pg_locale_t locale)
 /*
  * pg_strxfrm
  *
- * Transforms 'src' to a nul-terminated string stored in 'dest' such that
- * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
- * untransformed strings.
- *
- * The provided 'src' must be nul-terminated. If 'destsize' is zero, 'dest'
- * may be NULL.
- *
- * Not all providers support pg_strxfrm() safely. The caller should check
- * pg_strxfrm_enabled() first, otherwise this function may return wrong
- * results or an error.
- *
- * Returns the number of bytes needed (or more) to store the transformed
- * string, excluding the terminating nul byte. If the value returned is
- * 'destsize' or greater, the resulting contents of 'dest' are undefined.
+ * Like pg_strnxfrm for a NUL-terminated input string.
  */
 size_t
 pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
@@ -2347,7 +2312,7 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
 	size_t		result = 0;		/* keep compiler quiet */
 
 	if (locale->provider == COLLPROVIDER_LIBC)
-		result = pg_strxfrm_libc(dest, src, destsize, locale);
+		result = pg_strnxfrm_libc(dest, src, -1, destsize, locale);
 #ifdef USE_ICU
 	else if (locale->provider == COLLPROVIDER_ICU)
 		result = pg_strnxfrm_icu(dest, src, -1, destsize, locale);
@@ -2366,8 +2331,9 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
  * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
  * untransformed strings.
  *
- * 'src' does not need to be nul-terminated. If 'destsize' is zero, 'dest' may
- * be NULL.
+ * The input string must be encoded in the database encoding. If the input
+ * string is NUL-terminated, its length may be specified as -1. If 'destsize'
+ * is zero, 'dest' may be NULL.
  *
  * Not all providers support pg_strnxfrm() safely. The caller should check
  * pg_strxfrm_enabled() first, otherwise this function may return wrong
@@ -2376,10 +2342,6 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
  * Returns the number of bytes needed (or more) to store the transformed
  * string, excluding the terminating nul byte. If the value returned is
  * 'destsize' or greater, the resulting contents of 'dest' are undefined.
- *
- * This function may need to nul-terminate the argument for libc functions;
- * so if the caller already has a nul-terminated string, it should call
- * pg_strxfrm() instead.
  */
 size_t
 pg_strnxfrm(char *dest, size_t destsize, const char *src, size_t srclen,
@@ -2421,44 +2383,24 @@ pg_strxfrm_prefix_enabled(pg_locale_t locale)
 /*
  * pg_strxfrm_prefix
  *
- * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
- * memcmp() on the byte sequence is equivalent to pg_strcoll() on
- * untransformed strings. The result is not nul-terminated.
- *
- * The provided 'src' must be nul-terminated.
- *
- * Not all providers support pg_strxfrm_prefix() safely. The caller should
- * check pg_strxfrm_prefix_enabled() first, otherwise this function may return
- * wrong results or an error.
- *
- * If destsize is not large enough to hold the resulting byte sequence, stores
- * only the first destsize bytes in 'dest'. Returns the number of bytes
- * actually copied to 'dest'.
+ * Like pg_strnxfrm_prefix for a NUL-terminated input string.
  */
 size_t
 pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
 				  pg_locale_t locale)
 {
-	size_t		result = 0;		/* keep compiler quiet */
-
-#ifdef USE_ICU
-	if (locale->provider == COLLPROVIDER_ICU)
-		result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale);
-	else
-#endif
-		PGLOCALE_SUPPORT_ERROR(locale->provider);
-
-	return result;
+	return pg_strnxfrm_prefix(dest, destsize, src, -1, locale);
 }
 
 /*
  * pg_strnxfrm_prefix
  *
  * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
- * memcmp() on the byte sequence is equivalent to pg_strcoll() on
+ * memcmp() on the byte sequence is equivalent to pg_strncoll() on
  * untransformed strings. The result is not nul-terminated.
  *
- * The provided 'src' must be nul-terminated.
+ * The input string must be encoded in the database encoding. If the input
+ * string is NUL-terminated, its length may be specified as -1.
  *
  * Not all providers support pg_strnxfrm_prefix() safely. The caller should
  * check pg_strxfrm_prefix_enabled() first, otherwise this function may return
@@ -2467,10 +2409,6 @@ pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
  * If destsize is not large enough to hold the resulting byte sequence, stores
  * only the first destsize bytes in 'dest'. Returns the number of bytes
  * actually copied to 'dest'.
- *
- * This function may need to nul-terminate the argument for libc functions;
- * so if the caller already has a nul-terminated string, it should call
- * pg_strxfrm_prefix() instead.
  */
 size_t
 pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
@@ -2661,6 +2599,8 @@ init_icu_converter(void)
 
 /*
  * Find length, in UChars, of given string if converted to UChar string.
+ *
+ * A length of -1 indicates that the input string is NUL-terminated.
  */
 static size_t
 uchar_length(UConverter *converter, const char *str, int32_t len)
@@ -2678,6 +2618,8 @@ uchar_length(UConverter *converter, const char *str, int32_t len)
 /*
  * Convert the given source string into a UChar string, stored in dest, and
  * return the length (in UChars).
+ *
+ * A srclen of -1 indicates that the input string is NUL-terminated.
  */
 static int32_t
 uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
-- 
2.34.1

Jeff Davis

pgsql@j-davis.com

over 1 year ago

In reply to: Jeff Davis (#1)

7 attachment(s)

Re: Refactor: allow pg_strncoll(), etc., to accept -1 length for NUL-terminated cstrings.

On Thu, 2024-08-22 at 11:00 -0700, Jeff Davis wrote:

Like ICU, allow -1 length to mean that the input string is NUL-
terminated for pg_strncoll(), pg_strnxfrm(), and
pg_strnxfrm_prefix().

To better illustrate the direction I'm going, I roughly implemented
some patches that implement collation using a table of methods rather
than lots branching based on the provider.

This more cleanly separates the API for a provider, which will enable
us to use a hook to create a custom provider with arbitrary methods,
that may have nothing to do with ICU or libc. Or, we could go so far as
to implement a "CREATE LOCALE PROVIDER" that would provide the methods
using a handler function, and "datlocprovider" would be an OID rather
than a char.

From a practical perspective, I expect that extensions would use this
to lock down the version of a particular provider rather than implement
a completely arbitrary one. But the API is good for either case, and
offers quite a bit of code cleanup.

There are quite a few loose ends, of course:

* There is still a lot of branching on the provider for DDL and
catalog access. I'm not sure if we will ever eliminate all of this, or
if we would even want to.

* I haven't done anything with get_collation_actual_version().
Perhaps that should be a method, too, but it requires some extra
thought if we want this to be useful for "multilib" (having multiple
versions of a provider library at once).

* I didn't add methods for formatting.c yet.

* initdb -- should it offer a way to preload a library and then use
that for the provider?

* I need to allow an arbitrary per-provider context, rather than the
current union designed for the existing providers.

Again, the patches are rough and there's a lot of code churn. I'd like
some feedback on whether people generally like the direction this is
going. If so I will clean up the patch series into smaller, more
reviewable chunks.

Regards,
Jeff Davis

Attachments:

v4-0001-Tighten-up-make_libc_collator-and-make_icu_collat.patchtext/x-patch; charset=UTF-8; name=v4-0001-Tighten-up-make_libc_collator-and-make_icu_collat.patchDownload

From 224470bc4d0660dc11940f5595031eecb0319d62 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Wed, 7 Aug 2024 11:05:46 -0700
Subject: [PATCH v4 1/7] Tighten up make_libc_collator() and
 make_icu_collator().

Return the result rather than using an out parameter, and make it the
caller's responsibility to copy it into the right context. Ensure that
no paths leak a collator.

The function make_icu_collator() doesn't have any external callers, so
change it to be static. Also, when re-opening with rules, use a
try/finally block to avoid leaking the collator.

In make_libc_collator(), if the first newlocale() succeeds and the
second one fails, close the first locale_t object.

Discussion: https://postgr.es/m/54d20e812bd6c3e44c10eddcd757ec494ebf1803.camel@j-davis.com
---
 src/backend/utils/adt/pg_locale.c | 126 +++++++++++++++++++-----------
 src/include/utils/pg_locale.h     |   4 -
 2 files changed, 80 insertions(+), 50 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 5bef1b113a8..12ba5726f77 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1297,14 +1297,15 @@ report_newlocale_failure(const char *localename)
 }
 
 /*
- * Initialize the locale_t field.
+ * Create a locale_t with the given collation and ctype.
  *
- * The "C" and "POSIX" locales are not actually handled by libc, so set the
- * locale_t to zero in that case.
+ * The "C" and "POSIX" locales are not actually handled by libc, so return
+ * NULL.
+ *
+ * Ensure that no path leaks a locale_t.
  */
-static void
-make_libc_collator(const char *collate, const char *ctype,
-				   pg_locale_t result)
+static locale_t
+make_libc_collator(const char *collate, const char *ctype)
 {
 	locale_t	loc = 0;
 
@@ -1343,7 +1344,11 @@ make_libc_collator(const char *collate, const char *ctype,
 			errno = 0;
 			loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
 			if (!loc)
+			{
+				if (loc1)
+					freelocale(loc1);
 				report_newlocale_failure(ctype);
+			}
 		}
 		else
 			loc = loc1;
@@ -1360,60 +1365,78 @@ make_libc_collator(const char *collate, const char *ctype,
 #endif
 	}
 
-	result->info.lt = loc;
+	return loc;
 }
 
-void
-make_icu_collator(const char *iculocstr,
-				  const char *icurules,
-				  struct pg_locale_struct *resultp)
-{
+/*
+ * Create a UCollator with the given locale string and rules.
+ *
+ * Ensure that no path leaks a UCollator.
+ */
 #ifdef USE_ICU
-	UCollator  *collator;
-
-	collator = pg_ucol_open(iculocstr);
-
-	/*
-	 * If rules are specified, we extract the rules of the standard collation,
-	 * add our own rules, and make a new collator with the combined rules.
-	 */
-	if (icurules)
+static UCollator *
+make_icu_collator(const char *iculocstr, const char *icurules)
+{
+	if (!icurules)
 	{
-		const UChar *default_rules;
-		UChar	   *agg_rules;
+		/* simple case without rules */
+		return pg_ucol_open(iculocstr);
+	}
+	else
+	{
+		UCollator  *collator_std_rules;
+		UCollator  *collator_all_rules;
+		const UChar *std_rules;
 		UChar	   *my_rules;
-		UErrorCode	status;
+		UChar	   *all_rules;
 		int32_t		length;
+		int32_t		total;
+		UErrorCode	status;
 
-		default_rules = ucol_getRules(collator, &length);
+		/*
+		 * If rules are specified, we extract the rules of the standard
+		 * collation, add our own rules, and make a new collator with the
+		 * combined rules.
+		 */
 		icu_to_uchar(&my_rules, icurules, strlen(icurules));
 
-		agg_rules = palloc_array(UChar, u_strlen(default_rules) + u_strlen(my_rules) + 1);
-		u_strcpy(agg_rules, default_rules);
-		u_strcat(agg_rules, my_rules);
+		collator_std_rules = pg_ucol_open(iculocstr);
 
-		ucol_close(collator);
+		std_rules = ucol_getRules(collator_std_rules, &length);
+
+		total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
+
+		/* avoid leaking collator on OOM */
+		all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
+		if (!all_rules)
+		{
+			ucol_close(collator_std_rules);
+			ereport(ERROR,
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("out of memory")));
+		}
+
+		u_strcpy(all_rules, std_rules);
+		u_strcat(all_rules, my_rules);
+
+		ucol_close(collator_std_rules);
 
 		status = U_ZERO_ERROR;
-		collator = ucol_openRules(agg_rules, u_strlen(agg_rules),
-								  UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH, NULL, &status);
+		collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
+											UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
+											NULL, &status);
 		if (U_FAILURE(status))
+		{
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 					 errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
 							iculocstr, icurules, u_errorName(status))));
-	}
+		}
 
-	/* We will leak this string if the caller errors later :-( */
-	resultp->info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr);
-	resultp->info.icu.ucol = collator;
-#else							/* not USE_ICU */
-	/* could get here if a collation was created by a build with ICU */
-	ereport(ERROR,
-			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-			 errmsg("ICU is not supported in this build")));
-#endif							/* not USE_ICU */
+		return collator_all_rules;
+	}
 }
+#endif							/* not USE_ICU */
 
 /*
  * Initialize default_locale with database locale settings.
@@ -1424,7 +1447,6 @@ init_database_collation(void)
 	HeapTuple	tup;
 	Form_pg_database dbform;
 	Datum		datum;
-	bool		isnull;
 
 	/* Fetch our pg_database row normally, via syscache */
 	tup = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
@@ -1449,8 +1471,10 @@ init_database_collation(void)
 	}
 	else if (dbform->datlocprovider == COLLPROVIDER_ICU)
 	{
+#ifdef USE_ICU
 		char	   *datlocale;
 		char	   *icurules;
+		bool		isnull;
 
 		datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datlocale);
 		datlocale = TextDatumGetCString(datum);
@@ -1464,7 +1488,14 @@ init_database_collation(void)
 		else
 			icurules = NULL;
 
-		make_icu_collator(datlocale, icurules, &default_locale);
+		default_locale.info.icu.locale = MemoryContextStrdup(TopMemoryContext, datlocale);
+		default_locale.info.icu.ucol = make_icu_collator(datlocale, icurules);
+#else							/* not USE_ICU */
+		/* could get here if a collation was created by a build with ICU */
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("ICU is not supported in this build")));
+#endif							/* not USE_ICU */
 	}
 	else
 	{
@@ -1483,7 +1514,7 @@ init_database_collation(void)
 		default_locale.ctype_is_c = (strcmp(datctype, "C") == 0) ||
 			(strcmp(datctype, "POSIX") == 0);
 
-		make_libc_collator(datcollate, datctype, &default_locale);
+		default_locale.info.lt = make_libc_collator(datcollate, datctype);
 	}
 
 	default_locale.provider = dbform->datlocprovider;
@@ -1572,7 +1603,7 @@ pg_newlocale_from_collation(Oid collid)
 			result.ctype_is_c = (strcmp(collctype, "C") == 0) ||
 				(strcmp(collctype, "POSIX") == 0);
 
-			make_libc_collator(collcollate, collctype, &result);
+			result.info.lt = make_libc_collator(collcollate, collctype);
 		}
 		else if (collform->collprovider == COLLPROVIDER_ICU)
 		{
@@ -1591,7 +1622,8 @@ pg_newlocale_from_collation(Oid collid)
 			else
 				icurules = NULL;
 
-			make_icu_collator(iculocstr, icurules, &result);
+			result.info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr);
+			result.info.icu.ucol = make_icu_collator(iculocstr, icurules);
 		}
 
 		datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion,
@@ -2500,6 +2532,8 @@ builtin_validate_locale(int encoding, const char *locale)
 /*
  * Wrapper around ucol_open() to handle API differences for older ICU
  * versions.
+ *
+ * Ensure that no path leaks a UCollator.
  */
 static UCollator *
 pg_ucol_open(const char *loc_str)
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index faae868bfcc..c2d95411e0a 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -104,10 +104,6 @@ struct pg_locale_struct
 
 typedef struct pg_locale_struct *pg_locale_t;
 
-extern void make_icu_collator(const char *iculocstr,
-							  const char *icurules,
-							  struct pg_locale_struct *resultp);
-
 extern void init_database_collation(void);
 extern pg_locale_t pg_newlocale_from_collation(Oid collid);
 
-- 
2.34.1

v4-0007-Use-method-table-for-collation.patchtext/x-patch; charset=UTF-8; name=v4-0007-Use-method-table-for-collation.patchDownload

From c9ace91726c2889fe96dec28fd9f3c655e13afd7 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Thu, 19 Sep 2024 11:12:41 -0700
Subject: [PATCH v4 7/7] Use method table for collation.

---
 src/backend/regex/regc_pg_locale.c     |  376 ++-----
 src/backend/utils/adt/Makefile         |    2 +
 src/backend/utils/adt/meson.build      |    2 +
 src/backend/utils/adt/pg_locale.c      | 1338 +++---------------------
 src/backend/utils/adt/pg_locale_icu.c  |  873 ++++++++++++++++
 src/backend/utils/adt/pg_locale_libc.c |  604 +++++++++++
 src/include/utils/pg_locale.h          |   44 +-
 7 files changed, 1727 insertions(+), 1512 deletions(-)
 create mode 100644 src/backend/utils/adt/pg_locale_icu.c
 create mode 100644 src/backend/utils/adt/pg_locale_libc.c

diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c
index b75784b6ce5..f7cd3f1787c 100644
--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
@@ -63,32 +63,17 @@
  * NB: the coding here assumes pg_wchar is an unsigned type.
  */
 
-typedef enum
-{
-	PG_REGEX_STRATEGY_C,		/* C locale (encoding independent) */
-	PG_REGEX_STRATEGY_BUILTIN,	/* built-in Unicode semantics */
-	PG_REGEX_STRATEGY_LIBC_WIDE,	/* Use locale_t <wctype.h> functions */
-	PG_REGEX_STRATEGY_LIBC_1BYTE,	/* Use locale_t <ctype.h> functions */
-	PG_REGEX_STRATEGY_ICU,		/* Use ICU uchar.h functions */
-} PG_Locale_Strategy;
-
-static PG_Locale_Strategy pg_regex_strategy;
 static pg_locale_t pg_regex_locale;
 static Oid	pg_regex_collation;
 
+static struct pg_locale_struct dummy_c_locale = {
+	.collate_is_c = true,
+	.ctype_is_c = true,
+};
+
 /*
  * Hard-wired character properties for C locale
  */
-#define PG_ISDIGIT	0x01
-#define PG_ISALPHA	0x02
-#define PG_ISALNUM	(PG_ISDIGIT | PG_ISALPHA)
-#define PG_ISUPPER	0x04
-#define PG_ISLOWER	0x08
-#define PG_ISGRAPH	0x10
-#define PG_ISPRINT	0x20
-#define PG_ISPUNCT	0x40
-#define PG_ISSPACE	0x80
-
 static const unsigned char pg_char_properties[128] = {
 	 /* NUL */ 0,
 	 /* ^A */ 0,
@@ -232,7 +217,6 @@ void
 pg_set_regex_collation(Oid collation)
 {
 	pg_locale_t locale = 0;
-	PG_Locale_Strategy strategy;
 
 	if (!OidIsValid(collation))
 	{
@@ -253,8 +237,8 @@ pg_set_regex_collation(Oid collation)
 		 * catalog access is available, so we can't call
 		 * pg_newlocale_from_collation().
 		 */
-		strategy = PG_REGEX_STRATEGY_C;
 		collation = C_COLLATION_OID;
+		locale = &dummy_c_locale;
 	}
 	else
 	{
@@ -271,32 +255,11 @@ pg_set_regex_collation(Oid collation)
 			 * C/POSIX collations use this path regardless of database
 			 * encoding
 			 */
-			strategy = PG_REGEX_STRATEGY_C;
-			locale = 0;
+			locale = &dummy_c_locale;
 			collation = C_COLLATION_OID;
 		}
-		else if (locale->provider == COLLPROVIDER_BUILTIN)
-		{
-			Assert(GetDatabaseEncoding() == PG_UTF8);
-			strategy = PG_REGEX_STRATEGY_BUILTIN;
-		}
-#ifdef USE_ICU
-		else if (locale->provider == COLLPROVIDER_ICU)
-		{
-			strategy = PG_REGEX_STRATEGY_ICU;
-		}
-#endif
-		else
-		{
-			Assert(locale->provider == COLLPROVIDER_LIBC);
-			if (GetDatabaseEncoding() == PG_UTF8)
-				strategy = PG_REGEX_STRATEGY_LIBC_WIDE;
-			else
-				strategy = PG_REGEX_STRATEGY_LIBC_1BYTE;
-		}
 	}
 
-	pg_regex_strategy = strategy;
 	pg_regex_locale = locale;
 	pg_regex_collation = collation;
 }
@@ -304,82 +267,31 @@ pg_set_regex_collation(Oid collation)
 static int
 pg_wc_isdigit(pg_wchar c)
 {
-	switch (pg_regex_strategy)
-	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISDIGIT));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isdigit(c, true);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			return (c <= (pg_wchar) UCHAR_MAX &&
-					isdigit_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_isdigit(c);
-#endif
-			break;
-	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 &&
+				(pg_char_properties[c] & PG_ISDIGIT));
+	else
+		return char_props(c, PG_ISDIGIT, pg_regex_locale) != 0;
 }
 
 static int
 pg_wc_isalpha(pg_wchar c)
 {
-	switch (pg_regex_strategy)
-	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISALPHA));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isalpha(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			return (c <= (pg_wchar) UCHAR_MAX &&
-					isalpha_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_isalpha(c);
-#endif
-			break;
-	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 &&
+				(pg_char_properties[c] & PG_ISALPHA));
+	else
+		return char_props(c, PG_ISALPHA, pg_regex_locale) != 0;
 }
 
 static int
 pg_wc_isalnum(pg_wchar c)
 {
-	switch (pg_regex_strategy)
-	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISALNUM));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isalnum(c, true);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			return (c <= (pg_wchar) UCHAR_MAX &&
-					isalnum_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_isalnum(c);
-#endif
-			break;
-	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 &&
+				(pg_char_properties[c] & PG_ISALNUM));
+	else
+		return char_props(c, PG_ISDIGIT|PG_ISALPHA, pg_regex_locale) != 0;
 }
 
 static int
@@ -394,219 +306,87 @@ pg_wc_isword(pg_wchar c)
 static int
 pg_wc_isupper(pg_wchar c)
 {
-	switch (pg_regex_strategy)
-	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISUPPER));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isupper(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			return (c <= (pg_wchar) UCHAR_MAX &&
-					isupper_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_isupper(c);
-#endif
-			break;
-	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 &&
+				(pg_char_properties[c] & PG_ISUPPER));
+	else
+		return char_props(c, PG_ISUPPER, pg_regex_locale) != 0;
 }
 
 static int
 pg_wc_islower(pg_wchar c)
 {
-	switch (pg_regex_strategy)
-	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISLOWER));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_islower(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			return (c <= (pg_wchar) UCHAR_MAX &&
-					islower_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_islower(c);
-#endif
-			break;
-	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 &&
+				(pg_char_properties[c] & PG_ISLOWER));
+	else
+		return char_props(c, PG_ISLOWER, pg_regex_locale) != 0;
 }
 
 static int
 pg_wc_isgraph(pg_wchar c)
 {
-	switch (pg_regex_strategy)
-	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISGRAPH));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isgraph(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			return (c <= (pg_wchar) UCHAR_MAX &&
-					isgraph_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_isgraph(c);
-#endif
-			break;
-	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 &&
+				(pg_char_properties[c] & PG_ISGRAPH));
+	else
+		return char_props(c, PG_ISGRAPH, pg_regex_locale) != 0;
 }
 
 static int
 pg_wc_isprint(pg_wchar c)
 {
-	switch (pg_regex_strategy)
-	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISPRINT));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isprint(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			return (c <= (pg_wchar) UCHAR_MAX &&
-					isprint_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_isprint(c);
-#endif
-			break;
-	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 &&
+				(pg_char_properties[c] & PG_ISPRINT));
+	else
+		return char_props(c, PG_ISPRINT, pg_regex_locale) != 0;
 }
 
 static int
 pg_wc_ispunct(pg_wchar c)
 {
-	switch (pg_regex_strategy)
-	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISPUNCT));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_ispunct(c, true);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			return (c <= (pg_wchar) UCHAR_MAX &&
-					ispunct_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_ispunct(c);
-#endif
-			break;
-	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 &&
+				(pg_char_properties[c] & PG_ISPUNCT));
+	else
+		return char_props(c, PG_ISPUNCT, pg_regex_locale) != 0;
 }
 
 static int
 pg_wc_isspace(pg_wchar c)
 {
-	switch (pg_regex_strategy)
-	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISSPACE));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isspace(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			return (c <= (pg_wchar) UCHAR_MAX &&
-					isspace_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_isspace(c);
-#endif
-			break;
-	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 &&
+				(pg_char_properties[c] & PG_ISSPACE));
+	else
+		return char_props(c, PG_ISSPACE, pg_regex_locale) != 0;
 }
 
 static pg_wchar
 pg_wc_toupper(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
 	{
-		case PG_REGEX_STRATEGY_C:
-			if (c <= (pg_wchar) 127)
-				return pg_ascii_toupper((unsigned char) c);
-			return c;
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return unicode_uppercase_simple(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return towupper_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			if (c <= (pg_wchar) UCHAR_MAX)
-				return toupper_l((unsigned char) c, pg_regex_locale->info.lt);
-			return c;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_toupper(c);
-#endif
-			break;
+		if (c <= (pg_wchar) 127)
+			return pg_ascii_toupper((unsigned char) c);
+		return c;
 	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	else
+		return pg_regex_locale->ctype->wc_toupper(c, pg_regex_locale);
 }
 
 static pg_wchar
 pg_wc_tolower(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
 	{
-		case PG_REGEX_STRATEGY_C:
-			if (c <= (pg_wchar) 127)
-				return pg_ascii_tolower((unsigned char) c);
-			return c;
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return unicode_lowercase_simple(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return towlower_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			if (c <= (pg_wchar) UCHAR_MAX)
-				return tolower_l((unsigned char) c, pg_regex_locale->info.lt);
-			return c;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_tolower(c);
-#endif
-			break;
+		if (c <= (pg_wchar) 127)
+			return pg_ascii_tolower((unsigned char) c);
+		return c;
 	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	else
+		return pg_regex_locale->ctype->wc_tolower(c, pg_regex_locale);
 }
 
 
@@ -732,37 +512,27 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
 	 * would always be true for production values of MAX_SIMPLE_CHR, but it's
 	 * useful to allow it to be small for testing purposes.)
 	 */
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
 	{
-		case PG_REGEX_STRATEGY_C:
 #if MAX_SIMPLE_CHR >= 127
 			max_chr = (pg_wchar) 127;
 			pcc->cv.cclasscode = -1;
 #else
 			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
 #endif
-			break;
-		case PG_REGEX_STRATEGY_BUILTIN:
-			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
-			break;
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
-			break;
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+	}
+	else
+	{
 #if MAX_SIMPLE_CHR >= UCHAR_MAX
+		if (pg_regex_locale->provider == COLLPROVIDER_LIBC &&
+			GetDatabaseEncoding() != PG_UTF8)
+		{
 			max_chr = (pg_wchar) UCHAR_MAX;
 			pcc->cv.cclasscode = -1;
-#else
-			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+		}
+		else
 #endif
-			break;
-		case PG_REGEX_STRATEGY_ICU:
 			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
-			break;
-		default:
-			Assert(false);
-			max_chr = 0;		/* can't get here, but keep compiler quiet */
-			break;
 	}
 
 	/*
diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile
index edb09d4e356..85e5eaf32eb 100644
--- a/src/backend/utils/adt/Makefile
+++ b/src/backend/utils/adt/Makefile
@@ -79,6 +79,8 @@ OBJS = \
 	orderedsetaggs.o \
 	partitionfuncs.o \
 	pg_locale.o \
+	pg_locale_icu.o \
+	pg_locale_libc.o \
 	pg_lsn.o \
 	pg_upgrade_support.o \
 	pgstatfuncs.o \
diff --git a/src/backend/utils/adt/meson.build b/src/backend/utils/adt/meson.build
index 8c6fc80c373..f73f294b8f5 100644
--- a/src/backend/utils/adt/meson.build
+++ b/src/backend/utils/adt/meson.build
@@ -66,6 +66,8 @@ backend_sources += files(
   'orderedsetaggs.c',
   'partitionfuncs.c',
   'pg_locale.c',
+  'pg_locale_icu.c',
+  'pg_locale_libc.c',
   'pg_lsn.c',
   'pg_upgrade_support.c',
   'pgstatfuncs.c',
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index cfba55a6e31..1802b7a1589 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -58,6 +58,8 @@
 #include "catalog/pg_collation.h"
 #include "catalog/pg_database.h"
 #include "common/hashfn.h"
+#include "common/unicode_case.h"
+#include "common/unicode_category.h"
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "utils/builtins.h"
@@ -87,12 +89,6 @@
 #define		PGLOCALE_SUPPORT_ERROR(provider) \
 	elog(ERROR, "unsupported collprovider for %s: %c", __func__, provider)
 
-/*
- * This should be large enough that most strings will fit, but small enough
- * that we feel comfortable putting it on the stack
- */
-#define		TEXTBUFLEN			1024
-
 #define		MAX_L10N_DATA		80
 
 
@@ -119,7 +115,21 @@ char	   *localized_full_months[12 + 1];
 /* is the databases's LC_CTYPE the C locale? */
 bool		database_ctype_is_c = false;
 
-static struct pg_locale_struct default_locale;
+#ifdef USE_ICU
+extern pg_locale_t icu_dat_create_locale(HeapTuple dattuple);
+extern pg_locale_t icu_coll_create_locale(MemoryContext context,
+										  ResourceOwner resowner,
+										  HeapTuple colltuple);
+extern UCollator *pg_ucol_open(const char *loc_str);
+#endif
+
+
+extern pg_locale_t libc_dat_create_locale(HeapTuple dattuple);
+extern pg_locale_t libc_coll_create_locale(MemoryContext context,
+										   ResourceOwner resowner,
+										   HeapTuple colltuple);
+
+static pg_locale_t default_locale = NULL;
 
 /* indicates whether locale information cache is valid */
 static bool CurrentLocaleConvValid = false;
@@ -170,51 +180,48 @@ static pg_locale_t last_collation_cache_locale = NULL;
 static char *IsoLocaleName(const char *);
 #endif
 
-#ifdef USE_ICU
-/*
- * Converter object for converting between ICU's UChar strings and C strings
- * in database encoding.  Since the database encoding doesn't change, we only
- * need one of these per session.
- */
-static UConverter *icu_converter = NULL;
-
-static UCollator *pg_ucol_open(const char *loc_str);
-static void init_icu_converter(void);
-static size_t uchar_length(UConverter *converter,
-						   const char *str, int32_t len);
-static int32_t uchar_convert(UConverter *converter,
-							 UChar *dest, int32_t destlen,
-							 const char *src, int32_t srclen);
-static void icu_set_collation_attributes(UCollator *collator, const char *loc,
-										 UErrorCode *status);
-
-static void ResourceOwnerRememberUCollator(ResourceOwner owner,
-										   UCollator *collator);
-static void ResOwnerReleaseUCollator(Datum val);
-
-static const ResourceOwnerDesc UCollatorResourceKind =
-{
-	.name = "UCollator reference",
-	.release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
-	.release_priority = RELEASE_PRIO_LAST,
-	.ReleaseResource = ResOwnerReleaseUCollator,
-	.DebugPrint = NULL                      /* the default message is fine */
-};
-#endif
+static int
+char_props_builtin(pg_wchar wc, int mask, pg_locale_t locale)
+{
+	int result = 0;
+
+	if ((mask & PG_ISDIGIT) && pg_u_isdigit(wc, true))
+		result |= PG_ISDIGIT;
+	if ((mask & PG_ISALPHA) && pg_u_isalpha(wc))
+		result |= PG_ISALPHA;
+	if ((mask & PG_ISUPPER) && pg_u_isupper(wc))
+		result |= PG_ISUPPER;
+	if ((mask & PG_ISLOWER) && pg_u_islower(wc))
+		result |= PG_ISLOWER;
+	if ((mask & PG_ISGRAPH) && pg_u_isgraph(wc))
+		result |= PG_ISGRAPH;
+	if ((mask & PG_ISPRINT) && pg_u_isprint(wc))
+		result |= PG_ISPRINT;
+	if ((mask & PG_ISPUNCT) && pg_u_ispunct(wc, true))
+		result |= PG_ISPUNCT;
+	if ((mask & PG_ISSPACE) && pg_u_isspace(wc))
+		result |= PG_ISSPACE;
+
+	return result;
+}
 
-static void ResourceOwnerRememberLocaleT(ResourceOwner owner,
-										 locale_t locale);
-static void ResOwnerReleaseLocaleT(Datum val);
+static pg_wchar
+toupper_builtin(pg_wchar wc, pg_locale_t locale)
+{
+	return unicode_uppercase_simple(wc);
+}
 
-static const ResourceOwnerDesc LocaleTResourceKind =
+static pg_wchar
+tolower_builtin(pg_wchar wc, pg_locale_t locale)
 {
-	.name = "locale_t reference",
-	.release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
-	.release_priority = RELEASE_PRIO_LAST,
-	.ReleaseResource = ResOwnerReleaseLocaleT,
-	.DebugPrint = NULL                      /* the default message is fine */
-};
+	return unicode_lowercase_simple(wc);
+}
 
+struct ctype_methods builtin_ctype_methods = {
+	.char_props = char_props_builtin,
+	.wc_toupper = toupper_builtin,
+	.wc_tolower = tolower_builtin,
+};
 
 /*
  * POSIX doesn't define _l-variants of these functions, but several systems
@@ -1262,206 +1269,6 @@ IsoLocaleName(const char *winlocname)
 #endif							/* WIN32 && LC_MESSAGES */
 
 
-/* simple subroutine for reporting errors from newlocale() */
-static void
-report_newlocale_failure(const char *localename)
-{
-	int			save_errno;
-
-	/*
-	 * Windows doesn't provide any useful error indication from
-	 * _create_locale(), and BSD-derived platforms don't seem to feel they
-	 * need to set errno either (even though POSIX is pretty clear that
-	 * newlocale should do so).  So, if errno hasn't been set, assume ENOENT
-	 * is what to report.
-	 */
-	if (errno == 0)
-		errno = ENOENT;
-
-	/*
-	 * ENOENT means "no such locale", not "no such file", so clarify that
-	 * errno with an errdetail message.
-	 */
-	save_errno = errno;			/* auxiliary funcs might change errno */
-	ereport(ERROR,
-			(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-			 errmsg("could not create locale \"%s\": %m",
-					localename),
-			 (save_errno == ENOENT ?
-			  errdetail("The operating system could not find any locale data for the locale name \"%s\".",
-						localename) : 0)));
-}
-
-static void
-ResourceOwnerRememberLocaleT(ResourceOwner owner, locale_t locale)
-{
-	ResourceOwnerRemember(owner, PointerGetDatum(locale),
-						  &LocaleTResourceKind);
-}
-
-static void
-ResOwnerReleaseLocaleT(Datum val)
-{
-	locale_t locale = (locale_t) DatumGetPointer(val);
-	freelocale(locale);
-}
-
-/*
- * Create a locale_t with the given collation and ctype.
- *
- * The "C" and "POSIX" locales are not actually handled by libc, so return
- * NULL.
- *
- * Ensure that no path leaks a locale_t.
- */
-static locale_t
-make_libc_collator(const char *collate, const char *ctype)
-{
-	locale_t	loc = 0;
-
-	if (strcmp(collate, ctype) == 0)
-	{
-		if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
-		{
-			/* Normal case where they're the same */
-			errno = 0;
-#ifndef WIN32
-			loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
-							NULL);
-#else
-			loc = _create_locale(LC_ALL, collate);
-#endif
-			if (!loc)
-				report_newlocale_failure(collate);
-		}
-	}
-	else
-	{
-#ifndef WIN32
-		/* We need two newlocale() steps */
-		locale_t	loc1 = 0;
-
-		if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
-		{
-			errno = 0;
-			loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
-			if (!loc1)
-				report_newlocale_failure(collate);
-		}
-
-		if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
-		{
-			errno = 0;
-			loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
-			if (!loc)
-			{
-				if (loc1)
-					freelocale(loc1);
-				report_newlocale_failure(ctype);
-			}
-		}
-		else
-			loc = loc1;
-#else
-
-		/*
-		 * XXX The _create_locale() API doesn't appear to support this. Could
-		 * perhaps be worked around by changing pg_locale_t to contain two
-		 * separate fields.
-		 */
-		ereport(ERROR,
-				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-				 errmsg("collations with different collate and ctype values are not supported on this platform")));
-#endif
-	}
-
-	return loc;
-}
-
-/*
- * Create a UCollator with the given locale string and rules.
- *
- * Ensure that no path leaks a UCollator.
- */
-#ifdef USE_ICU
-static void
-ResourceOwnerRememberUCollator(ResourceOwner owner, UCollator *collator)
-{
-	ResourceOwnerRemember(owner, PointerGetDatum(collator),
-						  &UCollatorResourceKind);
-}
-
-static void
-ResOwnerReleaseUCollator(Datum val)
-{
-	UCollator *collator = (UCollator *) DatumGetPointer(val);
-	ucol_close(collator);
-}
-
-static UCollator *
-make_icu_collator(const char *iculocstr, const char *icurules)
-{
-	if (!icurules)
-	{
-		/* simple case without rules */
-		return pg_ucol_open(iculocstr);
-	}
-	else
-	{
-		UCollator  *collator_std_rules;
-		UCollator  *collator_all_rules;
-		const UChar *std_rules;
-		UChar	   *my_rules;
-		UChar	   *all_rules;
-		int32_t		length;
-		int32_t		total;
-		UErrorCode	status;
-
-		/*
-		 * If rules are specified, we extract the rules of the standard
-		 * collation, add our own rules, and make a new collator with the
-		 * combined rules.
-		 */
-		icu_to_uchar(&my_rules, icurules, strlen(icurules));
-
-		collator_std_rules = pg_ucol_open(iculocstr);
-
-		std_rules = ucol_getRules(collator_std_rules, &length);
-
-		total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
-
-		/* avoid leaking collator on OOM */
-		all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
-		if (!all_rules)
-		{
-			ucol_close(collator_std_rules);
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory")));
-		}
-
-		u_strcpy(all_rules, std_rules);
-		u_strcat(all_rules, my_rules);
-
-		ucol_close(collator_std_rules);
-
-		status = U_ZERO_ERROR;
-		collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
-											UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
-											NULL, &status);
-		if (U_FAILURE(status))
-		{
-			ereport(ERROR,
-					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-					 errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
-							iculocstr, icurules, u_errorName(status))));
-		}
-
-		return collator_all_rules;
-	}
-}
-#endif							/* not USE_ICU */
-
 /*
  * Initialize default_locale with database locale settings.
  */
@@ -1471,6 +1278,7 @@ init_database_collation(void)
 	HeapTuple	tup;
 	Form_pg_database dbform;
 	Datum		datum;
+	pg_locale_t	result = NULL;
 
 	/* Fetch our pg_database row normally, via syscache */
 	tup = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
@@ -1487,70 +1295,38 @@ init_database_collation(void)
 
 		builtin_validate_locale(dbform->encoding, datlocale);
 
-		default_locale.collate_is_c = true;
-		default_locale.ctype_is_c = (strcmp(datlocale, "C") == 0);
-
-		default_locale.info.builtin.locale = MemoryContextStrdup(
+		result = MemoryContextAllocZero(TopMemoryContext,
+										sizeof(struct pg_locale_struct));
+		result->info.builtin.locale = MemoryContextStrdup(
 																 TopMemoryContext, datlocale);
-	}
-	else if (dbform->datlocprovider == COLLPROVIDER_ICU)
-	{
-#ifdef USE_ICU
-		char	   *datlocale;
-		char	   *icurules;
-		bool		isnull;
-
-		datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datlocale);
-		datlocale = TextDatumGetCString(datum);
-
-		default_locale.collate_is_c = false;
-		default_locale.ctype_is_c = false;
+		result->collate_is_c = true;
+		result->ctype_is_c = (strcmp(datlocale, "C") == 0);
 
-		datum = SysCacheGetAttr(DATABASEOID, tup, Anum_pg_database_daticurules, &isnull);
-		if (!isnull)
-			icurules = TextDatumGetCString(datum);
-		else
-			icurules = NULL;
+		if (!result->ctype_is_c)
+			result->ctype = &builtin_ctype_methods;
 
-		default_locale.info.icu.locale = MemoryContextStrdup(TopMemoryContext, datlocale);
-		default_locale.info.icu.ucol = make_icu_collator(datlocale, icurules);
-#else							/* not USE_ICU */
-		/* could get here if a collation was created by a build with ICU */
-		ereport(ERROR,
-				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-				 errmsg("ICU is not supported in this build")));
-#endif							/* not USE_ICU */
 	}
+#ifdef USE_ICU
+	else if (dbform->datlocprovider == COLLPROVIDER_ICU)
+		result = icu_dat_create_locale(tup);
+#endif							/* not USE_ICU */
+	else if (dbform->datlocprovider == COLLPROVIDER_LIBC)
+		result = libc_dat_create_locale(tup);
 	else
-	{
-		const char *datcollate;
-		const char *datctype;
-
-		Assert(dbform->datlocprovider == COLLPROVIDER_LIBC);
-
-		datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datcollate);
-		datcollate = TextDatumGetCString(datum);
-		datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datctype);
-		datctype = TextDatumGetCString(datum);
-
-		default_locale.collate_is_c = (strcmp(datcollate, "C") == 0) ||
-			(strcmp(datcollate, "POSIX") == 0);
-		default_locale.ctype_is_c = (strcmp(datctype, "C") == 0) ||
-			(strcmp(datctype, "POSIX") == 0);
+		PGLOCALE_SUPPORT_ERROR(dbform->datlocprovider);
 
-		default_locale.info.lt = make_libc_collator(datcollate, datctype);
-	}
-
-	default_locale.provider = dbform->datlocprovider;
+	result->provider = dbform->datlocprovider;
 
 	/*
 	 * Default locale is currently always deterministic.  Nondeterministic
 	 * locales currently don't support pattern matching, which would break a
 	 * lot of things if applied globally.
 	 */
-	default_locale.deterministic = true;
+	result->deterministic = true;
 
 	ReleaseSysCache(tup);
+
+	default_locale = result;
 }
 
 /*
@@ -1558,12 +1334,12 @@ init_database_collation(void)
  * allocating memory.
  */
 static pg_locale_t
-create_pg_locale(MemoryContext context, ResourceOwner owner, Oid collid)
+create_pg_locale(MemoryContext context, ResourceOwner resowner, Oid collid)
 {
 	/* We haven't computed this yet in this session, so do it */
 	HeapTuple	tp;
 	Form_pg_collation collform;
-	pg_locale_t result;
+	pg_locale_t result = NULL;
 	Datum		datum;
 	bool		isnull;
 
@@ -1631,65 +1407,19 @@ create_pg_locale(MemoryContext context, ResourceOwner owner, Oid collid)
 		result->deterministic = collform->collisdeterministic;
 		result->collate_is_c = true;
 		result->ctype_is_c = (strcmp(locstr, "C") == 0);
+		if (!result->ctype_is_c)
+			result->ctype = &builtin_ctype_methods;
 		result->info.builtin.locale = MemoryContextStrdup(context,
 														  locstr);
 	}
-	else if (collform->collprovider == COLLPROVIDER_LIBC)
-	{
-		const char *collcollate;
-		const char *collctype;
-		locale_t	locale;
-
-		datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
-		collcollate = TextDatumGetCString(datum);
-		datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype);
-		collctype = TextDatumGetCString(datum);
-
-		ResourceOwnerEnlarge(owner);
-		locale = make_libc_collator(collcollate, collctype);
-		if (locale)
-			ResourceOwnerRememberLocaleT(owner, locale);
-
-		result = MemoryContextAllocZero(context,
-										sizeof(struct pg_locale_struct));
-
-		result->provider = collform->collprovider;
-		result->deterministic = collform->collisdeterministic;
-		result->collate_is_c = (strcmp(collcollate, "C") == 0) ||
-			(strcmp(collcollate, "POSIX") == 0);
-		result->ctype_is_c = (strcmp(collctype, "C") == 0) ||
-			(strcmp(collctype, "POSIX") == 0);
-		result->info.lt = locale;
-	}
+#ifdef USE_ICU
 	else if (collform->collprovider == COLLPROVIDER_ICU)
-	{
-		const char *iculocstr;
-		const char *icurules;
-		UCollator  *collator;
-
-		datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
-		iculocstr = TextDatumGetCString(datum);
-
-		datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull);
-		if (!isnull)
-			icurules = TextDatumGetCString(datum);
-		else
-			icurules = NULL;
-
-		ResourceOwnerEnlarge(owner);
-		collator = make_icu_collator(iculocstr, icurules);
-		ResourceOwnerRememberUCollator(owner, collator);
-
-		result = MemoryContextAllocZero(context,
-										sizeof(struct pg_locale_struct));
-
-		result->provider = collform->collprovider;
-		result->deterministic = collform->collisdeterministic;
-		result->collate_is_c = false;
-		result->ctype_is_c = false;
-		result->info.icu.locale = MemoryContextStrdup(context, iculocstr);
-		result->info.icu.ucol = collator;
-	}
+		result = icu_coll_create_locale(context, resowner, tp);
+#endif
+	else if (collform->collprovider == COLLPROVIDER_LIBC)
+		result = libc_coll_create_locale(context, resowner, tp);
+	else
+		PGLOCALE_SUPPORT_ERROR(collform->collprovider);
 
 	ReleaseSysCache(tp);
 
@@ -1735,7 +1465,7 @@ pg_newlocale_from_collation(Oid collid)
 	bool					 found;
 
 	if (collid == DEFAULT_COLLATION_OID)
-		return &default_locale;
+		return default_locale;
 
 	if (!OidIsValid(collid))
 		elog(ERROR, "cache lookup failed for collation %u", collid);
@@ -1886,483 +1616,48 @@ get_collation_actual_version(char collprovider, const char *collcollate)
 }
 
 /*
- * strncoll_libc_win32_utf8
- *
- * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
- * invoke wcscoll_l().
+ * pg_strcoll
  *
- * An input string length of -1 means that it's NUL-terminated.
+ * Like pg_strncoll for NUL-terminated input strings.
  */
-#ifdef WIN32
-static int
-strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
-						 ssize_t len2, pg_locale_t locale)
+int
+pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
 {
-	char		sbuf[TEXTBUFLEN];
-	char	   *buf = sbuf;
-	char	   *a1p,
-			   *a2p;
-	int			a1len;
-	int			a2len;
-	int			r;
-	int			result;
-
-	Assert(locale->provider == COLLPROVIDER_LIBC);
-	Assert(GetDatabaseEncoding() == PG_UTF8);
-#ifndef WIN32
-	Assert(false);
-#endif
-
-	if (len1 == -1)
-		len1 = strlen(arg1);
-	if (len2 == -1)
-		len2 = strlen(arg2);
-
-	a1len = len1 * 2 + 2;
-	a2len = len2 * 2 + 2;
-
-	if (a1len + a2len > TEXTBUFLEN)
-		buf = palloc(a1len + a2len);
-
-	a1p = buf;
-	a2p = buf + a1len;
-
-	/* API does not work for zero-length input */
-	if (len1 == 0)
-		r = 0;
-	else
-	{
-		r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
-								(LPWSTR) a1p, a1len / 2);
-		if (!r)
-			ereport(ERROR,
-					(errmsg("could not convert string to UTF-16: error code %lu",
-							GetLastError())));
-	}
-	((LPWSTR) a1p)[r] = 0;
-
-	if (len2 == 0)
-		r = 0;
-	else
-	{
-		r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
-								(LPWSTR) a2p, a2len / 2);
-		if (!r)
-			ereport(ERROR,
-					(errmsg("could not convert string to UTF-16: error code %lu",
-							GetLastError())));
-	}
-	((LPWSTR) a2p)[r] = 0;
-
-	errno = 0;
-	result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt);
-	if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw headers */
-		ereport(ERROR,
-				(errmsg("could not compare Unicode strings: %m")));
-
-	if (buf != sbuf)
-		pfree(buf);
-
-	return result;
+	return locale->collate->strncoll(arg1, -1, arg2, -1, locale);
 }
-#endif							/* WIN32 */
 
 /*
- * strncoll_libc
+ * pg_strncoll
+ *
+ * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll_l() or wcscoll_l() as
+ * appropriate for the given locale, platform, and database encoding. If the
+ * locale is not specified, use the database collation.
+ *
+ * The input strings must be encoded in the database encoding. If an input
+ * string is NUL-terminated, its length may be specified as -1.
  *
- * An input string length of -1 means that it's NUL-terminated.
+ * The caller is responsible for breaking ties if the collation is
+ * deterministic; this maintains consistency with pg_strnxfrm(), which cannot
+ * easily account for deterministic collations.
  */
-static int
-strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
-			  pg_locale_t locale)
+int
+pg_strncoll(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
+			pg_locale_t locale)
 {
-	char		sbuf[TEXTBUFLEN];
-	char	   *buf = sbuf;
-	size_t		bufsize1 = (len1 == -1) ? 0 : len1 + 1;
-	size_t		bufsize2 = (len2 == -1) ? 0 : len2 + 1;
-	const char *arg1n;
-	const char *arg2n;
-	int			result;
-
-	Assert(locale->provider == COLLPROVIDER_LIBC);
-
-#ifdef WIN32
-	/* check for this case before doing the work for nul-termination */
-	if (GetDatabaseEncoding() == PG_UTF8)
-		return strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
-#endif							/* WIN32 */
-
-	if (bufsize1 + bufsize2 > TEXTBUFLEN)
-		buf = palloc(bufsize1 + bufsize2);
-
-	/* nul-terminate arguments if necessary */
-	if (len1 == -1)
-	{
-		arg1n = arg1;
-	}
-	else
-	{
-		char *buf1 = buf;
-		memcpy(buf1, arg1, len1);
-		buf1[len1] = '\0';
-		arg1n = buf1;
-	}
-
-	if (len2 == -1)
-	{
-		arg2n = arg2;
-	}
-	else
-	{
-		char *buf2 = buf + bufsize1;
-		memcpy(buf2, arg2, len2);
-		buf2[len2] = '\0';
-		arg2n = buf2;
-	}
-
-	result = strcoll_l(arg1n, arg2n, locale->info.lt);
-
-	if (buf != sbuf)
-		pfree(buf);
-
-	return result;
+	return locale->collate->strncoll(arg1, len1, arg2, len2, locale);
 }
 
-#ifdef USE_ICU
-
 /*
- * strncoll_icu_no_utf8
+ * Return true if the collation provider supports pg_strxfrm() and
+ * pg_strnxfrm(); otherwise false.
  *
- * Convert the arguments from the database encoding to UChar strings, then
- * call ucol_strcoll(). An argument length of -1 means that the string is
- * NUL-terminated.
- *
- * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
- * caller should call that instead.
- */
-static int
-strncoll_icu_no_utf8(const char *arg1, ssize_t len1,
-					 const char *arg2, ssize_t len2, pg_locale_t locale)
-{
-	char		sbuf[TEXTBUFLEN];
-	char	   *buf = sbuf;
-	int32_t		ulen1;
-	int32_t		ulen2;
-	size_t		bufsize1;
-	size_t		bufsize2;
-	UChar	   *uchar1,
-			   *uchar2;
-	int			result;
-
-	Assert(locale->provider == COLLPROVIDER_ICU);
-#ifdef HAVE_UCOL_STRCOLLUTF8
-	Assert(GetDatabaseEncoding() != PG_UTF8);
-#endif
-
-	init_icu_converter();
-
-	ulen1 = uchar_length(icu_converter, arg1, len1);
-	ulen2 = uchar_length(icu_converter, arg2, len2);
-
-	bufsize1 = (ulen1 + 1) * sizeof(UChar);
-	bufsize2 = (ulen2 + 1) * sizeof(UChar);
-
-	if (bufsize1 + bufsize2 > TEXTBUFLEN)
-		buf = palloc(bufsize1 + bufsize2);
-
-	uchar1 = (UChar *) buf;
-	uchar2 = (UChar *) (buf + bufsize1);
-
-	ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
-	ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
-
-	result = ucol_strcoll(locale->info.icu.ucol,
-						  uchar1, ulen1,
-						  uchar2, ulen2);
-
-	if (buf != sbuf)
-		pfree(buf);
-
-	return result;
-}
-
-/*
- * strncoll_icu
- *
- * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given
- * database encoding. An argument length of -1 means the string is
- * NUL-terminated.
- */
-static int
-strncoll_icu(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
-			 pg_locale_t locale)
-{
-	int			result;
-
-	Assert(locale->provider == COLLPROVIDER_ICU);
-
-#ifdef HAVE_UCOL_STRCOLLUTF8
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		UErrorCode	status;
-
-		status = U_ZERO_ERROR;
-		result = ucol_strcollUTF8(locale->info.icu.ucol,
-								  arg1, len1,
-								  arg2, len2,
-								  &status);
-		if (U_FAILURE(status))
-			ereport(ERROR,
-					(errmsg("collation failed: %s", u_errorName(status))));
-	}
-	else
-#endif
-	{
-		result = strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale);
-	}
-
-	return result;
-}
-
-#endif							/* USE_ICU */
-
-/*
- * pg_strcoll
- *
- * Like pg_strncoll for NUL-terminated input strings.
- */
-int
-pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
-{
-	int			result;
-
-	if (locale->provider == COLLPROVIDER_LIBC)
-		result = strncoll_libc(arg1, -1, arg2, -1, locale);
-#ifdef USE_ICU
-	else if (locale->provider == COLLPROVIDER_ICU)
-		result = strncoll_icu(arg1, -1, arg2, -1, locale);
-#endif
-	else
-		/* shouldn't happen */
-		PGLOCALE_SUPPORT_ERROR(locale->provider);
-
-	return result;
-}
-
-/*
- * pg_strncoll
- *
- * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll_l() or wcscoll_l() as
- * appropriate for the given locale, platform, and database encoding. If the
- * locale is not specified, use the database collation.
- *
- * The input strings must be encoded in the database encoding. If an input
- * string is NUL-terminated, its length may be specified as -1.
- *
- * The caller is responsible for breaking ties if the collation is
- * deterministic; this maintains consistency with pg_strnxfrm(), which cannot
- * easily account for deterministic collations.
- */
-int
-pg_strncoll(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
-			pg_locale_t locale)
-{
-	int			result;
-
-	if (locale->provider == COLLPROVIDER_LIBC)
-		result = strncoll_libc(arg1, len1, arg2, len2, locale);
-#ifdef USE_ICU
-	else if (locale->provider == COLLPROVIDER_ICU)
-		result = strncoll_icu(arg1, len1, arg2, len2, locale);
-#endif
-	else
-		/* shouldn't happen */
-		PGLOCALE_SUPPORT_ERROR(locale->provider);
-
-	return result;
-}
-
-
-static size_t
-strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
-			  pg_locale_t locale)
-{
-	char		sbuf[TEXTBUFLEN];
-	char	   *buf = sbuf;
-	size_t		bufsize = srclen + 1;
-	size_t		result;
-
-	Assert(locale->provider == COLLPROVIDER_LIBC);
-
-	if (srclen == -1)
-		return strxfrm_l(dest, src, destsize, locale->info.lt);
-
-	if (bufsize > TEXTBUFLEN)
-		buf = palloc(bufsize);
-
-	/* nul-terminate argument */
-	memcpy(buf, src, srclen);
-	buf[srclen] = '\0';
-
-	result = strxfrm_l(dest, buf, destsize, locale->info.lt);
-
-	if (buf != sbuf)
-		pfree(buf);
-
-	/* if dest is defined, it should be nul-terminated */
-	Assert(result >= destsize || dest[result] == '\0');
-
-	return result;
-}
-
-#ifdef USE_ICU
-
-/* 'srclen' of -1 means the strings are NUL-terminated */
-static size_t
-strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
-			 pg_locale_t locale)
-{
-	char		sbuf[TEXTBUFLEN];
-	char	   *buf = sbuf;
-	UChar	   *uchar;
-	int32_t		ulen;
-	size_t		uchar_bsize;
-	Size		result_bsize;
-
-	Assert(locale->provider == COLLPROVIDER_ICU);
-
-	init_icu_converter();
-
-	ulen = uchar_length(icu_converter, src, srclen);
-
-	uchar_bsize = (ulen + 1) * sizeof(UChar);
-
-	if (uchar_bsize > TEXTBUFLEN)
-		buf = palloc(uchar_bsize);
-
-	uchar = (UChar *) buf;
-
-	ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
-
-	result_bsize = ucol_getSortKey(locale->info.icu.ucol,
-								   uchar, ulen,
-								   (uint8_t *) dest, destsize);
-
-	/*
-	 * ucol_getSortKey() counts the nul-terminator in the result length, but
-	 * this function should not.
-	 */
-	Assert(result_bsize > 0);
-	result_bsize--;
-
-	if (buf != sbuf)
-		pfree(buf);
-
-	/* if dest is defined, it should be nul-terminated */
-	Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
-
-	return result_bsize;
-}
-
-/* 'srclen' of -1 means the strings are NUL-terminated */
-static size_t
-strnxfrm_prefix_icu_no_utf8(char *dest,size_t destsize,
-							const char *src, ssize_t srclen,
-							pg_locale_t locale)
-{
-	char		sbuf[TEXTBUFLEN];
-	char	   *buf = sbuf;
-	UCharIterator iter;
-	uint32_t	state[2];
-	UErrorCode	status;
-	int32_t		ulen = -1;
-	UChar	   *uchar = NULL;
-	size_t		uchar_bsize;
-	Size		result_bsize;
-
-	Assert(locale->provider == COLLPROVIDER_ICU);
-	Assert(GetDatabaseEncoding() != PG_UTF8);
-
-	init_icu_converter();
-
-	ulen = uchar_length(icu_converter, src, srclen);
-
-	uchar_bsize = (ulen + 1) * sizeof(UChar);
-
-	if (uchar_bsize > TEXTBUFLEN)
-		buf = palloc(uchar_bsize);
-
-	uchar = (UChar *) buf;
-
-	ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
-
-	uiter_setString(&iter, uchar, ulen);
-	state[0] = state[1] = 0;	/* won't need that again */
-	status = U_ZERO_ERROR;
-	result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
-										&iter,
-										state,
-										(uint8_t *) dest,
-										destsize,
-										&status);
-	if (U_FAILURE(status))
-		ereport(ERROR,
-				(errmsg("sort key generation failed: %s",
-						u_errorName(status))));
-
-	return result_bsize;
-}
-
-/* 'srclen' of -1 means the strings are NUL-terminated */
-static size_t
-strnxfrm_prefix_icu(char *dest, size_t destsize,
-					const char *src, ssize_t srclen,
-					pg_locale_t locale)
-{
-	size_t		result;
-
-	Assert(locale->provider == COLLPROVIDER_ICU);
-
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		UCharIterator iter;
-		uint32_t	state[2];
-		UErrorCode	status;
-
-		uiter_setUTF8(&iter, src, srclen);
-		state[0] = state[1] = 0;	/* won't need that again */
-		status = U_ZERO_ERROR;
-		result = ucol_nextSortKeyPart(locale->info.icu.ucol,
-									  &iter,
-									  state,
-									  (uint8_t *) dest,
-									  destsize,
-									  &status);
-		if (U_FAILURE(status))
-			ereport(ERROR,
-					(errmsg("sort key generation failed: %s",
-							u_errorName(status))));
-	}
-	else
-		result = strnxfrm_prefix_icu_no_utf8(dest, destsize, src, srclen,
-											 locale);
-
-	return result;
-}
-
-#endif
-
-/*
- * Return true if the collation provider supports pg_strxfrm() and
- * pg_strnxfrm(); otherwise false.
- *
- * Unfortunately, it seems that strxfrm() for non-C collations is broken on
- * many common platforms; testing of multiple versions of glibc reveals that,
- * for many locales, strcoll() and strxfrm() do not return consistent
- * results. While no other libc other than Cygwin has so far been shown to
- * have a problem, we take the conservative course of action for right now and
- * disable this categorically.  (Users who are certain this isn't a problem on
- * their system can define TRUST_STRXFRM.)
+ * Unfortunately, it seems that strxfrm() for non-C collations is broken on
+ * many common platforms; testing of multiple versions of glibc reveals that,
+ * for many locales, strcoll() and strxfrm() do not return consistent
+ * results. While no other libc other than Cygwin has so far been shown to
+ * have a problem, we take the conservative course of action for right now and
+ * disable this categorically.  (Users who are certain this isn't a problem on
+ * their system can define TRUST_STRXFRM.)
  *
  * No similar problem is known for the ICU provider.
  */
@@ -2392,19 +1687,7 @@ pg_strxfrm_enabled(pg_locale_t locale)
 size_t
 pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
 {
-	size_t		result = 0;		/* keep compiler quiet */
-
-	if (locale->provider == COLLPROVIDER_LIBC)
-		result = strnxfrm_libc(dest, destsize, src, -1, locale);
-#ifdef USE_ICU
-	else if (locale->provider == COLLPROVIDER_ICU)
-		result = strnxfrm_icu(dest, destsize, src, -1, locale);
-#endif
-	else
-		/* shouldn't happen */
-		PGLOCALE_SUPPORT_ERROR(locale->provider);
-
-	return result;
+	return locale->collate->strnxfrm(dest, destsize, src, -1, locale);
 }
 
 /*
@@ -2430,19 +1713,7 @@ size_t
 pg_strnxfrm(char *dest, size_t destsize, const char *src, ssize_t srclen,
 			pg_locale_t locale)
 {
-	size_t		result = 0;		/* keep compiler quiet */
-
-	if (locale->provider == COLLPROVIDER_LIBC)
-		result = strnxfrm_libc(dest, src, srclen, destsize, locale);
-#ifdef USE_ICU
-	else if (locale->provider == COLLPROVIDER_ICU)
-		result = strnxfrm_icu(dest, src, srclen, destsize, locale);
-#endif
-	else
-		/* shouldn't happen */
-		PGLOCALE_SUPPORT_ERROR(locale->provider);
-
-	return result;
+	return locale->collate->strnxfrm(dest, destsize, src, srclen, locale);
 }
 
 /*
@@ -2472,7 +1743,7 @@ size_t
 pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
 				  pg_locale_t locale)
 {
-	return pg_strnxfrm_prefix(dest, destsize, src, -1, locale);
+	return locale->collate->strnxfrm_prefix(dest, destsize, src, -1, locale);
 }
 
 /*
@@ -2497,16 +1768,9 @@ size_t
 pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
 				   ssize_t srclen, pg_locale_t locale)
 {
-	size_t		result = 0;		/* keep compiler quiet */
-
-#ifdef USE_ICU
-	if (locale->provider == COLLPROVIDER_ICU)
-		result = strnxfrm_prefix_icu(dest, src, -1, destsize, locale);
-	else
-#endif
-		PGLOCALE_SUPPORT_ERROR(locale->provider);
-
-	return result;
+	return locale->collate->strnxfrm_prefix(dest, destsize,
+											src, srclen,
+											locale);
 }
 
 /*
@@ -2561,356 +1825,6 @@ builtin_validate_locale(int encoding, const char *locale)
 	return canonical_name;
 }
 
-
-#ifdef USE_ICU
-
-/*
- * Wrapper around ucol_open() to handle API differences for older ICU
- * versions.
- *
- * Ensure that no path leaks a UCollator.
- */
-static UCollator *
-pg_ucol_open(const char *loc_str)
-{
-	UCollator  *collator;
-	UErrorCode	status;
-	const char *orig_str = loc_str;
-	char	   *fixed_str = NULL;
-
-	/*
-	 * Must never open default collator, because it depends on the environment
-	 * and may change at any time. Should not happen, but check here to catch
-	 * bugs that might be hard to catch otherwise.
-	 *
-	 * NB: the default collator is not the same as the collator for the root
-	 * locale. The root locale may be specified as the empty string, "und", or
-	 * "root". The default collator is opened by passing NULL to ucol_open().
-	 */
-	if (loc_str == NULL)
-		elog(ERROR, "opening default collator is not supported");
-
-	/*
-	 * In ICU versions 54 and earlier, "und" is not a recognized spelling of
-	 * the root locale. If the first component of the locale is "und", replace
-	 * with "root" before opening.
-	 */
-	if (U_ICU_VERSION_MAJOR_NUM < 55)
-	{
-		char		lang[ULOC_LANG_CAPACITY];
-
-		status = U_ZERO_ERROR;
-		uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
-		if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
-		{
-			ereport(ERROR,
-					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-					 errmsg("could not get language from locale \"%s\": %s",
-							loc_str, u_errorName(status))));
-		}
-
-		if (strcmp(lang, "und") == 0)
-		{
-			const char *remainder = loc_str + strlen("und");
-
-			fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
-			strcpy(fixed_str, "root");
-			strcat(fixed_str, remainder);
-
-			loc_str = fixed_str;
-		}
-	}
-
-	status = U_ZERO_ERROR;
-	collator = ucol_open(loc_str, &status);
-	if (U_FAILURE(status))
-		ereport(ERROR,
-		/* use original string for error report */
-				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-				 errmsg("could not open collator for locale \"%s\": %s",
-						orig_str, u_errorName(status))));
-
-	if (U_ICU_VERSION_MAJOR_NUM < 54)
-	{
-		status = U_ZERO_ERROR;
-		icu_set_collation_attributes(collator, loc_str, &status);
-
-		/*
-		 * Pretend the error came from ucol_open(), for consistent error
-		 * message across ICU versions.
-		 */
-		if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
-		{
-			ucol_close(collator);
-			ereport(ERROR,
-					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-					 errmsg("could not open collator for locale \"%s\": %s",
-							orig_str, u_errorName(status))));
-		}
-	}
-
-	if (fixed_str != NULL)
-		pfree(fixed_str);
-
-	return collator;
-}
-
-static void
-init_icu_converter(void)
-{
-	const char *icu_encoding_name;
-	UErrorCode	status;
-	UConverter *conv;
-
-	if (icu_converter)
-		return;					/* already done */
-
-	icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
-	if (!icu_encoding_name)
-		ereport(ERROR,
-				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-				 errmsg("encoding \"%s\" not supported by ICU",
-						pg_encoding_to_char(GetDatabaseEncoding()))));
-
-	status = U_ZERO_ERROR;
-	conv = ucnv_open(icu_encoding_name, &status);
-	if (U_FAILURE(status))
-		ereport(ERROR,
-				(errmsg("could not open ICU converter for encoding \"%s\": %s",
-						icu_encoding_name, u_errorName(status))));
-
-	icu_converter = conv;
-}
-
-/*
- * Find length, in UChars, of given string if converted to UChar string.
- *
- * A length of -1 indicates that the input string is NUL-terminated.
- */
-static size_t
-uchar_length(UConverter *converter, const char *str, int32_t len)
-{
-	UErrorCode	status = U_ZERO_ERROR;
-	int32_t		ulen;
-
-	ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
-	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
-		ereport(ERROR,
-				(errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
-	return ulen;
-}
-
-/*
- * Convert the given source string into a UChar string, stored in dest, and
- * return the length (in UChars).
- *
- * A srclen of -1 indicates that the input string is NUL-terminated.
- */
-static int32_t
-uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
-			  const char *src, int32_t srclen)
-{
-	UErrorCode	status = U_ZERO_ERROR;
-	int32_t		ulen;
-
-	status = U_ZERO_ERROR;
-	ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
-	if (U_FAILURE(status))
-		ereport(ERROR,
-				(errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
-	return ulen;
-}
-
-/*
- * Convert a string in the database encoding into a string of UChars.
- *
- * The source string at buff is of length nbytes
- * (it needn't be nul-terminated)
- *
- * *buff_uchar receives a pointer to the palloc'd result string, and
- * the function's result is the number of UChars generated.
- *
- * The result string is nul-terminated, though most callers rely on the
- * result length instead.
- */
-int32_t
-icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
-{
-	int32_t		len_uchar;
-
-	init_icu_converter();
-
-	len_uchar = uchar_length(icu_converter, buff, nbytes);
-
-	*buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
-	len_uchar = uchar_convert(icu_converter,
-							  *buff_uchar, len_uchar + 1, buff, nbytes);
-
-	return len_uchar;
-}
-
-/*
- * Convert a string of UChars into the database encoding.
- *
- * The source string at buff_uchar is of length len_uchar
- * (it needn't be nul-terminated)
- *
- * *result receives a pointer to the palloc'd result string, and the
- * function's result is the number of bytes generated (not counting nul).
- *
- * The result string is nul-terminated.
- */
-int32_t
-icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar)
-{
-	UErrorCode	status;
-	int32_t		len_result;
-
-	init_icu_converter();
-
-	status = U_ZERO_ERROR;
-	len_result = ucnv_fromUChars(icu_converter, NULL, 0,
-								 buff_uchar, len_uchar, &status);
-	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
-		ereport(ERROR,
-				(errmsg("%s failed: %s", "ucnv_fromUChars",
-						u_errorName(status))));
-
-	*result = palloc(len_result + 1);
-
-	status = U_ZERO_ERROR;
-	len_result = ucnv_fromUChars(icu_converter, *result, len_result + 1,
-								 buff_uchar, len_uchar, &status);
-	if (U_FAILURE(status) ||
-		status == U_STRING_NOT_TERMINATED_WARNING)
-		ereport(ERROR,
-				(errmsg("%s failed: %s", "ucnv_fromUChars",
-						u_errorName(status))));
-
-	return len_result;
-}
-
-/*
- * Parse collation attributes from the given locale string and apply them to
- * the open collator.
- *
- * First, the locale string is canonicalized to an ICU format locale ID such
- * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
- * the key-value arguments.
- *
- * Starting with ICU version 54, the attributes are processed automatically by
- * ucol_open(), so this is only necessary for emulating this behavior on older
- * versions.
- */
-pg_attribute_unused()
-static void
-icu_set_collation_attributes(UCollator *collator, const char *loc,
-							 UErrorCode *status)
-{
-	int32_t		len;
-	char	   *icu_locale_id;
-	char	   *lower_str;
-	char	   *str;
-	char	   *token;
-
-	/*
-	 * The input locale may be a BCP 47 language tag, e.g.
-	 * "und-u-kc-ks-level1", which expresses the same attributes in a
-	 * different form. It will be converted to the equivalent ICU format
-	 * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
-	 * uloc_canonicalize().
-	 */
-	*status = U_ZERO_ERROR;
-	len = uloc_canonicalize(loc, NULL, 0, status);
-	icu_locale_id = palloc(len + 1);
-	*status = U_ZERO_ERROR;
-	len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
-	if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
-		return;
-
-	lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
-
-	pfree(icu_locale_id);
-
-	str = strchr(lower_str, '@');
-	if (!str)
-		return;
-	str++;
-
-	while ((token = strsep(&str, ";")))
-	{
-		char	   *e = strchr(token, '=');
-
-		if (e)
-		{
-			char	   *name;
-			char	   *value;
-			UColAttribute uattr;
-			UColAttributeValue uvalue;
-
-			*status = U_ZERO_ERROR;
-
-			*e = '\0';
-			name = token;
-			value = e + 1;
-
-			/*
-			 * See attribute name and value lists in ICU i18n/coll.cpp
-			 */
-			if (strcmp(name, "colstrength") == 0)
-				uattr = UCOL_STRENGTH;
-			else if (strcmp(name, "colbackwards") == 0)
-				uattr = UCOL_FRENCH_COLLATION;
-			else if (strcmp(name, "colcaselevel") == 0)
-				uattr = UCOL_CASE_LEVEL;
-			else if (strcmp(name, "colcasefirst") == 0)
-				uattr = UCOL_CASE_FIRST;
-			else if (strcmp(name, "colalternate") == 0)
-				uattr = UCOL_ALTERNATE_HANDLING;
-			else if (strcmp(name, "colnormalization") == 0)
-				uattr = UCOL_NORMALIZATION_MODE;
-			else if (strcmp(name, "colnumeric") == 0)
-				uattr = UCOL_NUMERIC_COLLATION;
-			else
-				/* ignore if unknown */
-				continue;
-
-			if (strcmp(value, "primary") == 0)
-				uvalue = UCOL_PRIMARY;
-			else if (strcmp(value, "secondary") == 0)
-				uvalue = UCOL_SECONDARY;
-			else if (strcmp(value, "tertiary") == 0)
-				uvalue = UCOL_TERTIARY;
-			else if (strcmp(value, "quaternary") == 0)
-				uvalue = UCOL_QUATERNARY;
-			else if (strcmp(value, "identical") == 0)
-				uvalue = UCOL_IDENTICAL;
-			else if (strcmp(value, "no") == 0)
-				uvalue = UCOL_OFF;
-			else if (strcmp(value, "yes") == 0)
-				uvalue = UCOL_ON;
-			else if (strcmp(value, "shifted") == 0)
-				uvalue = UCOL_SHIFTED;
-			else if (strcmp(value, "non-ignorable") == 0)
-				uvalue = UCOL_NON_IGNORABLE;
-			else if (strcmp(value, "lower") == 0)
-				uvalue = UCOL_LOWER_FIRST;
-			else if (strcmp(value, "upper") == 0)
-				uvalue = UCOL_UPPER_FIRST;
-			else
-			{
-				*status = U_ILLEGAL_ARGUMENT_ERROR;
-				break;
-			}
-
-			ucol_setAttribute(collator, uattr, uvalue, status);
-		}
-	}
-
-	pfree(lower_str);
-}
-#endif
-
 /*
  * Return the BCP47 language tag representation of the requested locale.
  *
@@ -3049,6 +1963,16 @@ icu_validate_locale(const char *loc_str)
 #endif							/* not USE_ICU */
 }
 
+/*
+ *
+ *TODO: add caching?
+ */
+int
+char_props(pg_wchar wc, int mask, pg_locale_t locale)
+{
+	return locale->ctype->char_props(wc, mask, locale);
+}
+
 /*
  * These functions convert from/to libc's wchar_t, *not* pg_wchar_t.
  * Therefore we keep them here rather than with the mbutils code.
diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c
new file mode 100644
index 00000000000..a9e8b4b642b
--- /dev/null
+++ b/src/backend/utils/adt/pg_locale_icu.c
@@ -0,0 +1,873 @@
+/*-----------------------------------------------------------------------
+ *
+ * PostgreSQL locale utilities for ICU
+ *
+ * Portions Copyright (c) 2002-2024, PostgreSQL Global Development Group
+ *
+ * src/backend/utils/adt/pg_locale_libc.c
+ *
+ *-----------------------------------------------------------------------
+ */
+
+
+#include "postgres.h"
+
+#ifdef USE_ICU
+
+#include <unicode/ucnv.h>
+#include <unicode/ustring.h>
+
+#include "access/htup_details.h"
+#include "catalog/pg_collation.h"
+#include "catalog/pg_database.h"
+#include "utils/builtins.h"
+#include "utils/formatting.h"
+#include "utils/memutils.h"
+#include "utils/pg_locale.h"
+#include "utils/resowner.h"
+#include "utils/syscache.h"
+
+/*
+ * This should be large enough that most strings will fit, but small enough
+ * that we feel comfortable putting it on the stack
+ */
+#define		TEXTBUFLEN			1024
+
+extern pg_locale_t icu_dat_create_locale(HeapTuple dattuple);
+extern pg_locale_t icu_coll_create_locale(MemoryContext context,
+										  ResourceOwner resowner,
+										  HeapTuple colltuple);
+extern UCollator *pg_ucol_open(const char *loc_str);
+
+
+static UCollator * make_icu_collator(const char *iculocstr,
+									 const char *icurules);
+
+static int strncoll_icu(const char *arg1, ssize_t len1,
+						const char *arg2, ssize_t len2,
+						pg_locale_t locale);
+static size_t strnxfrm_icu(char *dest, size_t destsize,
+						   const char *src, ssize_t srclen,
+						   pg_locale_t locale);
+static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
+								  const char *src, ssize_t srclen,
+								  pg_locale_t locale);
+
+static void ResourceOwnerRememberUCollator(ResourceOwner owner,
+										   UCollator *collator);
+static void ResOwnerReleaseUCollator(Datum val);
+
+static void init_icu_converter(void);
+static size_t uchar_length(UConverter *converter,
+						   const char *str, int32_t len);
+static int32_t uchar_convert(UConverter *converter,
+							 UChar *dest, int32_t destlen,
+							 const char *src, int32_t srclen);
+static void icu_set_collation_attributes(UCollator *collator, const char *loc,
+										 UErrorCode *status);
+
+/*
+ * Converter object for converting between ICU's UChar strings and C strings
+ * in database encoding.  Since the database encoding doesn't change, we only
+ * need one of these per session.
+ */
+static UConverter *icu_converter = NULL;
+
+static const ResourceOwnerDesc UCollatorResourceKind =
+{
+	.name = "UCollator reference",
+	.release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
+	.release_priority = RELEASE_PRIO_LAST,
+	.ReleaseResource = ResOwnerReleaseUCollator,
+	.DebugPrint = NULL                      /* the default message is fine */
+};
+
+
+static int
+char_props_icu(pg_wchar wc, int mask, pg_locale_t locale)
+{
+	int result = 0;
+
+	if ((mask & PG_ISDIGIT) && u_isdigit(wc))
+		result |= PG_ISDIGIT;
+	if ((mask & PG_ISALPHA) && u_isalpha(wc))
+		result |= PG_ISALPHA;
+	if ((mask & PG_ISUPPER) && u_isupper(wc))
+		result |= PG_ISUPPER;
+	if ((mask & PG_ISLOWER) && u_islower(wc))
+		result |= PG_ISLOWER;
+	if ((mask & PG_ISGRAPH) && u_isgraph(wc))
+		result |= PG_ISGRAPH;
+	if ((mask & PG_ISPRINT) && u_isprint(wc))
+		result |= PG_ISPRINT;
+	if ((mask & PG_ISPUNCT) && u_ispunct(wc))
+		result |= PG_ISPUNCT;
+	if ((mask & PG_ISSPACE) && u_isspace(wc))
+		result |= PG_ISSPACE;
+
+	return result;
+}
+
+static pg_wchar
+toupper_icu(pg_wchar wc, pg_locale_t locale)
+{
+	return u_toupper(wc);
+}
+
+static pg_wchar
+tolower_icu(pg_wchar wc, pg_locale_t locale)
+{
+	return u_tolower(wc);
+}
+
+struct collate_methods icu_collate_methods = {
+	.strncoll = strncoll_icu,
+	.strnxfrm = strnxfrm_icu,
+	.strnxfrm_prefix = strnxfrm_prefix_icu,
+};
+
+struct ctype_methods icu_ctype_methods = {
+	.char_props = char_props_icu,
+	.wc_toupper = toupper_icu,
+	.wc_tolower = tolower_icu,
+};
+
+static void
+ResourceOwnerRememberUCollator(ResourceOwner owner, UCollator *collator)
+{
+	ResourceOwnerRemember(owner, PointerGetDatum(collator),
+						  &UCollatorResourceKind);
+}
+
+static void
+ResOwnerReleaseUCollator(Datum val)
+{
+	UCollator *collator = (UCollator *) DatumGetPointer(val);
+	ucol_close(collator);
+}
+
+pg_locale_t
+icu_dat_create_locale(HeapTuple dattuple)
+{
+	Form_pg_database dbform;
+	Datum		datum;
+	char	   *datlocale;
+	char	   *icurules;
+	bool		isnull;
+	pg_locale_t result;
+
+	dbform = (Form_pg_database) GETSTRUCT(dattuple);
+
+	datum = SysCacheGetAttrNotNull(DATABASEOID, dattuple, Anum_pg_database_datlocale);
+	datlocale = TextDatumGetCString(datum);
+
+	datum = SysCacheGetAttr(DATABASEOID, dattuple, Anum_pg_database_daticurules, &isnull);
+	if (!isnull)
+		icurules = TextDatumGetCString(datum);
+	else
+		icurules = NULL;
+
+	result = MemoryContextAllocZero(TopMemoryContext,
+									sizeof(struct pg_locale_struct));
+
+	result->info.icu.locale = MemoryContextStrdup(TopMemoryContext,
+												  datlocale);
+	result->provider = dbform->datlocprovider;
+	result->deterministic = true;
+	result->collate_is_c = false;
+	result->ctype_is_c = false;
+	result->collate = &icu_collate_methods;
+	result->ctype = &icu_ctype_methods;
+	result->info.icu.ucol = make_icu_collator(datlocale, icurules);
+
+	return result;
+}
+
+pg_locale_t
+icu_coll_create_locale(MemoryContext context, ResourceOwner resowner,
+					   HeapTuple colltuple)
+{
+	Form_pg_collation collform;
+	Datum		datum;
+	bool		isnull;
+	const char *iculocstr;
+	const char *icurules;
+	UCollator  *collator;
+	pg_locale_t result;
+
+	collform = (Form_pg_collation) GETSTRUCT(colltuple);
+
+	Assert(collform->collprovider == COLLPROVIDER_ICU);
+
+	datum = SysCacheGetAttrNotNull(COLLOID, colltuple, Anum_pg_collation_colllocale);
+	iculocstr = TextDatumGetCString(datum);
+
+	datum = SysCacheGetAttr(COLLOID, colltuple, Anum_pg_collation_collicurules, &isnull);
+	if (!isnull)
+		icurules = TextDatumGetCString(datum);
+	else
+		icurules = NULL;
+
+	ResourceOwnerEnlarge(resowner);
+	collator = make_icu_collator(iculocstr, icurules);
+	ResourceOwnerRememberUCollator(resowner, collator);
+
+	result = MemoryContextAllocZero(context,
+									sizeof(struct pg_locale_struct));
+
+	result->info.icu.locale = MemoryContextStrdup(context, iculocstr);
+	result->provider = collform->collprovider;
+	result->deterministic = collform->collisdeterministic;
+	result->collate_is_c = false;
+	result->ctype_is_c = false;
+	result->collate = &icu_collate_methods;
+	result->ctype = &icu_ctype_methods;
+	result->info.icu.ucol = collator;
+
+	return result;
+}
+
+/*
+ * Create a UCollator with the given locale string and rules.
+ *
+ * Ensure that no path leaks a UCollator.
+ */
+static UCollator *
+make_icu_collator(const char *iculocstr, const char *icurules)
+{
+	if (!icurules)
+	{
+		/* simple case without rules */
+		return pg_ucol_open(iculocstr);
+	}
+	else
+	{
+		UCollator  *collator_std_rules;
+		UCollator  *collator_all_rules;
+		const UChar *std_rules;
+		UChar	   *my_rules;
+		UChar	   *all_rules;
+		int32_t		length;
+		int32_t		total;
+		UErrorCode	status;
+
+		/*
+		 * If rules are specified, we extract the rules of the standard
+		 * collation, add our own rules, and make a new collator with the
+		 * combined rules.
+		 */
+		icu_to_uchar(&my_rules, icurules, strlen(icurules));
+
+		collator_std_rules = pg_ucol_open(iculocstr);
+
+		std_rules = ucol_getRules(collator_std_rules, &length);
+
+		total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
+
+		/* avoid leaking collator on OOM */
+		all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
+		if (!all_rules)
+		{
+			ucol_close(collator_std_rules);
+			ereport(ERROR,
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("out of memory")));
+		}
+
+		u_strcpy(all_rules, std_rules);
+		u_strcat(all_rules, my_rules);
+
+		ucol_close(collator_std_rules);
+
+		status = U_ZERO_ERROR;
+		collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
+											UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
+											NULL, &status);
+		if (U_FAILURE(status))
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
+							iculocstr, icurules, u_errorName(status))));
+		}
+
+		return collator_all_rules;
+	}
+}
+
+
+/*
+ * strncoll_icu_no_utf8
+ *
+ * Convert the arguments from the database encoding to UChar strings, then
+ * call ucol_strcoll(). An argument length of -1 means that the string is
+ * NUL-terminated.
+ *
+ * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
+ * caller should call that instead.
+ */
+static int
+strncoll_icu_no_utf8(const char *arg1, ssize_t len1,
+					 const char *arg2, ssize_t len2, pg_locale_t locale)
+{
+	char		sbuf[TEXTBUFLEN];
+	char	   *buf = sbuf;
+	int32_t		ulen1;
+	int32_t		ulen2;
+	size_t		bufsize1;
+	size_t		bufsize2;
+	UChar	   *uchar1,
+			   *uchar2;
+	int			result;
+
+	Assert(locale->provider == COLLPROVIDER_ICU);
+#ifdef HAVE_UCOL_STRCOLLUTF8
+	Assert(GetDatabaseEncoding() != PG_UTF8);
+#endif
+
+	init_icu_converter();
+
+	ulen1 = uchar_length(icu_converter, arg1, len1);
+	ulen2 = uchar_length(icu_converter, arg2, len2);
+
+	bufsize1 = (ulen1 + 1) * sizeof(UChar);
+	bufsize2 = (ulen2 + 1) * sizeof(UChar);
+
+	if (bufsize1 + bufsize2 > TEXTBUFLEN)
+		buf = palloc(bufsize1 + bufsize2);
+
+	uchar1 = (UChar *) buf;
+	uchar2 = (UChar *) (buf + bufsize1);
+
+	ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
+	ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
+
+	result = ucol_strcoll(locale->info.icu.ucol,
+						  uchar1, ulen1,
+						  uchar2, ulen2);
+
+	if (buf != sbuf)
+		pfree(buf);
+
+	return result;
+}
+
+/*
+ * strncoll_icu
+ *
+ * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given
+ * database encoding. An argument length of -1 means the string is
+ * NUL-terminated.
+ */
+static int
+strncoll_icu(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
+			 pg_locale_t locale)
+{
+	int			result;
+
+	Assert(locale->provider == COLLPROVIDER_ICU);
+
+#ifdef HAVE_UCOL_STRCOLLUTF8
+	if (GetDatabaseEncoding() == PG_UTF8)
+	{
+		UErrorCode	status;
+
+		status = U_ZERO_ERROR;
+		result = ucol_strcollUTF8(locale->info.icu.ucol,
+								  arg1, len1,
+								  arg2, len2,
+								  &status);
+		if (U_FAILURE(status))
+			ereport(ERROR,
+					(errmsg("collation failed: %s", u_errorName(status))));
+	}
+	else
+#endif
+	{
+		result = strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale);
+	}
+
+	return result;
+}
+
+
+/* 'srclen' of -1 means the strings are NUL-terminated */
+static size_t
+strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
+			 pg_locale_t locale)
+{
+	char		sbuf[TEXTBUFLEN];
+	char	   *buf = sbuf;
+	UChar	   *uchar;
+	int32_t		ulen;
+	size_t		uchar_bsize;
+	Size		result_bsize;
+
+	Assert(locale->provider == COLLPROVIDER_ICU);
+
+	init_icu_converter();
+
+	ulen = uchar_length(icu_converter, src, srclen);
+
+	uchar_bsize = (ulen + 1) * sizeof(UChar);
+
+	if (uchar_bsize > TEXTBUFLEN)
+		buf = palloc(uchar_bsize);
+
+	uchar = (UChar *) buf;
+
+	ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
+
+	result_bsize = ucol_getSortKey(locale->info.icu.ucol,
+								   uchar, ulen,
+								   (uint8_t *) dest, destsize);
+
+	/*
+	 * ucol_getSortKey() counts the nul-terminator in the result length, but
+	 * this function should not.
+	 */
+	Assert(result_bsize > 0);
+	result_bsize--;
+
+	if (buf != sbuf)
+		pfree(buf);
+
+	/* if dest is defined, it should be nul-terminated */
+	Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
+
+	return result_bsize;
+}
+
+/* 'srclen' of -1 means the strings are NUL-terminated */
+static size_t
+strnxfrm_prefix_icu_no_utf8(char *dest,size_t destsize,
+							const char *src, ssize_t srclen,
+							pg_locale_t locale)
+{
+	char		sbuf[TEXTBUFLEN];
+	char	   *buf = sbuf;
+	UCharIterator iter;
+	uint32_t	state[2];
+	UErrorCode	status;
+	int32_t		ulen = -1;
+	UChar	   *uchar = NULL;
+	size_t		uchar_bsize;
+	Size		result_bsize;
+
+	Assert(locale->provider == COLLPROVIDER_ICU);
+	Assert(GetDatabaseEncoding() != PG_UTF8);
+
+	init_icu_converter();
+
+	ulen = uchar_length(icu_converter, src, srclen);
+
+	uchar_bsize = (ulen + 1) * sizeof(UChar);
+
+	if (uchar_bsize > TEXTBUFLEN)
+		buf = palloc(uchar_bsize);
+
+	uchar = (UChar *) buf;
+
+	ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
+
+	uiter_setString(&iter, uchar, ulen);
+	state[0] = state[1] = 0;	/* won't need that again */
+	status = U_ZERO_ERROR;
+	result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
+										&iter,
+										state,
+										(uint8_t *) dest,
+										destsize,
+										&status);
+	if (U_FAILURE(status))
+		ereport(ERROR,
+				(errmsg("sort key generation failed: %s",
+						u_errorName(status))));
+
+	return result_bsize;
+}
+
+/* 'srclen' of -1 means the strings are NUL-terminated */
+static size_t
+strnxfrm_prefix_icu(char *dest, size_t destsize,
+					const char *src, ssize_t srclen,
+					pg_locale_t locale)
+{
+	size_t		result;
+
+	Assert(locale->provider == COLLPROVIDER_ICU);
+
+	if (GetDatabaseEncoding() == PG_UTF8)
+	{
+		UCharIterator iter;
+		uint32_t	state[2];
+		UErrorCode	status;
+
+		uiter_setUTF8(&iter, src, srclen);
+		state[0] = state[1] = 0;	/* won't need that again */
+		status = U_ZERO_ERROR;
+		result = ucol_nextSortKeyPart(locale->info.icu.ucol,
+									  &iter,
+									  state,
+									  (uint8_t *) dest,
+									  destsize,
+									  &status);
+		if (U_FAILURE(status))
+			ereport(ERROR,
+					(errmsg("sort key generation failed: %s",
+							u_errorName(status))));
+	}
+	else
+		result = strnxfrm_prefix_icu_no_utf8(dest, destsize, src, srclen,
+											 locale);
+
+	return result;
+}
+
+/*
+ * Wrapper around ucol_open() to handle API differences for older ICU
+ * versions.
+ *
+ * Ensure that no path leaks a UCollator.
+ */
+UCollator *
+pg_ucol_open(const char *loc_str)
+{
+	UCollator  *collator;
+	UErrorCode	status;
+	const char *orig_str = loc_str;
+	char	   *fixed_str = NULL;
+
+	/*
+	 * Must never open default collator, because it depends on the environment
+	 * and may change at any time. Should not happen, but check here to catch
+	 * bugs that might be hard to catch otherwise.
+	 *
+	 * NB: the default collator is not the same as the collator for the root
+	 * locale. The root locale may be specified as the empty string, "und", or
+	 * "root". The default collator is opened by passing NULL to ucol_open().
+	 */
+	if (loc_str == NULL)
+		elog(ERROR, "opening default collator is not supported");
+
+	/*
+	 * In ICU versions 54 and earlier, "und" is not a recognized spelling of
+	 * the root locale. If the first component of the locale is "und", replace
+	 * with "root" before opening.
+	 */
+	if (U_ICU_VERSION_MAJOR_NUM < 55)
+	{
+		char		lang[ULOC_LANG_CAPACITY];
+
+		status = U_ZERO_ERROR;
+		uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
+		if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
+		{
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("could not get language from locale \"%s\": %s",
+							loc_str, u_errorName(status))));
+		}
+
+		if (strcmp(lang, "und") == 0)
+		{
+			const char *remainder = loc_str + strlen("und");
+
+			fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
+			strcpy(fixed_str, "root");
+			strcat(fixed_str, remainder);
+
+			loc_str = fixed_str;
+		}
+	}
+
+	status = U_ZERO_ERROR;
+	collator = ucol_open(loc_str, &status);
+	if (U_FAILURE(status))
+		ereport(ERROR,
+		/* use original string for error report */
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("could not open collator for locale \"%s\": %s",
+						orig_str, u_errorName(status))));
+
+	if (U_ICU_VERSION_MAJOR_NUM < 54)
+	{
+		status = U_ZERO_ERROR;
+		icu_set_collation_attributes(collator, loc_str, &status);
+
+		/*
+		 * Pretend the error came from ucol_open(), for consistent error
+		 * message across ICU versions.
+		 */
+		if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
+		{
+			ucol_close(collator);
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("could not open collator for locale \"%s\": %s",
+							orig_str, u_errorName(status))));
+		}
+	}
+
+	if (fixed_str != NULL)
+		pfree(fixed_str);
+
+	return collator;
+}
+
+static void
+init_icu_converter(void)
+{
+	const char *icu_encoding_name;
+	UErrorCode	status;
+	UConverter *conv;
+
+	if (icu_converter)
+		return;					/* already done */
+
+	icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
+	if (!icu_encoding_name)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("encoding \"%s\" not supported by ICU",
+						pg_encoding_to_char(GetDatabaseEncoding()))));
+
+	status = U_ZERO_ERROR;
+	conv = ucnv_open(icu_encoding_name, &status);
+	if (U_FAILURE(status))
+		ereport(ERROR,
+				(errmsg("could not open ICU converter for encoding \"%s\": %s",
+						icu_encoding_name, u_errorName(status))));
+
+	icu_converter = conv;
+}
+
+/*
+ * Find length, in UChars, of given string if converted to UChar string.
+ *
+ * A length of -1 indicates that the input string is NUL-terminated.
+ */
+static size_t
+uchar_length(UConverter *converter, const char *str, int32_t len)
+{
+	UErrorCode	status = U_ZERO_ERROR;
+	int32_t		ulen;
+
+	ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
+	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
+		ereport(ERROR,
+				(errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
+	return ulen;
+}
+
+/*
+ * Convert the given source string into a UChar string, stored in dest, and
+ * return the length (in UChars).
+ *
+ * A srclen of -1 indicates that the input string is NUL-terminated.
+ */
+static int32_t
+uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
+			  const char *src, int32_t srclen)
+{
+	UErrorCode	status = U_ZERO_ERROR;
+	int32_t		ulen;
+
+	status = U_ZERO_ERROR;
+	ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
+	if (U_FAILURE(status))
+		ereport(ERROR,
+				(errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
+	return ulen;
+}
+
+/*
+ * Convert a string in the database encoding into a string of UChars.
+ *
+ * The source string at buff is of length nbytes
+ * (it needn't be nul-terminated)
+ *
+ * *buff_uchar receives a pointer to the palloc'd result string, and
+ * the function's result is the number of UChars generated.
+ *
+ * The result string is nul-terminated, though most callers rely on the
+ * result length instead.
+ */
+int32_t
+icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
+{
+	int32_t		len_uchar;
+
+	init_icu_converter();
+
+	len_uchar = uchar_length(icu_converter, buff, nbytes);
+
+	*buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
+	len_uchar = uchar_convert(icu_converter,
+							  *buff_uchar, len_uchar + 1, buff, nbytes);
+
+	return len_uchar;
+}
+
+/*
+ * Convert a string of UChars into the database encoding.
+ *
+ * The source string at buff_uchar is of length len_uchar
+ * (it needn't be nul-terminated)
+ *
+ * *result receives a pointer to the palloc'd result string, and the
+ * function's result is the number of bytes generated (not counting nul).
+ *
+ * The result string is nul-terminated.
+ */
+int32_t
+icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar)
+{
+	UErrorCode	status;
+	int32_t		len_result;
+
+	init_icu_converter();
+
+	status = U_ZERO_ERROR;
+	len_result = ucnv_fromUChars(icu_converter, NULL, 0,
+								 buff_uchar, len_uchar, &status);
+	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
+		ereport(ERROR,
+				(errmsg("%s failed: %s", "ucnv_fromUChars",
+						u_errorName(status))));
+
+	*result = palloc(len_result + 1);
+
+	status = U_ZERO_ERROR;
+	len_result = ucnv_fromUChars(icu_converter, *result, len_result + 1,
+								 buff_uchar, len_uchar, &status);
+	if (U_FAILURE(status) ||
+		status == U_STRING_NOT_TERMINATED_WARNING)
+		ereport(ERROR,
+				(errmsg("%s failed: %s", "ucnv_fromUChars",
+						u_errorName(status))));
+
+	return len_result;
+}
+
+/*
+ * Parse collation attributes from the given locale string and apply them to
+ * the open collator.
+ *
+ * First, the locale string is canonicalized to an ICU format locale ID such
+ * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
+ * the key-value arguments.
+ *
+ * Starting with ICU version 54, the attributes are processed automatically by
+ * ucol_open(), so this is only necessary for emulating this behavior on older
+ * versions.
+ */
+pg_attribute_unused()
+static void
+icu_set_collation_attributes(UCollator *collator, const char *loc,
+							 UErrorCode *status)
+{
+	int32_t		len;
+	char	   *icu_locale_id;
+	char	   *lower_str;
+	char	   *str;
+	char	   *token;
+
+	/*
+	 * The input locale may be a BCP 47 language tag, e.g.
+	 * "und-u-kc-ks-level1", which expresses the same attributes in a
+	 * different form. It will be converted to the equivalent ICU format
+	 * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
+	 * uloc_canonicalize().
+	 */
+	*status = U_ZERO_ERROR;
+	len = uloc_canonicalize(loc, NULL, 0, status);
+	icu_locale_id = palloc(len + 1);
+	*status = U_ZERO_ERROR;
+	len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
+	if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
+		return;
+
+	lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
+
+	pfree(icu_locale_id);
+
+	str = strchr(lower_str, '@');
+	if (!str)
+		return;
+	str++;
+
+	while ((token = strsep(&str, ";")))
+	{
+		char	   *e = strchr(token, '=');
+
+		if (e)
+		{
+			char	   *name;
+			char	   *value;
+			UColAttribute uattr;
+			UColAttributeValue uvalue;
+
+			*status = U_ZERO_ERROR;
+
+			*e = '\0';
+			name = token;
+			value = e + 1;
+
+			/*
+			 * See attribute name and value lists in ICU i18n/coll.cpp
+			 */
+			if (strcmp(name, "colstrength") == 0)
+				uattr = UCOL_STRENGTH;
+			else if (strcmp(name, "colbackwards") == 0)
+				uattr = UCOL_FRENCH_COLLATION;
+			else if (strcmp(name, "colcaselevel") == 0)
+				uattr = UCOL_CASE_LEVEL;
+			else if (strcmp(name, "colcasefirst") == 0)
+				uattr = UCOL_CASE_FIRST;
+			else if (strcmp(name, "colalternate") == 0)
+				uattr = UCOL_ALTERNATE_HANDLING;
+			else if (strcmp(name, "colnormalization") == 0)
+				uattr = UCOL_NORMALIZATION_MODE;
+			else if (strcmp(name, "colnumeric") == 0)
+				uattr = UCOL_NUMERIC_COLLATION;
+			else
+				/* ignore if unknown */
+				continue;
+
+			if (strcmp(value, "primary") == 0)
+				uvalue = UCOL_PRIMARY;
+			else if (strcmp(value, "secondary") == 0)
+				uvalue = UCOL_SECONDARY;
+			else if (strcmp(value, "tertiary") == 0)
+				uvalue = UCOL_TERTIARY;
+			else if (strcmp(value, "quaternary") == 0)
+				uvalue = UCOL_QUATERNARY;
+			else if (strcmp(value, "identical") == 0)
+				uvalue = UCOL_IDENTICAL;
+			else if (strcmp(value, "no") == 0)
+				uvalue = UCOL_OFF;
+			else if (strcmp(value, "yes") == 0)
+				uvalue = UCOL_ON;
+			else if (strcmp(value, "shifted") == 0)
+				uvalue = UCOL_SHIFTED;
+			else if (strcmp(value, "non-ignorable") == 0)
+				uvalue = UCOL_NON_IGNORABLE;
+			else if (strcmp(value, "lower") == 0)
+				uvalue = UCOL_LOWER_FIRST;
+			else if (strcmp(value, "upper") == 0)
+				uvalue = UCOL_UPPER_FIRST;
+			else
+			{
+				*status = U_ILLEGAL_ARGUMENT_ERROR;
+				break;
+			}
+
+			ucol_setAttribute(collator, uattr, uvalue, status);
+		}
+	}
+
+	pfree(lower_str);
+}
+
+#endif					/* USE_ICU */
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
new file mode 100644
index 00000000000..6eb8b80fdf9
--- /dev/null
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -0,0 +1,604 @@
+/*-----------------------------------------------------------------------
+ *
+ * PostgreSQL locale utilities for libc
+ *
+ * Portions Copyright (c) 2002-2024, PostgreSQL Global Development Group
+ *
+ * src/backend/utils/adt/pg_locale_libc.c
+ *
+ *-----------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <limits.h>
+#include <wctype.h>
+
+#include "access/htup_details.h"
+#include "catalog/pg_collation.h"
+#include "catalog/pg_database.h"
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+#include "utils/pg_locale.h"
+#include "utils/resowner.h"
+#include "utils/syscache.h"
+
+/*
+ * This should be large enough that most strings will fit, but small enough
+ * that we feel comfortable putting it on the stack
+ */
+#define		TEXTBUFLEN			1024
+
+extern pg_locale_t libc_dat_create_locale(HeapTuple dattuple);
+extern pg_locale_t libc_coll_create_locale(MemoryContext context,
+										   ResourceOwner resowner,
+										   HeapTuple colltuple);
+
+static int strncoll_libc(const char *arg1, ssize_t len1,
+						 const char *arg2, ssize_t len2,
+						 pg_locale_t locale);
+static size_t strnxfrm_libc(char *dest, size_t destsize,
+							const char *src, ssize_t srclen,
+							pg_locale_t locale);
+
+static int char_props_libc(pg_wchar wc, int mask, pg_locale_t locale);
+static pg_wchar toupper_libc(pg_wchar wc, pg_locale_t locale);
+static pg_wchar tolower_libc(pg_wchar wc, pg_locale_t locale);
+
+static void ResourceOwnerRememberLocaleT(ResourceOwner resowner,
+										 locale_t locale);
+static void ResOwnerReleaseLocaleT(Datum val);
+
+static locale_t make_libc_collator(const char *collate, const char *ctype);
+
+static void report_newlocale_failure(const char *localename);
+
+static const ResourceOwnerDesc LocaleTResourceKind =
+{
+	.name = "locale_t reference",
+	.release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
+	.release_priority = RELEASE_PRIO_LAST,
+	.ReleaseResource = ResOwnerReleaseLocaleT,
+	.DebugPrint = NULL                      /* the default message is fine */
+};
+
+struct collate_methods libc_collate_methods = {
+	.strncoll = strncoll_libc,
+	.strnxfrm = strnxfrm_libc,
+	.strnxfrm_prefix = NULL,
+};
+
+struct ctype_methods libc_ctype_methods = {
+	.char_props = char_props_libc,
+	.wc_toupper = toupper_libc,
+	.wc_tolower = tolower_libc,
+};
+
+pg_locale_t
+libc_dat_create_locale(HeapTuple dattuple)
+{
+	Form_pg_database dbform;
+	Datum		datum;
+	const char *datcollate;
+	const char *datctype;
+	pg_locale_t	result;
+
+	dbform = (Form_pg_database) GETSTRUCT(dattuple);
+	
+	Assert(dbform->datlocprovider == COLLPROVIDER_LIBC);
+
+	datum = SysCacheGetAttrNotNull(DATABASEOID, dattuple, Anum_pg_database_datcollate);
+	datcollate = TextDatumGetCString(datum);
+	datum = SysCacheGetAttrNotNull(DATABASEOID, dattuple, Anum_pg_database_datctype);
+	datctype = TextDatumGetCString(datum);
+
+	result = MemoryContextAllocZero(TopMemoryContext,
+									sizeof(struct pg_locale_struct));
+
+	result->provider = dbform->datlocprovider;
+	result->deterministic = true;
+	result->collate_is_c = (strcmp(datcollate, "C") == 0) ||
+		(strcmp(datcollate, "POSIX") == 0);
+	result->ctype_is_c = (strcmp(datctype, "C") == 0) ||
+		(strcmp(datctype, "POSIX") == 0);
+
+	if (!result->collate_is_c)
+		result->collate = &libc_collate_methods;
+	if (!result->ctype_is_c)
+		result->ctype = &libc_ctype_methods;
+	result->info.lt = make_libc_collator(datcollate, datctype);
+
+	return result;
+}
+
+pg_locale_t
+libc_coll_create_locale(MemoryContext context, ResourceOwner resowner,
+						HeapTuple colltuple)
+{
+	Form_pg_collation collform;
+	Datum		datum;
+	const char *collcollate;
+	const char *collctype;
+	locale_t	locale;
+	pg_locale_t result;
+
+	collform = (Form_pg_collation) GETSTRUCT(colltuple);
+
+	datum = SysCacheGetAttrNotNull(COLLOID, colltuple, Anum_pg_collation_collcollate);
+	collcollate = TextDatumGetCString(datum);
+	datum = SysCacheGetAttrNotNull(COLLOID, colltuple, Anum_pg_collation_collctype);
+	collctype = TextDatumGetCString(datum);
+
+	ResourceOwnerEnlarge(resowner);
+	locale = make_libc_collator(collcollate, collctype);
+	if (locale)
+		ResourceOwnerRememberLocaleT(resowner, locale);
+
+	result = MemoryContextAllocZero(context,
+									sizeof(struct pg_locale_struct));
+
+	result->provider = collform->collprovider;
+	result->deterministic = collform->collisdeterministic;
+	result->collate_is_c = (strcmp(collcollate, "C") == 0) ||
+		(strcmp(collcollate, "POSIX") == 0);
+	result->ctype_is_c = (strcmp(collctype, "C") == 0) ||
+		(strcmp(collctype, "POSIX") == 0);
+	if (!result->collate_is_c)
+		result->collate = &libc_collate_methods;
+	if (!result->ctype_is_c)
+		result->ctype = &libc_ctype_methods;
+	result->info.lt = locale;
+
+	return result;
+}
+
+static void
+ResourceOwnerRememberLocaleT(ResourceOwner resowner, locale_t locale)
+{
+	ResourceOwnerRemember(resowner, PointerGetDatum(locale),
+						  &LocaleTResourceKind);
+}
+
+static void
+ResOwnerReleaseLocaleT(Datum val)
+{
+	locale_t locale = (locale_t) DatumGetPointer(val);
+#ifndef WIN32
+	freelocale(locale);
+#else
+	_free_locale(locale);
+#endif
+}
+
+static int
+char_props_libc(pg_wchar wc, int mask, pg_locale_t locale)
+{
+	int result = 0;
+
+	Assert(!locale->ctype_is_c);
+
+	if (mask & PG_ISDIGIT)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF))
+		{
+			if (iswdigit_l((wint_t) wc, locale->info.lt))
+				result |= PG_ISDIGIT;
+		}
+		else
+		{
+			if (wc <= (pg_wchar) UCHAR_MAX &&
+				isdigit_l((unsigned char) wc, locale->info.lt))
+				result |= PG_ISDIGIT;
+		}
+	}
+	if (mask & PG_ISALPHA)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF))
+		{
+			if (iswalpha_l((wint_t) wc, locale->info.lt))
+				result |= PG_ISALPHA;
+		}
+		else
+		{
+			if (wc <= (pg_wchar) UCHAR_MAX &&
+				isalpha_l((unsigned char) wc, locale->info.lt))
+				result |= PG_ISALPHA;
+		}
+	}
+	if (mask & PG_ISUPPER)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF))
+		{
+			if (iswupper_l((wint_t) wc, locale->info.lt))
+				result |= PG_ISUPPER;
+		}
+		else
+		{
+			if (wc <= (pg_wchar) UCHAR_MAX &&
+				isupper_l((unsigned char) wc, locale->info.lt))
+				result |= PG_ISUPPER;
+		}
+	}
+	if (mask & PG_ISLOWER)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF))
+		{
+			if (iswlower_l((wint_t) wc, locale->info.lt))
+				result |= PG_ISLOWER;
+		}
+		else
+		{
+			if (wc <= (pg_wchar) UCHAR_MAX &&
+				islower_l((unsigned char) wc, locale->info.lt))
+				result |= PG_ISLOWER;
+		}
+	}
+	if (mask & PG_ISGRAPH)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF))
+		{
+			if (iswgraph_l((wint_t) wc, locale->info.lt))
+				result |= PG_ISGRAPH;
+		}
+		else
+		{
+			if (wc <= (pg_wchar) UCHAR_MAX &&
+				isgraph_l((unsigned char) wc, locale->info.lt))
+				result |= PG_ISGRAPH;
+		}
+	}
+	if (mask & PG_ISPRINT)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF))
+		{
+			if (iswprint_l((wint_t) wc, locale->info.lt))
+				result |= PG_ISPRINT;
+		}
+		else
+		{
+			if (wc <= (pg_wchar) UCHAR_MAX &&
+				isprint_l((unsigned char) wc, locale->info.lt))
+				result |= PG_ISPRINT;
+		}
+	}
+	if (mask & PG_ISPUNCT)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF))
+		{
+			if (iswpunct_l((wint_t) wc, locale->info.lt))
+				result |= PG_ISPUNCT;
+		}
+		else
+		{
+			if (wc <= (pg_wchar) UCHAR_MAX &&
+				ispunct_l((unsigned char) wc, locale->info.lt))
+				result |= PG_ISPUNCT;
+		}
+	}
+	if (mask & PG_ISSPACE)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF))
+		{
+			if (iswspace_l((wint_t) wc, locale->info.lt))
+				result |= PG_ISSPACE;
+		}
+		else
+		{
+			if (wc <= (pg_wchar) UCHAR_MAX &&
+				isspace_l((unsigned char) wc, locale->info.lt))
+				result |= PG_ISSPACE;
+		}
+	}
+
+	return result;
+}
+
+static pg_wchar
+toupper_libc(pg_wchar wc, pg_locale_t locale)
+{
+	if (GetDatabaseEncoding() == PG_UTF8 &&
+		(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF))
+		return towupper_l((wint_t) wc, locale->info.lt);
+	else if (wc <= (pg_wchar) UCHAR_MAX)
+		return toupper_l((unsigned char) wc, locale->info.lt);
+	else
+		return wc;
+}
+
+static pg_wchar
+tolower_libc(pg_wchar wc, pg_locale_t locale)
+{
+	if (GetDatabaseEncoding() == PG_UTF8 &&
+		(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF))
+		return towlower_l((wint_t) wc, locale->info.lt);
+	else if (wc <= (pg_wchar) UCHAR_MAX)
+		return tolower_l((unsigned char) wc, locale->info.lt);
+	else
+		return wc;
+}
+
+/*
+ * Create a locale_t with the given collation and ctype.
+ *
+ * The "C" and "POSIX" locales are not actually handled by libc, so return
+ * NULL.
+ *
+ * Ensure that no path leaks a locale_t.
+ */
+static locale_t
+make_libc_collator(const char *collate, const char *ctype)
+{
+	locale_t	loc = 0;
+
+	if (strcmp(collate, ctype) == 0)
+	{
+		if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
+		{
+			/* Normal case where they're the same */
+			errno = 0;
+#ifndef WIN32
+			loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
+							NULL);
+#else
+			loc = _create_locale(LC_ALL, collate);
+#endif
+			if (!loc)
+				report_newlocale_failure(collate);
+		}
+	}
+	else
+	{
+#ifndef WIN32
+		/* We need two newlocale() steps */
+		locale_t	loc1 = 0;
+
+		if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
+		{
+			errno = 0;
+			loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
+			if (!loc1)
+				report_newlocale_failure(collate);
+		}
+
+		if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
+		{
+			errno = 0;
+			loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
+			if (!loc)
+			{
+				if (loc1)
+					freelocale(loc1);
+				report_newlocale_failure(ctype);
+			}
+		}
+		else
+			loc = loc1;
+#else
+
+		/*
+		 * XXX The _create_locale() API doesn't appear to support this. Could
+		 * perhaps be worked around by changing pg_locale_t to contain two
+		 * separate fields.
+		 */
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("collations with different collate and ctype values are not supported on this platform")));
+#endif
+	}
+
+	return loc;
+}
+
+/*
+ * strncoll_libc_win32_utf8
+ *
+ * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
+ * invoke wcscoll_l().
+ *
+ * An input string length of -1 means that it's NUL-terminated.
+ */
+#ifdef WIN32
+static int
+strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
+						 ssize_t len2, pg_locale_t locale)
+{
+	char		sbuf[TEXTBUFLEN];
+	char	   *buf = sbuf;
+	char	   *a1p,
+			   *a2p;
+	int			a1len;
+	int			a2len;
+	int			r;
+	int			result;
+
+	Assert(locale->provider == COLLPROVIDER_LIBC);
+	Assert(GetDatabaseEncoding() == PG_UTF8);
+#ifndef WIN32
+	Assert(false);
+#endif
+
+	if (len1 == -1)
+		len1 = strlen(arg1);
+	if (len2 == -1)
+		len2 = strlen(arg2);
+
+	a1len = len1 * 2 + 2;
+	a2len = len2 * 2 + 2;
+
+	if (a1len + a2len > TEXTBUFLEN)
+		buf = palloc(a1len + a2len);
+
+	a1p = buf;
+	a2p = buf + a1len;
+
+	/* API does not work for zero-length input */
+	if (len1 == 0)
+		r = 0;
+	else
+	{
+		r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
+								(LPWSTR) a1p, a1len / 2);
+		if (!r)
+			ereport(ERROR,
+					(errmsg("could not convert string to UTF-16: error code %lu",
+							GetLastError())));
+	}
+	((LPWSTR) a1p)[r] = 0;
+
+	if (len2 == 0)
+		r = 0;
+	else
+	{
+		r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
+								(LPWSTR) a2p, a2len / 2);
+		if (!r)
+			ereport(ERROR,
+					(errmsg("could not convert string to UTF-16: error code %lu",
+							GetLastError())));
+	}
+	((LPWSTR) a2p)[r] = 0;
+
+	errno = 0;
+	result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt);
+	if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw headers */
+		ereport(ERROR,
+				(errmsg("could not compare Unicode strings: %m")));
+
+	if (buf != sbuf)
+		pfree(buf);
+
+	return result;
+}
+#endif							/* WIN32 */
+
+/*
+ * strncoll_libc
+ *
+ * An input string length of -1 means that it's NUL-terminated.
+ */
+static int
+strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
+			  pg_locale_t locale)
+{
+	char		sbuf[TEXTBUFLEN];
+	char	   *buf = sbuf;
+	size_t		bufsize1 = (len1 == -1) ? 0 : len1 + 1;
+	size_t		bufsize2 = (len2 == -1) ? 0 : len2 + 1;
+	const char *arg1n;
+	const char *arg2n;
+	int			result;
+
+	Assert(locale->provider == COLLPROVIDER_LIBC);
+
+#ifdef WIN32
+	/* check for this case before doing the work for nul-termination */
+	if (GetDatabaseEncoding() == PG_UTF8)
+		return strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
+#endif							/* WIN32 */
+
+	if (bufsize1 + bufsize2 > TEXTBUFLEN)
+		buf = palloc(bufsize1 + bufsize2);
+
+	/* nul-terminate arguments if necessary */
+	if (len1 == -1)
+	{
+		arg1n = arg1;
+	}
+	else
+	{
+		char *buf1 = buf;
+		memcpy(buf1, arg1, len1);
+		buf1[len1] = '\0';
+		arg1n = buf1;
+	}
+
+	if (len2 == -1)
+	{
+		arg2n = arg2;
+	}
+	else
+	{
+		char *buf2 = buf + bufsize1;
+		memcpy(buf2, arg2, len2);
+		buf2[len2] = '\0';
+		arg2n = buf2;
+	}
+
+	result = strcoll_l(arg1n, arg2n, locale->info.lt);
+
+	if (buf != sbuf)
+		pfree(buf);
+
+	return result;
+}
+
+static size_t
+strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
+			  pg_locale_t locale)
+{
+	char		sbuf[TEXTBUFLEN];
+	char	   *buf = sbuf;
+	size_t		bufsize = srclen + 1;
+	size_t		result;
+
+	Assert(locale->provider == COLLPROVIDER_LIBC);
+
+	if (srclen == -1)
+		return strxfrm_l(dest, src, destsize, locale->info.lt);
+
+	if (bufsize > TEXTBUFLEN)
+		buf = palloc(bufsize);
+
+	/* nul-terminate argument */
+	memcpy(buf, src, srclen);
+	buf[srclen] = '\0';
+
+	result = strxfrm_l(dest, buf, destsize, locale->info.lt);
+
+	if (buf != sbuf)
+		pfree(buf);
+
+	/* if dest is defined, it should be nul-terminated */
+	Assert(result >= destsize || dest[result] == '\0');
+
+	return result;
+}
+
+/* simple subroutine for reporting errors from newlocale() */
+static void
+report_newlocale_failure(const char *localename)
+{
+	int			save_errno;
+
+	/*
+	 * Windows doesn't provide any useful error indication from
+	 * _create_locale(), and BSD-derived platforms don't seem to feel they
+	 * need to set errno either (even though POSIX is pretty clear that
+	 * newlocale should do so).  So, if errno hasn't been set, assume ENOENT
+	 * is what to report.
+	 */
+	if (errno == 0)
+		errno = ENOENT;
+
+	/*
+	 * ENOENT means "no such locale", not "no such file", so clarify that
+	 * errno with an errdetail message.
+	 */
+	save_errno = errno;			/* auxiliary funcs might change errno */
+	ereport(ERROR,
+			(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+			 errmsg("could not create locale \"%s\": %m",
+					localename),
+			 (save_errno == ENOENT ?
+			  errdetail("The operating system could not find any locale data for the locale name \"%s\".",
+						localename) : 0)));
+}
+
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 3b443df8014..95ba7940b95 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -12,6 +12,7 @@
 #ifndef _PG_LOCALE_
 #define _PG_LOCALE_
 
+#include "mb/pg_wchar.h"
 #if defined(LOCALE_T_IN_XLOCALE) || defined(WCSTOMBS_L_IN_XLOCALE)
 #include <xlocale.h>
 #endif
@@ -19,6 +20,19 @@
 #include <unicode/ucol.h>
 #endif
 
+/*
+ * Character properties for regular expressions.
+ */
+#define PG_ISDIGIT	0x01
+#define PG_ISALPHA	0x02
+#define PG_ISALNUM	(PG_ISDIGIT | PG_ISALPHA)
+#define PG_ISUPPER	0x04
+#define PG_ISLOWER	0x08
+#define PG_ISGRAPH	0x10
+#define PG_ISPRINT	0x20
+#define PG_ISPUNCT	0x40
+#define PG_ISSPACE	0x80
+
 #ifdef USE_ICU
 /*
  * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
@@ -62,6 +76,28 @@ extern struct lconv *PGLC_localeconv(void);
 
 extern void cache_locale_time(void);
 
+struct pg_locale_struct;
+typedef struct pg_locale_struct *pg_locale_t;
+
+struct collate_methods
+{
+	int	(*strncoll)(const char *arg1, ssize_t len1,
+					const char *arg2, ssize_t len2,
+					pg_locale_t locale);
+	size_t (*strnxfrm)(char *dest, size_t destsize,
+					   const char *src, ssize_t srclen,
+					   pg_locale_t locale);
+	size_t (*strnxfrm_prefix)(char *dest, size_t destsize,
+							  const char *src, ssize_t srclen,
+							  pg_locale_t locale);
+};
+
+struct ctype_methods
+{
+	int (*char_props)(pg_wchar wc, int mask, pg_locale_t locale);
+	pg_wchar (*wc_toupper)(pg_wchar wc, pg_locale_t locale);
+	pg_wchar (*wc_tolower)(pg_wchar wc, pg_locale_t locale);
+};
 
 /*
  * We use a discriminated union to hold either a locale_t or an ICU collator.
@@ -85,6 +121,10 @@ struct pg_locale_struct
 	bool		deterministic;
 	bool		collate_is_c;
 	bool		ctype_is_c;
+
+	struct collate_methods *collate;
+	struct ctype_methods *ctype;
+
 	union
 	{
 		struct
@@ -102,8 +142,6 @@ struct pg_locale_struct
 	}			info;
 };
 
-typedef struct pg_locale_struct *pg_locale_t;
-
 extern void init_database_collation(void);
 extern pg_locale_t pg_newlocale_from_collation(Oid collid);
 
@@ -132,6 +170,8 @@ extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
 extern int32_t icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar);
 #endif
 
+extern int char_props(pg_wchar wc, int mask, pg_locale_t locale);
+
 /* These functions convert from/to libc's wchar_t, *not* pg_wchar_t */
 extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen,
 						 pg_locale_t locale);
-- 
2.34.1

v4-0006-Allow-length-1-for-NUL-terminated-input-to-pg_str.patchtext/x-patch; charset=UTF-8; name=v4-0006-Allow-length-1-for-NUL-terminated-input-to-pg_str.patchDownload

From 56e4fbb3ccd2927e4cc92b4201632361e2b16abb Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Wed, 21 Aug 2024 10:59:28 -0700
Subject: [PATCH v4 6/7] Allow length=-1 for NUL-terminated input to
 pg_strncoll(), etc.

Like ICU, allow a length of -1 to be specified for NUL-terminated
arguments to pg_strncoll(), pg_strnxfrm(), and pg_strnxfrm_prefix().

Simplifies the code and comments.
---
 src/backend/utils/adt/pg_locale.c | 256 ++++++++++++------------------
 src/include/utils/pg_locale.h     |   8 +-
 2 files changed, 104 insertions(+), 160 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index c89ac3b9e01..cfba55a6e31 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1886,22 +1886,24 @@ get_collation_actual_version(char collprovider, const char *collcollate)
 }
 
 /*
- * pg_strncoll_libc_win32_utf8
+ * strncoll_libc_win32_utf8
  *
  * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
  * invoke wcscoll_l().
+ *
+ * An input string length of -1 means that it's NUL-terminated.
  */
 #ifdef WIN32
 static int
-pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
-							size_t len2, pg_locale_t locale)
+strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
+						 ssize_t len2, pg_locale_t locale)
 {
 	char		sbuf[TEXTBUFLEN];
 	char	   *buf = sbuf;
 	char	   *a1p,
 			   *a2p;
-	int			a1len = len1 * 2 + 2;
-	int			a2len = len2 * 2 + 2;
+	int			a1len;
+	int			a2len;
 	int			r;
 	int			result;
 
@@ -1911,6 +1913,14 @@ pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
 	Assert(false);
 #endif
 
+	if (len1 == -1)
+		len1 = strlen(arg1);
+	if (len2 == -1)
+		len2 = strlen(arg2);
+
+	a1len = len1 * 2 + 2;
+	a2len = len2 * 2 + 2;
+
 	if (a1len + a2len > TEXTBUFLEN)
 		buf = palloc(a1len + a2len);
 
@@ -1958,50 +1968,20 @@ pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
 #endif							/* WIN32 */
 
 /*
- * pg_strcoll_libc
+ * strncoll_libc
  *
- * Call strcoll_l() or wcscoll_l() as appropriate for the given locale,
- * platform, and database encoding. If the locale is NULL, use the database
- * collation.
- *
- * Arguments must be encoded in the database encoding and nul-terminated.
+ * An input string length of -1 means that it's NUL-terminated.
  */
 static int
-pg_strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale)
-{
-	int			result;
-
-	Assert(locale->provider == COLLPROVIDER_LIBC);
-#ifdef WIN32
-	if (GetDatabaseEncoding() == PG_UTF8)
-	{
-		size_t		len1 = strlen(arg1);
-		size_t		len2 = strlen(arg2);
-
-		result = pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
-	}
-	else
-#endif							/* WIN32 */
-		result = strcoll_l(arg1, arg2, locale->info.lt);
-
-	return result;
-}
-
-/*
- * pg_strncoll_libc
- *
- * Nul-terminate the arguments and call pg_strcoll_libc().
- */
-static int
-pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
-				 pg_locale_t locale)
+strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
+			  pg_locale_t locale)
 {
 	char		sbuf[TEXTBUFLEN];
 	char	   *buf = sbuf;
-	size_t		bufsize1 = len1 + 1;
-	size_t		bufsize2 = len2 + 1;
-	char	   *arg1n;
-	char	   *arg2n;
+	size_t		bufsize1 = (len1 == -1) ? 0 : len1 + 1;
+	size_t		bufsize2 = (len2 == -1) ? 0 : len2 + 1;
+	const char *arg1n;
+	const char *arg2n;
 	int			result;
 
 	Assert(locale->provider == COLLPROVIDER_LIBC);
@@ -2009,22 +1989,38 @@ pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
 #ifdef WIN32
 	/* check for this case before doing the work for nul-termination */
 	if (GetDatabaseEncoding() == PG_UTF8)
-		return pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
+		return strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
 #endif							/* WIN32 */
 
 	if (bufsize1 + bufsize2 > TEXTBUFLEN)
 		buf = palloc(bufsize1 + bufsize2);
 
-	arg1n = buf;
-	arg2n = buf + bufsize1;
+	/* nul-terminate arguments if necessary */
+	if (len1 == -1)
+	{
+		arg1n = arg1;
+	}
+	else
+	{
+		char *buf1 = buf;
+		memcpy(buf1, arg1, len1);
+		buf1[len1] = '\0';
+		arg1n = buf1;
+	}
 
-	/* nul-terminate arguments */
-	memcpy(arg1n, arg1, len1);
-	arg1n[len1] = '\0';
-	memcpy(arg2n, arg2, len2);
-	arg2n[len2] = '\0';
+	if (len2 == -1)
+	{
+		arg2n = arg2;
+	}
+	else
+	{
+		char *buf2 = buf + bufsize1;
+		memcpy(buf2, arg2, len2);
+		buf2[len2] = '\0';
+		arg2n = buf2;
+	}
 
-	result = pg_strcoll_libc(arg1n, arg2n, locale);
+	result = strcoll_l(arg1n, arg2n, locale->info.lt);
 
 	if (buf != sbuf)
 		pfree(buf);
@@ -2035,7 +2031,7 @@ pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
 #ifdef USE_ICU
 
 /*
- * pg_strncoll_icu_no_utf8
+ * strncoll_icu_no_utf8
  *
  * Convert the arguments from the database encoding to UChar strings, then
  * call ucol_strcoll(). An argument length of -1 means that the string is
@@ -2045,8 +2041,8 @@ pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
  * caller should call that instead.
  */
 static int
-pg_strncoll_icu_no_utf8(const char *arg1, int32_t len1,
-						const char *arg2, int32_t len2, pg_locale_t locale)
+strncoll_icu_no_utf8(const char *arg1, ssize_t len1,
+					 const char *arg2, ssize_t len2, pg_locale_t locale)
 {
 	char		sbuf[TEXTBUFLEN];
 	char	   *buf = sbuf;
@@ -2091,17 +2087,15 @@ pg_strncoll_icu_no_utf8(const char *arg1, int32_t len1,
 }
 
 /*
- * pg_strncoll_icu
+ * strncoll_icu
  *
  * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given
  * database encoding. An argument length of -1 means the string is
  * NUL-terminated.
- *
- * Arguments must be encoded in the database encoding.
  */
 static int
-pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2,
-				pg_locale_t locale)
+strncoll_icu(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
+			 pg_locale_t locale)
 {
 	int			result;
 
@@ -2124,7 +2118,7 @@ pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2,
 	else
 #endif
 	{
-		result = pg_strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale);
+		result = strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale);
 	}
 
 	return result;
@@ -2135,15 +2129,7 @@ pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2,
 /*
  * pg_strcoll
  *
- * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll_l() or wcscoll_l() as
- * appropriate for the given locale, platform, and database encoding. If the
- * locale is not specified, use the database collation.
- *
- * Arguments must be encoded in the database encoding and nul-terminated.
- *
- * The caller is responsible for breaking ties if the collation is
- * deterministic; this maintains consistency with pg_strxfrm(), which cannot
- * easily account for deterministic collations.
+ * Like pg_strncoll for NUL-terminated input strings.
  */
 int
 pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
@@ -2151,10 +2137,10 @@ pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
 	int			result;
 
 	if (locale->provider == COLLPROVIDER_LIBC)
-		result = pg_strcoll_libc(arg1, arg2, locale);
+		result = strncoll_libc(arg1, -1, arg2, -1, locale);
 #ifdef USE_ICU
 	else if (locale->provider == COLLPROVIDER_ICU)
-		result = pg_strncoll_icu(arg1, -1, arg2, -1, locale);
+		result = strncoll_icu(arg1, -1, arg2, -1, locale);
 #endif
 	else
 		/* shouldn't happen */
@@ -2170,27 +2156,24 @@ pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
  * appropriate for the given locale, platform, and database encoding. If the
  * locale is not specified, use the database collation.
  *
- * Arguments must be encoded in the database encoding.
- *
- * This function may need to nul-terminate the arguments for libc functions;
- * so if the caller already has nul-terminated strings, it should call
- * pg_strcoll() instead.
+ * The input strings must be encoded in the database encoding. If an input
+ * string is NUL-terminated, its length may be specified as -1.
  *
  * The caller is responsible for breaking ties if the collation is
  * deterministic; this maintains consistency with pg_strnxfrm(), which cannot
  * easily account for deterministic collations.
  */
 int
-pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2,
+pg_strncoll(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
 			pg_locale_t locale)
 {
 	int			result;
 
 	if (locale->provider == COLLPROVIDER_LIBC)
-		result = pg_strncoll_libc(arg1, len1, arg2, len2, locale);
+		result = strncoll_libc(arg1, len1, arg2, len2, locale);
 #ifdef USE_ICU
 	else if (locale->provider == COLLPROVIDER_ICU)
-		result = pg_strncoll_icu(arg1, len1, arg2, len2, locale);
+		result = strncoll_icu(arg1, len1, arg2, len2, locale);
 #endif
 	else
 		/* shouldn't happen */
@@ -2201,16 +2184,8 @@ pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2,
 
 
 static size_t
-pg_strxfrm_libc(char *dest, const char *src, size_t destsize,
-				pg_locale_t locale)
-{
-	Assert(locale->provider == COLLPROVIDER_LIBC);
-	return strxfrm_l(dest, src, destsize, locale->info.lt);
-}
-
-static size_t
-pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize,
-				 pg_locale_t locale)
+strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
+			  pg_locale_t locale)
 {
 	char		sbuf[TEXTBUFLEN];
 	char	   *buf = sbuf;
@@ -2219,14 +2194,17 @@ pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize,
 
 	Assert(locale->provider == COLLPROVIDER_LIBC);
 
+	if (srclen == -1)
+		return strxfrm_l(dest, src, destsize, locale->info.lt);
+
 	if (bufsize > TEXTBUFLEN)
 		buf = palloc(bufsize);
 
-	/* nul-terminate arguments */
+	/* nul-terminate argument */
 	memcpy(buf, src, srclen);
 	buf[srclen] = '\0';
 
-	result = pg_strxfrm_libc(dest, buf, destsize, locale);
+	result = strxfrm_l(dest, buf, destsize, locale->info.lt);
 
 	if (buf != sbuf)
 		pfree(buf);
@@ -2241,8 +2219,8 @@ pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize,
 
 /* 'srclen' of -1 means the strings are NUL-terminated */
 static size_t
-pg_strnxfrm_icu(char *dest, const char *src, int32_t srclen, int32_t destsize,
-				pg_locale_t locale)
+strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
+			 pg_locale_t locale)
 {
 	char		sbuf[TEXTBUFLEN];
 	char	   *buf = sbuf;
@@ -2288,8 +2266,9 @@ pg_strnxfrm_icu(char *dest, const char *src, int32_t srclen, int32_t destsize,
 
 /* 'srclen' of -1 means the strings are NUL-terminated */
 static size_t
-pg_strnxfrm_prefix_icu_no_utf8(char *dest, const char *src, int32_t srclen,
-							   int32_t destsize, pg_locale_t locale)
+strnxfrm_prefix_icu_no_utf8(char *dest,size_t destsize,
+							const char *src, ssize_t srclen,
+							pg_locale_t locale)
 {
 	char		sbuf[TEXTBUFLEN];
 	char	   *buf = sbuf;
@@ -2336,8 +2315,9 @@ pg_strnxfrm_prefix_icu_no_utf8(char *dest, const char *src, int32_t srclen,
 
 /* 'srclen' of -1 means the strings are NUL-terminated */
 static size_t
-pg_strnxfrm_prefix_icu(char *dest, const char *src, int32_t srclen,
-					   int32_t destsize, pg_locale_t locale)
+strnxfrm_prefix_icu(char *dest, size_t destsize,
+					const char *src, ssize_t srclen,
+					pg_locale_t locale)
 {
 	size_t		result;
 
@@ -2364,8 +2344,8 @@ pg_strnxfrm_prefix_icu(char *dest, const char *src, int32_t srclen,
 							u_errorName(status))));
 	}
 	else
-		result = pg_strnxfrm_prefix_icu_no_utf8(dest, src, srclen, destsize,
-												locale);
+		result = strnxfrm_prefix_icu_no_utf8(dest, destsize, src, srclen,
+											 locale);
 
 	return result;
 }
@@ -2407,20 +2387,7 @@ pg_strxfrm_enabled(pg_locale_t locale)
 /*
  * pg_strxfrm
  *
- * Transforms 'src' to a nul-terminated string stored in 'dest' such that
- * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
- * untransformed strings.
- *
- * The provided 'src' must be nul-terminated. If 'destsize' is zero, 'dest'
- * may be NULL.
- *
- * Not all providers support pg_strxfrm() safely. The caller should check
- * pg_strxfrm_enabled() first, otherwise this function may return wrong
- * results or an error.
- *
- * Returns the number of bytes needed (or more) to store the transformed
- * string, excluding the terminating nul byte. If the value returned is
- * 'destsize' or greater, the resulting contents of 'dest' are undefined.
+ * Like pg_strnxfrm for a NUL-terminated input string.
  */
 size_t
 pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
@@ -2428,10 +2395,10 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
 	size_t		result = 0;		/* keep compiler quiet */
 
 	if (locale->provider == COLLPROVIDER_LIBC)
-		result = pg_strxfrm_libc(dest, src, destsize, locale);
+		result = strnxfrm_libc(dest, destsize, src, -1, locale);
 #ifdef USE_ICU
 	else if (locale->provider == COLLPROVIDER_ICU)
-		result = pg_strnxfrm_icu(dest, src, -1, destsize, locale);
+		result = strnxfrm_icu(dest, destsize, src, -1, locale);
 #endif
 	else
 		/* shouldn't happen */
@@ -2447,8 +2414,9 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
  * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
  * untransformed strings.
  *
- * 'src' does not need to be nul-terminated. If 'destsize' is zero, 'dest' may
- * be NULL.
+ * The input string must be encoded in the database encoding. If the input
+ * string is NUL-terminated, its length may be specified as -1. If 'destsize'
+ * is zero, 'dest' may be NULL.
  *
  * Not all providers support pg_strnxfrm() safely. The caller should check
  * pg_strxfrm_enabled() first, otherwise this function may return wrong
@@ -2457,22 +2425,18 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
  * Returns the number of bytes needed (or more) to store the transformed
  * string, excluding the terminating nul byte. If the value returned is
  * 'destsize' or greater, the resulting contents of 'dest' are undefined.
- *
- * This function may need to nul-terminate the argument for libc functions;
- * so if the caller already has a nul-terminated string, it should call
- * pg_strxfrm() instead.
  */
 size_t
-pg_strnxfrm(char *dest, size_t destsize, const char *src, size_t srclen,
+pg_strnxfrm(char *dest, size_t destsize, const char *src, ssize_t srclen,
 			pg_locale_t locale)
 {
 	size_t		result = 0;		/* keep compiler quiet */
 
 	if (locale->provider == COLLPROVIDER_LIBC)
-		result = pg_strnxfrm_libc(dest, src, srclen, destsize, locale);
+		result = strnxfrm_libc(dest, src, srclen, destsize, locale);
 #ifdef USE_ICU
 	else if (locale->provider == COLLPROVIDER_ICU)
-		result = pg_strnxfrm_icu(dest, src, srclen, destsize, locale);
+		result = strnxfrm_icu(dest, src, srclen, destsize, locale);
 #endif
 	else
 		/* shouldn't happen */
@@ -2502,44 +2466,24 @@ pg_strxfrm_prefix_enabled(pg_locale_t locale)
 /*
  * pg_strxfrm_prefix
  *
- * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
- * memcmp() on the byte sequence is equivalent to pg_strcoll() on
- * untransformed strings. The result is not nul-terminated.
- *
- * The provided 'src' must be nul-terminated.
- *
- * Not all providers support pg_strxfrm_prefix() safely. The caller should
- * check pg_strxfrm_prefix_enabled() first, otherwise this function may return
- * wrong results or an error.
- *
- * If destsize is not large enough to hold the resulting byte sequence, stores
- * only the first destsize bytes in 'dest'. Returns the number of bytes
- * actually copied to 'dest'.
+ * Like pg_strnxfrm_prefix for a NUL-terminated input string.
  */
 size_t
 pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
 				  pg_locale_t locale)
 {
-	size_t		result = 0;		/* keep compiler quiet */
-
-#ifdef USE_ICU
-	if (locale->provider == COLLPROVIDER_ICU)
-		result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale);
-	else
-#endif
-		PGLOCALE_SUPPORT_ERROR(locale->provider);
-
-	return result;
+	return pg_strnxfrm_prefix(dest, destsize, src, -1, locale);
 }
 
 /*
  * pg_strnxfrm_prefix
  *
  * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
- * memcmp() on the byte sequence is equivalent to pg_strcoll() on
+ * memcmp() on the byte sequence is equivalent to pg_strncoll() on
  * untransformed strings. The result is not nul-terminated.
  *
- * The provided 'src' must be nul-terminated.
+ * The input string must be encoded in the database encoding. If the input
+ * string is NUL-terminated, its length may be specified as -1.
  *
  * Not all providers support pg_strnxfrm_prefix() safely. The caller should
  * check pg_strxfrm_prefix_enabled() first, otherwise this function may return
@@ -2548,20 +2492,16 @@ pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
  * If destsize is not large enough to hold the resulting byte sequence, stores
  * only the first destsize bytes in 'dest'. Returns the number of bytes
  * actually copied to 'dest'.
- *
- * This function may need to nul-terminate the argument for libc functions;
- * so if the caller already has a nul-terminated string, it should call
- * pg_strxfrm_prefix() instead.
  */
 size_t
 pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
-				   size_t srclen, pg_locale_t locale)
+				   ssize_t srclen, pg_locale_t locale)
 {
 	size_t		result = 0;		/* keep compiler quiet */
 
 #ifdef USE_ICU
 	if (locale->provider == COLLPROVIDER_ICU)
-		result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale);
+		result = strnxfrm_prefix_icu(dest, src, -1, destsize, locale);
 	else
 #endif
 		PGLOCALE_SUPPORT_ERROR(locale->provider);
@@ -2744,6 +2684,8 @@ init_icu_converter(void)
 
 /*
  * Find length, in UChars, of given string if converted to UChar string.
+ *
+ * A length of -1 indicates that the input string is NUL-terminated.
  */
 static size_t
 uchar_length(UConverter *converter, const char *str, int32_t len)
@@ -2761,6 +2703,8 @@ uchar_length(UConverter *converter, const char *str, int32_t len)
 /*
  * Convert the given source string into a UChar string, stored in dest, and
  * return the length (in UChars).
+ *
+ * A srclen of -1 indicates that the input string is NUL-terminated.
  */
 static int32_t
 uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index c2d95411e0a..3b443df8014 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -109,18 +109,18 @@ extern pg_locale_t pg_newlocale_from_collation(Oid collid);
 
 extern char *get_collation_actual_version(char collprovider, const char *collcollate);
 extern int	pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale);
-extern int	pg_strncoll(const char *arg1, size_t len1,
-						const char *arg2, size_t len2, pg_locale_t locale);
+extern int	pg_strncoll(const char *arg1, ssize_t len1,
+						const char *arg2, ssize_t len2, pg_locale_t locale);
 extern bool pg_strxfrm_enabled(pg_locale_t locale);
 extern size_t pg_strxfrm(char *dest, const char *src, size_t destsize,
 						 pg_locale_t locale);
 extern size_t pg_strnxfrm(char *dest, size_t destsize, const char *src,
-						  size_t srclen, pg_locale_t locale);
+						  ssize_t srclen, pg_locale_t locale);
 extern bool pg_strxfrm_prefix_enabled(pg_locale_t locale);
 extern size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
 								pg_locale_t locale);
 extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
-								 size_t srclen, pg_locale_t locale);
+								 ssize_t srclen, pg_locale_t locale);
 
 extern int	builtin_locale_encoding(const char *locale);
 extern const char *builtin_validate_locale(int encoding, const char *locale);
-- 
2.34.1

v4-0005-invalidation.patchtext/x-patch; charset=UTF-8; name=v4-0005-invalidation.patchDownload

From 2f51247615a36dc257b700c2832f3d4aa32fce64 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Wed, 18 Sep 2024 17:49:57 -0700
Subject: [PATCH v4 5/7] invalidation

---
 src/backend/utils/adt/pg_locale.c | 41 +++++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 9d1d71f1561..c89ac3b9e01 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -63,6 +63,7 @@
 #include "utils/builtins.h"
 #include "utils/formatting.h"
 #include "utils/guc_hooks.h"
+#include "utils/inval.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/pg_locale.h"
@@ -1695,6 +1696,34 @@ create_pg_locale(MemoryContext context, ResourceOwner owner, Oid collid)
 	return result;
 }
 
+static void
+CollationCacheInvalidate(Datum arg, int cacheid, uint32 hashvalue)
+{
+	last_collation_cache_oid = InvalidOid;
+
+	if (CollationCache == NULL)
+		return;
+
+	ResourceOwnerRelease(CollationCacheOwner,
+						 RESOURCE_RELEASE_BEFORE_LOCKS,
+						 false, true);
+	ResourceOwnerRelease(CollationCacheOwner,
+						 RESOURCE_RELEASE_LOCKS,
+						 false, true);
+	ResourceOwnerRelease(CollationCacheOwner,
+						 RESOURCE_RELEASE_AFTER_LOCKS,
+						 false, true);
+	ResourceOwnerDelete(CollationCacheOwner);
+	CollationCacheOwner = ResourceOwnerCreate(NULL, "collation cache");
+
+	MemoryContextReset(CollationCacheContext);
+
+	/* free all memory and reset hash table */
+	CollationCache = collation_cache_create(CollationCacheContext,
+											16, NULL);
+}
+
+
 /*
  * Create or retrieve a pg_locale_t for the given collation OID.  Results are
  * cached for the lifetime of the backend.
@@ -1714,14 +1743,7 @@ pg_newlocale_from_collation(Oid collid)
 	if (last_collation_cache_oid == collid)
 		return last_collation_cache_locale;
 
-	/*
-	 * Cache mechanism for collation information.
-	 *
-	 * Note that we currently lack any way to flush the cache.  Since we don't
-	 * support ALTER COLLATION, this is OK.  The worst case is that someone
-	 * drops a collation, and a useless cache entry hangs around in existing
-	 * backends.
-	 */
+	/* cache mechanism for collation information */
 	if (CollationCache == NULL)
 	{
 		CollationCacheOwner = ResourceOwnerCreate(NULL, "collation cache");
@@ -1730,6 +1752,9 @@ pg_newlocale_from_collation(Oid collid)
 													  ALLOCSET_DEFAULT_SIZES);
 		CollationCache = collation_cache_create(CollationCacheContext,
 												16, NULL);
+		CacheRegisterSyscacheCallback(COLLOID,
+									  CollationCacheInvalidate,
+									  (Datum) 0);
 	}
 
 	cache_entry = collation_cache_insert(CollationCache, collid, &found);
-- 
2.34.1

v4-0004-resource-owners.patchtext/x-patch; charset=UTF-8; name=v4-0004-resource-owners.patchDownload

From 5ae3b1be6489617a1639141749c31d2f4419a676 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Wed, 18 Sep 2024 16:55:42 -0700
Subject: [PATCH v4 4/7] resource owners

---
 src/backend/utils/adt/pg_locale.c | 74 ++++++++++++++++++++++++++++++-
 1 file changed, 72 insertions(+), 2 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index d3d9c3920e6..9d1d71f1561 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -66,6 +66,7 @@
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/pg_locale.h"
+#include "utils/resowner.h"
 #include "utils/syscache.h"
 
 #ifdef USE_ICU
@@ -148,6 +149,12 @@ typedef struct
 #define SH_DEFINE
 #include "lib/simplehash.h"
 
+/*
+ * Collator objects (UCollator for ICU or locale_t for libc) are allocated in
+ * an external library, so track them using a resource owner.
+ */
+static ResourceOwner CollationCacheOwner = NULL;
+
 static MemoryContext CollationCacheContext = NULL;
 static collation_cache_hash *CollationCache = NULL;
 
@@ -179,8 +186,35 @@ static int32_t uchar_convert(UConverter *converter,
 							 const char *src, int32_t srclen);
 static void icu_set_collation_attributes(UCollator *collator, const char *loc,
 										 UErrorCode *status);
+
+static void ResourceOwnerRememberUCollator(ResourceOwner owner,
+										   UCollator *collator);
+static void ResOwnerReleaseUCollator(Datum val);
+
+static const ResourceOwnerDesc UCollatorResourceKind =
+{
+	.name = "UCollator reference",
+	.release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
+	.release_priority = RELEASE_PRIO_LAST,
+	.ReleaseResource = ResOwnerReleaseUCollator,
+	.DebugPrint = NULL                      /* the default message is fine */
+};
 #endif
 
+static void ResourceOwnerRememberLocaleT(ResourceOwner owner,
+										 locale_t locale);
+static void ResOwnerReleaseLocaleT(Datum val);
+
+static const ResourceOwnerDesc LocaleTResourceKind =
+{
+	.name = "locale_t reference",
+	.release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
+	.release_priority = RELEASE_PRIO_LAST,
+	.ReleaseResource = ResOwnerReleaseLocaleT,
+	.DebugPrint = NULL                      /* the default message is fine */
+};
+
+
 /*
  * POSIX doesn't define _l-variants of these functions, but several systems
  * have them.  We provide our own replacements here.
@@ -1257,6 +1291,20 @@ report_newlocale_failure(const char *localename)
 						localename) : 0)));
 }
 
+static void
+ResourceOwnerRememberLocaleT(ResourceOwner owner, locale_t locale)
+{
+	ResourceOwnerRemember(owner, PointerGetDatum(locale),
+						  &LocaleTResourceKind);
+}
+
+static void
+ResOwnerReleaseLocaleT(Datum val)
+{
+	locale_t locale = (locale_t) DatumGetPointer(val);
+	freelocale(locale);
+}
+
 /*
  * Create a locale_t with the given collation and ctype.
  *
@@ -1335,6 +1383,20 @@ make_libc_collator(const char *collate, const char *ctype)
  * Ensure that no path leaks a UCollator.
  */
 #ifdef USE_ICU
+static void
+ResourceOwnerRememberUCollator(ResourceOwner owner, UCollator *collator)
+{
+	ResourceOwnerRemember(owner, PointerGetDatum(collator),
+						  &UCollatorResourceKind);
+}
+
+static void
+ResOwnerReleaseUCollator(Datum val)
+{
+	UCollator *collator = (UCollator *) DatumGetPointer(val);
+	ucol_close(collator);
+}
+
 static UCollator *
 make_icu_collator(const char *iculocstr, const char *icurules)
 {
@@ -1495,7 +1557,7 @@ init_database_collation(void)
  * allocating memory.
  */
 static pg_locale_t
-create_pg_locale(MemoryContext context, Oid collid)
+create_pg_locale(MemoryContext context, ResourceOwner owner, Oid collid)
 {
 	/* We haven't computed this yet in this session, so do it */
 	HeapTuple	tp;
@@ -1582,7 +1644,10 @@ create_pg_locale(MemoryContext context, Oid collid)
 		datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype);
 		collctype = TextDatumGetCString(datum);
 
+		ResourceOwnerEnlarge(owner);
 		locale = make_libc_collator(collcollate, collctype);
+		if (locale)
+			ResourceOwnerRememberLocaleT(owner, locale);
 
 		result = MemoryContextAllocZero(context,
 										sizeof(struct pg_locale_struct));
@@ -1610,7 +1675,9 @@ create_pg_locale(MemoryContext context, Oid collid)
 		else
 			icurules = NULL;
 
+		ResourceOwnerEnlarge(owner);
 		collator = make_icu_collator(iculocstr, icurules);
+		ResourceOwnerRememberUCollator(owner, collator);
 
 		result = MemoryContextAllocZero(context,
 										sizeof(struct pg_locale_struct));
@@ -1657,6 +1724,7 @@ pg_newlocale_from_collation(Oid collid)
 	 */
 	if (CollationCache == NULL)
 	{
+		CollationCacheOwner = ResourceOwnerCreate(NULL, "collation cache");
 		CollationCacheContext = AllocSetContextCreate(TopMemoryContext,
 													  "collation cache",
 													  ALLOCSET_DEFAULT_SIZES);
@@ -1675,7 +1743,9 @@ pg_newlocale_from_collation(Oid collid)
 	}
 
 	if (cache_entry->locale == 0)
-		cache_entry->locale = create_pg_locale(CollationCacheContext, collid);
+		cache_entry->locale = create_pg_locale(CollationCacheContext,
+											   CollationCacheOwner,
+											   collid);
 
 	last_collation_cache_oid = collid;
 	last_collation_cache_locale = cache_entry->locale;
-- 
2.34.1

v4-0003-CollationCacheContext.patchtext/x-patch; charset=UTF-8; name=v4-0003-CollationCacheContext.patchDownload

From fca0efa184971f9780b356039aa3ed08a7445524 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Wed, 18 Sep 2024 15:55:37 -0700
Subject: [PATCH v4 3/7] CollationCacheContext

---
 src/backend/utils/adt/pg_locale.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 1dec00b55ed..d3d9c3920e6 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1495,7 +1495,7 @@ init_database_collation(void)
  * allocating memory.
  */
 static pg_locale_t
-create_pg_locale(Oid collid)
+create_pg_locale(MemoryContext context, Oid collid)
 {
 	/* We haven't computed this yet in this session, so do it */
 	HeapTuple	tp;
@@ -1561,15 +1561,15 @@ create_pg_locale(Oid collid)
 
 		builtin_validate_locale(GetDatabaseEncoding(), locstr);
 
-		result = MemoryContextAllocZero(TopMemoryContext,
+		result = MemoryContextAllocZero(context,
 										sizeof(struct pg_locale_struct));
 
 		result->provider = collform->collprovider;
 		result->deterministic = collform->collisdeterministic;
 		result->collate_is_c = true;
 		result->ctype_is_c = (strcmp(locstr, "C") == 0);
-		result->info.builtin.locale = MemoryContextStrdup(TopMemoryContext,
-														 locstr);
+		result->info.builtin.locale = MemoryContextStrdup(context,
+														  locstr);
 	}
 	else if (collform->collprovider == COLLPROVIDER_LIBC)
 	{
@@ -1584,7 +1584,7 @@ create_pg_locale(Oid collid)
 
 		locale = make_libc_collator(collcollate, collctype);
 
-		result = MemoryContextAllocZero(TopMemoryContext,
+		result = MemoryContextAllocZero(context,
 										sizeof(struct pg_locale_struct));
 
 		result->provider = collform->collprovider;
@@ -1612,14 +1612,14 @@ create_pg_locale(Oid collid)
 
 		collator = make_icu_collator(iculocstr, icurules);
 
-		result = MemoryContextAllocZero(TopMemoryContext,
+		result = MemoryContextAllocZero(context,
 										sizeof(struct pg_locale_struct));
 
 		result->provider = collform->collprovider;
 		result->deterministic = collform->collisdeterministic;
 		result->collate_is_c = false;
 		result->ctype_is_c = false;
-		result->info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr);
+		result->info.icu.locale = MemoryContextStrdup(context, iculocstr);
 		result->info.icu.ucol = collator;
 	}
 
@@ -1675,7 +1675,7 @@ pg_newlocale_from_collation(Oid collid)
 	}
 
 	if (cache_entry->locale == 0)
-		cache_entry->locale = create_pg_locale(collid);
+		cache_entry->locale = create_pg_locale(CollationCacheContext, collid);
 
 	last_collation_cache_oid = collid;
 	last_collation_cache_locale = cache_entry->locale;
-- 
2.34.1

v4-0002-create_pg_locale.patchtext/x-patch; charset=UTF-8; name=v4-0002-create_pg_locale.patchDownload

From eccc4a4a83069c6a14465b4a9239a4d759aaa2a8 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Wed, 18 Sep 2024 15:53:56 -0700
Subject: [PATCH v4 2/7] create_pg_locale

---
 src/backend/utils/adt/pg_locale.c | 310 +++++++++++++++---------------
 1 file changed, 155 insertions(+), 155 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 12ba5726f77..1dec00b55ed 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1227,45 +1227,6 @@ IsoLocaleName(const char *winlocname)
 #endif							/* WIN32 && LC_MESSAGES */
 
 
-/*
- * Cache mechanism for collation information.
- *
- * Note that we currently lack any way to flush the cache.  Since we don't
- * support ALTER COLLATION, this is OK.  The worst case is that someone
- * drops a collation, and a useless cache entry hangs around in existing
- * backends.
- */
-static collation_cache_entry *
-lookup_collation_cache(Oid collation)
-{
-	collation_cache_entry *cache_entry;
-	bool		found;
-
-	Assert(OidIsValid(collation));
-	Assert(collation != DEFAULT_COLLATION_OID);
-
-	if (CollationCache == NULL)
-	{
-		CollationCacheContext = AllocSetContextCreate(TopMemoryContext,
-													  "collation cache",
-													  ALLOCSET_DEFAULT_SIZES);
-		CollationCache = collation_cache_create(CollationCacheContext,
-												16, NULL);
-	}
-
-	cache_entry = collation_cache_insert(CollationCache, collation, &found);
-	if (!found)
-	{
-		/*
-		 * Make sure cache entry is marked invalid, in case we fail before
-		 * setting things.
-		 */
-		cache_entry->locale = 0;
-	}
-
-	return cache_entry;
-}
-
 /* simple subroutine for reporting errors from newlocale() */
 static void
 report_newlocale_failure(const char *localename)
@@ -1530,153 +1491,192 @@ init_database_collation(void)
 }
 
 /*
- * Create a pg_locale_t from a collation OID.  Results are cached for the
- * lifetime of the backend.  Thus, do not free the result with freelocale().
- *
- * For simplicity, we always generate COLLATE + CTYPE even though we
- * might only need one of them.  Since this is called only once per session,
- * it shouldn't cost much.
+ * Create and initialize a pg_locale_t.  Be careful to check for errors before
+ * allocating memory.
  */
-pg_locale_t
-pg_newlocale_from_collation(Oid collid)
+static pg_locale_t
+create_pg_locale(Oid collid)
 {
-	collation_cache_entry *cache_entry;
-
-	if (collid == DEFAULT_COLLATION_OID)
-		return &default_locale;
+	/* We haven't computed this yet in this session, so do it */
+	HeapTuple	tp;
+	Form_pg_collation collform;
+	pg_locale_t result;
+	Datum		datum;
+	bool		isnull;
 
-	if (!OidIsValid(collid))
+	tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
+	if (!HeapTupleIsValid(tp))
 		elog(ERROR, "cache lookup failed for collation %u", collid);
+	collform = (Form_pg_collation) GETSTRUCT(tp);
 
-	if (last_collation_cache_oid == collid)
-		return last_collation_cache_locale;
-
-	cache_entry = lookup_collation_cache(collid);
-
-	if (cache_entry->locale == 0)
+	/* compare version in catalog to version from provider */
+	datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion,
+							&isnull);
+	if (!isnull)
 	{
-		/* We haven't computed this yet in this session, so do it */
-		HeapTuple	tp;
-		Form_pg_collation collform;
-		struct pg_locale_struct result;
-		pg_locale_t resultp;
-		Datum		datum;
-		bool		isnull;
+		char	   *actual_versionstr;
+		char	   *collversionstr;
 
-		tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
-		if (!HeapTupleIsValid(tp))
-			elog(ERROR, "cache lookup failed for collation %u", collid);
-		collform = (Form_pg_collation) GETSTRUCT(tp);
+		collversionstr = TextDatumGetCString(datum);
 
-		/* We'll fill in the result struct locally before allocating memory */
-		memset(&result, 0, sizeof(result));
-		result.provider = collform->collprovider;
-		result.deterministic = collform->collisdeterministic;
+		if (collform->collprovider == COLLPROVIDER_LIBC)
+			datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
+		else
+			datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
 
-		if (collform->collprovider == COLLPROVIDER_BUILTIN)
+		actual_versionstr = get_collation_actual_version(collform->collprovider,
+														 TextDatumGetCString(datum));
+		if (!actual_versionstr)
 		{
-			const char *locstr;
+			/*
+			 * This could happen when specifying a version in CREATE
+			 * COLLATION but the provider does not support versioning, or
+			 * manually creating a mess in the catalogs.
+			 */
+			ereport(ERROR,
+					(errmsg("collation \"%s\" has no actual version, but a version was recorded",
+							NameStr(collform->collname))));
+		}
 
-			datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
-			locstr = TextDatumGetCString(datum);
+		if (strcmp(actual_versionstr, collversionstr) != 0)
+			ereport(WARNING,
+					(errmsg("collation \"%s\" has version mismatch",
+							NameStr(collform->collname)),
+					 errdetail("The collation in the database was created using version %s, "
+							   "but the operating system provides version %s.",
+							   collversionstr, actual_versionstr),
+					 errhint("Rebuild all objects affected by this collation and run "
+							 "ALTER COLLATION %s REFRESH VERSION, "
+							 "or build PostgreSQL with the right library version.",
+							 quote_qualified_identifier(get_namespace_name(collform->collnamespace),
+														NameStr(collform->collname)))));
+	}
 
-			result.collate_is_c = true;
-			result.ctype_is_c = (strcmp(locstr, "C") == 0);
+	if (collform->collprovider == COLLPROVIDER_BUILTIN)
+	{
+		const char *locstr;
 
-			builtin_validate_locale(GetDatabaseEncoding(), locstr);
+		datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
+		locstr = TextDatumGetCString(datum);
 
-			result.info.builtin.locale = MemoryContextStrdup(TopMemoryContext,
-															 locstr);
-		}
-		else if (collform->collprovider == COLLPROVIDER_LIBC)
-		{
-			const char *collcollate;
-			const char *collctype;
+		builtin_validate_locale(GetDatabaseEncoding(), locstr);
 
-			datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
-			collcollate = TextDatumGetCString(datum);
-			datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype);
-			collctype = TextDatumGetCString(datum);
+		result = MemoryContextAllocZero(TopMemoryContext,
+										sizeof(struct pg_locale_struct));
 
-			result.collate_is_c = (strcmp(collcollate, "C") == 0) ||
-				(strcmp(collcollate, "POSIX") == 0);
-			result.ctype_is_c = (strcmp(collctype, "C") == 0) ||
-				(strcmp(collctype, "POSIX") == 0);
+		result->provider = collform->collprovider;
+		result->deterministic = collform->collisdeterministic;
+		result->collate_is_c = true;
+		result->ctype_is_c = (strcmp(locstr, "C") == 0);
+		result->info.builtin.locale = MemoryContextStrdup(TopMemoryContext,
+														 locstr);
+	}
+	else if (collform->collprovider == COLLPROVIDER_LIBC)
+	{
+		const char *collcollate;
+		const char *collctype;
+		locale_t	locale;
+
+		datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
+		collcollate = TextDatumGetCString(datum);
+		datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype);
+		collctype = TextDatumGetCString(datum);
+
+		locale = make_libc_collator(collcollate, collctype);
+
+		result = MemoryContextAllocZero(TopMemoryContext,
+										sizeof(struct pg_locale_struct));
+
+		result->provider = collform->collprovider;
+		result->deterministic = collform->collisdeterministic;
+		result->collate_is_c = (strcmp(collcollate, "C") == 0) ||
+			(strcmp(collcollate, "POSIX") == 0);
+		result->ctype_is_c = (strcmp(collctype, "C") == 0) ||
+			(strcmp(collctype, "POSIX") == 0);
+		result->info.lt = locale;
+	}
+	else if (collform->collprovider == COLLPROVIDER_ICU)
+	{
+		const char *iculocstr;
+		const char *icurules;
+		UCollator  *collator;
 
-			result.info.lt = make_libc_collator(collcollate, collctype);
-		}
-		else if (collform->collprovider == COLLPROVIDER_ICU)
-		{
-			const char *iculocstr;
-			const char *icurules;
+		datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
+		iculocstr = TextDatumGetCString(datum);
 
-			datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
-			iculocstr = TextDatumGetCString(datum);
+		datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull);
+		if (!isnull)
+			icurules = TextDatumGetCString(datum);
+		else
+			icurules = NULL;
 
-			result.collate_is_c = false;
-			result.ctype_is_c = false;
+		collator = make_icu_collator(iculocstr, icurules);
 
-			datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull);
-			if (!isnull)
-				icurules = TextDatumGetCString(datum);
-			else
-				icurules = NULL;
+		result = MemoryContextAllocZero(TopMemoryContext,
+										sizeof(struct pg_locale_struct));
 
-			result.info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr);
-			result.info.icu.ucol = make_icu_collator(iculocstr, icurules);
-		}
+		result->provider = collform->collprovider;
+		result->deterministic = collform->collisdeterministic;
+		result->collate_is_c = false;
+		result->ctype_is_c = false;
+		result->info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr);
+		result->info.icu.ucol = collator;
+	}
 
-		datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion,
-								&isnull);
-		if (!isnull)
-		{
-			char	   *actual_versionstr;
-			char	   *collversionstr;
+	ReleaseSysCache(tp);
 
-			collversionstr = TextDatumGetCString(datum);
+	return result;
+}
 
-			if (collform->collprovider == COLLPROVIDER_LIBC)
-				datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
-			else
-				datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
+/*
+ * Create or retrieve a pg_locale_t for the given collation OID.  Results are
+ * cached for the lifetime of the backend.
+ */
+pg_locale_t
+pg_newlocale_from_collation(Oid collid)
+{
+	collation_cache_entry	*cache_entry;
+	bool					 found;
 
-			actual_versionstr = get_collation_actual_version(collform->collprovider,
-															 TextDatumGetCString(datum));
-			if (!actual_versionstr)
-			{
-				/*
-				 * This could happen when specifying a version in CREATE
-				 * COLLATION but the provider does not support versioning, or
-				 * manually creating a mess in the catalogs.
-				 */
-				ereport(ERROR,
-						(errmsg("collation \"%s\" has no actual version, but a version was recorded",
-								NameStr(collform->collname))));
-			}
+	if (collid == DEFAULT_COLLATION_OID)
+		return &default_locale;
 
-			if (strcmp(actual_versionstr, collversionstr) != 0)
-				ereport(WARNING,
-						(errmsg("collation \"%s\" has version mismatch",
-								NameStr(collform->collname)),
-						 errdetail("The collation in the database was created using version %s, "
-								   "but the operating system provides version %s.",
-								   collversionstr, actual_versionstr),
-						 errhint("Rebuild all objects affected by this collation and run "
-								 "ALTER COLLATION %s REFRESH VERSION, "
-								 "or build PostgreSQL with the right library version.",
-								 quote_qualified_identifier(get_namespace_name(collform->collnamespace),
-															NameStr(collform->collname)))));
-		}
+	if (!OidIsValid(collid))
+		elog(ERROR, "cache lookup failed for collation %u", collid);
 
-		ReleaseSysCache(tp);
+	if (last_collation_cache_oid == collid)
+		return last_collation_cache_locale;
 
-		/* We'll keep the pg_locale_t structures in TopMemoryContext */
-		resultp = MemoryContextAlloc(TopMemoryContext, sizeof(*resultp));
-		*resultp = result;
+	/*
+	 * Cache mechanism for collation information.
+	 *
+	 * Note that we currently lack any way to flush the cache.  Since we don't
+	 * support ALTER COLLATION, this is OK.  The worst case is that someone
+	 * drops a collation, and a useless cache entry hangs around in existing
+	 * backends.
+	 */
+	if (CollationCache == NULL)
+	{
+		CollationCacheContext = AllocSetContextCreate(TopMemoryContext,
+													  "collation cache",
+													  ALLOCSET_DEFAULT_SIZES);
+		CollationCache = collation_cache_create(CollationCacheContext,
+												16, NULL);
+	}
 
-		cache_entry->locale = resultp;
+	cache_entry = collation_cache_insert(CollationCache, collid, &found);
+	if (!found)
+	{
+		/*
+		 * Make sure cache entry is marked invalid, in case we fail before
+		 * setting things.
+		 */
+		cache_entry->locale = 0;
 	}
 
+	if (cache_entry->locale == 0)
+		cache_entry->locale = create_pg_locale(collid);
+
 	last_collation_cache_oid = collid;
 	last_collation_cache_locale = cache_entry->locale;
 
-- 
2.34.1