Refactor: allow pg_strncoll(), etc., to accept -1 length for NUL-terminated cstrings.
Like ICU, allow -1 length to mean that the input string is NUL-
terminated for pg_strncoll(), pg_strnxfrm(), and pg_strnxfrm_prefix().
This simplifies the API and code a bit.
Along with some other refactoring in this area, we are getting close to
the point where the collation provider can just be a table of methods,
which means we can add an extension hook to provide a different method
table. That still requires more work; I'm just mentioning it here for
context.
Regards,
Jeff Davis
Attachments:
v1-0001-Allow-length-1-for-NUL-terminated-input-to-pg_str.patch (text/x-patch; charset=UTF-8; name=v1-0001-Allow-length-1-for-NUL-terminated-input-to-pg_str.patch) [Download]
From 6f0c0a9e05039cd295c6c090b3d98d381244b35c Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Wed, 21 Aug 2024 10:59:28 -0700
Subject: [PATCH v1] Allow length=-1 for NUL-terminated input to pg_strncoll(),
etc.
Like ICU, allow a length of -1 to be specified for NUL-terminated
arguments to pg_strncoll(), pg_strnxfrm(), and pg_strnxfrm_prefix().
Simplifies the code and comments.
---
src/backend/utils/adt/pg_locale.c | 186 ++++++++++--------------------
1 file changed, 64 insertions(+), 122 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 48b7e16d81b..26b0f4577f0 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1809,6 +1809,8 @@ get_collation_actual_version(char collprovider, const char *collcollate)
*
* Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
* invoke wcscoll_l().
+ *
+ * An input string length of -1 means that it's NUL-terminated.
*/
#ifdef WIN32
static int
@@ -1819,8 +1821,8 @@ pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
char *buf = sbuf;
char *a1p,
*a2p;
- int a1len = len1 * 2 + 2;
- int a2len = len2 * 2 + 2;
+ int a1len;
+ int a2len;
int r;
int result;
@@ -1830,6 +1832,14 @@ pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
Assert(false);
#endif
+ if (len1 == -1)
+ len1 = strlen(arg1);
+ if (len2 == -1)
+ len2 = strlen(arg2);
+
+ a1len = len1 * 2 + 2;
+ a2len = len2 * 2 + 2;
+
if (a1len + a2len > TEXTBUFLEN)
buf = palloc(a1len + a2len);
@@ -1876,40 +1886,10 @@ pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
}
#endif /* WIN32 */
-/*
- * pg_strcoll_libc
- *
- * Call strcoll_l() or wcscoll_l() as appropriate for the given locale,
- * platform, and database encoding. If the locale is NULL, use the database
- * collation.
- *
- * Arguments must be encoded in the database encoding and nul-terminated.
- */
-static int
-pg_strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale)
-{
- int result;
-
- Assert(locale->provider == COLLPROVIDER_LIBC);
-#ifdef WIN32
- if (GetDatabaseEncoding() == PG_UTF8)
- {
- size_t len1 = strlen(arg1);
- size_t len2 = strlen(arg2);
-
- result = pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
- }
- else
-#endif /* WIN32 */
- result = strcoll_l(arg1, arg2, locale->info.lt);
-
- return result;
-}
-
/*
* pg_strncoll_libc
*
- * Nul-terminate the arguments and call pg_strcoll_libc().
+ * An input string length of -1 means that it's NUL-terminated.
*/
static int
pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
@@ -1917,10 +1897,10 @@ pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
{
char sbuf[TEXTBUFLEN];
char *buf = sbuf;
- size_t bufsize1 = len1 + 1;
- size_t bufsize2 = len2 + 1;
- char *arg1n;
- char *arg2n;
+ size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1;
+ size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1;
+ const char *arg1n;
+ const char *arg2n;
int result;
Assert(locale->provider == COLLPROVIDER_LIBC);
@@ -1934,16 +1914,32 @@ pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
if (bufsize1 + bufsize2 > TEXTBUFLEN)
buf = palloc(bufsize1 + bufsize2);
- arg1n = buf;
- arg2n = buf + bufsize1;
+ /* nul-terminate arguments if necessary */
+ if (len1 == -1)
+ {
+ arg1n = arg1;
+ }
+ else
+ {
+ char *buf1 = buf;
+ memcpy(buf1, arg1, len1);
+ buf1[len1] = '\0';
+ arg1n = buf1;
+ }
- /* nul-terminate arguments */
- memcpy(arg1n, arg1, len1);
- arg1n[len1] = '\0';
- memcpy(arg2n, arg2, len2);
- arg2n[len2] = '\0';
+ if (len2 == -1)
+ {
+ arg2n = arg2;
+ }
+ else
+ {
+ char *buf2 = buf + bufsize1;
+ memcpy(buf2, arg2, len2);
+ buf2[len2] = '\0';
+ arg2n = buf2;
+ }
- result = pg_strcoll_libc(arg1n, arg2n, locale);
+ result = strcoll_l(arg1n, arg2n, locale->info.lt);
if (buf != sbuf)
pfree(buf);
@@ -2015,8 +2011,6 @@ pg_strncoll_icu_no_utf8(const char *arg1, int32_t len1,
* Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given
* database encoding. An argument length of -1 means the string is
* NUL-terminated.
- *
- * Arguments must be encoded in the database encoding.
*/
static int
pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2,
@@ -2054,15 +2048,7 @@ pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2,
/*
* pg_strcoll
*
- * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll_l() or wcscoll_l() as
- * appropriate for the given locale, platform, and database encoding. If the
- * locale is not specified, use the database collation.
- *
- * Arguments must be encoded in the database encoding and nul-terminated.
- *
- * The caller is responsible for breaking ties if the collation is
- * deterministic; this maintains consistency with pg_strxfrm(), which cannot
- * easily account for deterministic collations.
+ * Like pg_strncoll for NUL-terminated input strings.
*/
int
pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
@@ -2070,7 +2056,7 @@ pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
int result;
if (locale->provider == COLLPROVIDER_LIBC)
- result = pg_strcoll_libc(arg1, arg2, locale);
+ result = pg_strncoll_libc(arg1, -1, arg2, -1, locale);
#ifdef USE_ICU
else if (locale->provider == COLLPROVIDER_ICU)
result = pg_strncoll_icu(arg1, -1, arg2, -1, locale);
@@ -2089,11 +2075,8 @@ pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
* appropriate for the given locale, platform, and database encoding. If the
* locale is not specified, use the database collation.
*
- * Arguments must be encoded in the database encoding.
- *
- * This function may need to nul-terminate the arguments for libc functions;
- * so if the caller already has nul-terminated strings, it should call
- * pg_strcoll() instead.
+ * The input strings must be encoded in the database encoding. If an input
+ * string is NUL-terminated, its length may be specified as -1.
*
* The caller is responsible for breaking ties if the collation is
* deterministic; this maintains consistency with pg_strnxfrm(), which cannot
@@ -2119,14 +2102,6 @@ pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2,
}
-static size_t
-pg_strxfrm_libc(char *dest, const char *src, size_t destsize,
- pg_locale_t locale)
-{
- Assert(locale->provider == COLLPROVIDER_LIBC);
- return strxfrm_l(dest, src, destsize, locale->info.lt);
-}
-
static size_t
pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize,
pg_locale_t locale)
@@ -2138,14 +2113,17 @@ pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize,
Assert(locale->provider == COLLPROVIDER_LIBC);
+ if (srclen == -1)
+ return strxfrm_l(dest, src, destsize, locale->info.lt);
+
if (bufsize > TEXTBUFLEN)
buf = palloc(bufsize);
- /* nul-terminate arguments */
+ /* nul-terminate argument */
memcpy(buf, src, srclen);
buf[srclen] = '\0';
- result = pg_strxfrm_libc(dest, buf, destsize, locale);
+ result = strxfrm_l(dest, buf, destsize, locale->info.lt);
if (buf != sbuf)
pfree(buf);
@@ -2326,20 +2304,7 @@ pg_strxfrm_enabled(pg_locale_t locale)
/*
* pg_strxfrm
*
- * Transforms 'src' to a nul-terminated string stored in 'dest' such that
- * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
- * untransformed strings.
- *
- * The provided 'src' must be nul-terminated. If 'destsize' is zero, 'dest'
- * may be NULL.
- *
- * Not all providers support pg_strxfrm() safely. The caller should check
- * pg_strxfrm_enabled() first, otherwise this function may return wrong
- * results or an error.
- *
- * Returns the number of bytes needed (or more) to store the transformed
- * string, excluding the terminating nul byte. If the value returned is
- * 'destsize' or greater, the resulting contents of 'dest' are undefined.
+ * Like pg_strnxfrm for a NUL-terminated input string.
*/
size_t
pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
@@ -2347,7 +2312,7 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
size_t result = 0; /* keep compiler quiet */
if (locale->provider == COLLPROVIDER_LIBC)
- result = pg_strxfrm_libc(dest, src, destsize, locale);
+ result = pg_strnxfrm_libc(dest, src, -1, destsize, locale);
#ifdef USE_ICU
else if (locale->provider == COLLPROVIDER_ICU)
result = pg_strnxfrm_icu(dest, src, -1, destsize, locale);
@@ -2366,8 +2331,9 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
* ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
* untransformed strings.
*
- * 'src' does not need to be nul-terminated. If 'destsize' is zero, 'dest' may
- * be NULL.
+ * The input string must be encoded in the database encoding. If the input
+ * string is NUL-terminated, its length may be specified as -1. If 'destsize'
+ * is zero, 'dest' may be NULL.
*
* Not all providers support pg_strnxfrm() safely. The caller should check
* pg_strxfrm_enabled() first, otherwise this function may return wrong
@@ -2376,10 +2342,6 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
* Returns the number of bytes needed (or more) to store the transformed
* string, excluding the terminating nul byte. If the value returned is
* 'destsize' or greater, the resulting contents of 'dest' are undefined.
- *
- * This function may need to nul-terminate the argument for libc functions;
- * so if the caller already has a nul-terminated string, it should call
- * pg_strxfrm() instead.
*/
size_t
pg_strnxfrm(char *dest, size_t destsize, const char *src, size_t srclen,
@@ -2421,44 +2383,24 @@ pg_strxfrm_prefix_enabled(pg_locale_t locale)
/*
* pg_strxfrm_prefix
*
- * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
- * memcmp() on the byte sequence is equivalent to pg_strcoll() on
- * untransformed strings. The result is not nul-terminated.
- *
- * The provided 'src' must be nul-terminated.
- *
- * Not all providers support pg_strxfrm_prefix() safely. The caller should
- * check pg_strxfrm_prefix_enabled() first, otherwise this function may return
- * wrong results or an error.
- *
- * If destsize is not large enough to hold the resulting byte sequence, stores
- * only the first destsize bytes in 'dest'. Returns the number of bytes
- * actually copied to 'dest'.
+ * Like pg_strnxfrm_prefix for a NUL-terminated input string.
*/
size_t
pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
pg_locale_t locale)
{
- size_t result = 0; /* keep compiler quiet */
-
-#ifdef USE_ICU
- if (locale->provider == COLLPROVIDER_ICU)
- result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale);
- else
-#endif
- PGLOCALE_SUPPORT_ERROR(locale->provider);
-
- return result;
+ return pg_strnxfrm_prefix(dest, destsize, src, -1, locale);
}
/*
* pg_strnxfrm_prefix
*
* Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
- * memcmp() on the byte sequence is equivalent to pg_strcoll() on
+ * memcmp() on the byte sequence is equivalent to pg_strncoll() on
* untransformed strings. The result is not nul-terminated.
*
- * The provided 'src' must be nul-terminated.
+ * The input string must be encoded in the database encoding. If the input
+ * string is NUL-terminated, its length may be specified as -1.
*
* Not all providers support pg_strnxfrm_prefix() safely. The caller should
* check pg_strxfrm_prefix_enabled() first, otherwise this function may return
@@ -2467,10 +2409,6 @@ pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
* If destsize is not large enough to hold the resulting byte sequence, stores
* only the first destsize bytes in 'dest'. Returns the number of bytes
* actually copied to 'dest'.
- *
- * This function may need to nul-terminate the argument for libc functions;
- * so if the caller already has a nul-terminated string, it should call
- * pg_strxfrm_prefix() instead.
*/
size_t
pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
@@ -2661,6 +2599,8 @@ init_icu_converter(void)
/*
* Find length, in UChars, of given string if converted to UChar string.
+ *
+ * A length of -1 indicates that the input string is NUL-terminated.
*/
static size_t
uchar_length(UConverter *converter, const char *str, int32_t len)
@@ -2678,6 +2618,8 @@ uchar_length(UConverter *converter, const char *str, int32_t len)
/*
* Convert the given source string into a UChar string, stored in dest, and
* return the length (in UChars).
+ *
+ * A srclen of -1 indicates that the input string is NUL-terminated.
*/
static int32_t
uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
--
2.34.1
On Thu, 2024-08-22 at 11:00 -0700, Jeff Davis wrote:
> Like ICU, allow -1 length to mean that the input string is NUL-
> terminated for pg_strncoll(), pg_strnxfrm(), and
> pg_strnxfrm_prefix().
To better illustrate the direction I'm going, I roughly implemented
some patches that implement collation using a table of methods rather
than lots of branching based on the provider.
This more cleanly separates the API for a provider, which will enable
us to use a hook to create a custom provider with arbitrary methods,
that may have nothing to do with ICU or libc. Or, we could go so far as
to implement a "CREATE LOCALE PROVIDER" that would provide the methods
using a handler function, and "datlocprovider" would be an OID rather
than a char.
From a practical perspective, I expect that extensions would use this
to lock down the version of a particular provider rather than implement
a completely arbitrary one. But the API is good for either case, and
offers quite a bit of code cleanup.
There are quite a few loose ends, of course:
* There is still a lot of branching on the provider for DDL and
catalog access. I'm not sure if we will ever eliminate all of this, or
if we would even want to.
* I haven't done anything with get_collation_actual_version().
Perhaps that should be a method, too, but it requires some extra
thought if we want this to be useful for "multilib" (having multiple
versions of a provider library at once).
* I didn't add methods for formatting.c yet.
* initdb -- should it offer a way to preload a library and then use
that for the provider?
* I need to allow an arbitrary per-provider context, rather than the
current union designed for the existing providers.
Again, the patches are rough and there's a lot of code churn. I'd like
some feedback on whether people generally like the direction this is
going. If so, I will clean up the patch series into smaller, more
reviewable chunks.
Regards,
Jeff Davis
Attachments:
v4-0001-Tighten-up-make_libc_collator-and-make_icu_collat.patch (text/x-patch; charset=UTF-8; name=v4-0001-Tighten-up-make_libc_collator-and-make_icu_collat.patch) [Download]
From 224470bc4d0660dc11940f5595031eecb0319d62 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Wed, 7 Aug 2024 11:05:46 -0700
Subject: [PATCH v4 1/7] Tighten up make_libc_collator() and
make_icu_collator().
Return the result rather than using an out parameter, and make it the
caller's responsibility to copy it into the right context. Ensure that
no paths leak a collator.
The function make_icu_collator() doesn't have any external callers, so
change it to be static. Also, when re-opening with rules, use a
try/finally block to avoid leaking the collator.
In make_libc_collator(), if the first newlocale() succeeds and the
second one fails, close the first locale_t object.
Discussion: https://postgr.es/m/54d20e812bd6c3e44c10eddcd757ec494ebf1803.camel@j-davis.com
---
src/backend/utils/adt/pg_locale.c | 126 +++++++++++++++++++-----------
src/include/utils/pg_locale.h | 4 -
2 files changed, 80 insertions(+), 50 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 5bef1b113a8..12ba5726f77 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1297,14 +1297,15 @@ report_newlocale_failure(const char *localename)
}
/*
- * Initialize the locale_t field.
+ * Create a locale_t with the given collation and ctype.
*
- * The "C" and "POSIX" locales are not actually handled by libc, so set the
- * locale_t to zero in that case.
+ * The "C" and "POSIX" locales are not actually handled by libc, so return
+ * NULL.
+ *
+ * Ensure that no path leaks a locale_t.
*/
-static void
-make_libc_collator(const char *collate, const char *ctype,
- pg_locale_t result)
+static locale_t
+make_libc_collator(const char *collate, const char *ctype)
{
locale_t loc = 0;
@@ -1343,7 +1344,11 @@ make_libc_collator(const char *collate, const char *ctype,
errno = 0;
loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
if (!loc)
+ {
+ if (loc1)
+ freelocale(loc1);
report_newlocale_failure(ctype);
+ }
}
else
loc = loc1;
@@ -1360,60 +1365,78 @@ make_libc_collator(const char *collate, const char *ctype,
#endif
}
- result->info.lt = loc;
+ return loc;
}
-void
-make_icu_collator(const char *iculocstr,
- const char *icurules,
- struct pg_locale_struct *resultp)
-{
+/*
+ * Create a UCollator with the given locale string and rules.
+ *
+ * Ensure that no path leaks a UCollator.
+ */
#ifdef USE_ICU
- UCollator *collator;
-
- collator = pg_ucol_open(iculocstr);
-
- /*
- * If rules are specified, we extract the rules of the standard collation,
- * add our own rules, and make a new collator with the combined rules.
- */
- if (icurules)
+static UCollator *
+make_icu_collator(const char *iculocstr, const char *icurules)
+{
+ if (!icurules)
{
- const UChar *default_rules;
- UChar *agg_rules;
+ /* simple case without rules */
+ return pg_ucol_open(iculocstr);
+ }
+ else
+ {
+ UCollator *collator_std_rules;
+ UCollator *collator_all_rules;
+ const UChar *std_rules;
UChar *my_rules;
- UErrorCode status;
+ UChar *all_rules;
int32_t length;
+ int32_t total;
+ UErrorCode status;
- default_rules = ucol_getRules(collator, &length);
+ /*
+ * If rules are specified, we extract the rules of the standard
+ * collation, add our own rules, and make a new collator with the
+ * combined rules.
+ */
icu_to_uchar(&my_rules, icurules, strlen(icurules));
- agg_rules = palloc_array(UChar, u_strlen(default_rules) + u_strlen(my_rules) + 1);
- u_strcpy(agg_rules, default_rules);
- u_strcat(agg_rules, my_rules);
+ collator_std_rules = pg_ucol_open(iculocstr);
- ucol_close(collator);
+ std_rules = ucol_getRules(collator_std_rules, &length);
+
+ total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
+
+ /* avoid leaking collator on OOM */
+ all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
+ if (!all_rules)
+ {
+ ucol_close(collator_std_rules);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ u_strcpy(all_rules, std_rules);
+ u_strcat(all_rules, my_rules);
+
+ ucol_close(collator_std_rules);
status = U_ZERO_ERROR;
- collator = ucol_openRules(agg_rules, u_strlen(agg_rules),
- UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH, NULL, &status);
+ collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
+ UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
+ NULL, &status);
if (U_FAILURE(status))
+ {
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
iculocstr, icurules, u_errorName(status))));
- }
+ }
- /* We will leak this string if the caller errors later :-( */
- resultp->info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr);
- resultp->info.icu.ucol = collator;
-#else /* not USE_ICU */
- /* could get here if a collation was created by a build with ICU */
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("ICU is not supported in this build")));
-#endif /* not USE_ICU */
+ return collator_all_rules;
+ }
}
+#endif /* not USE_ICU */
/*
* Initialize default_locale with database locale settings.
@@ -1424,7 +1447,6 @@ init_database_collation(void)
HeapTuple tup;
Form_pg_database dbform;
Datum datum;
- bool isnull;
/* Fetch our pg_database row normally, via syscache */
tup = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
@@ -1449,8 +1471,10 @@ init_database_collation(void)
}
else if (dbform->datlocprovider == COLLPROVIDER_ICU)
{
+#ifdef USE_ICU
char *datlocale;
char *icurules;
+ bool isnull;
datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datlocale);
datlocale = TextDatumGetCString(datum);
@@ -1464,7 +1488,14 @@ init_database_collation(void)
else
icurules = NULL;
- make_icu_collator(datlocale, icurules, &default_locale);
+ default_locale.info.icu.locale = MemoryContextStrdup(TopMemoryContext, datlocale);
+ default_locale.info.icu.ucol = make_icu_collator(datlocale, icurules);
+#else /* not USE_ICU */
+ /* could get here if a collation was created by a build with ICU */
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("ICU is not supported in this build")));
+#endif /* not USE_ICU */
}
else
{
@@ -1483,7 +1514,7 @@ init_database_collation(void)
default_locale.ctype_is_c = (strcmp(datctype, "C") == 0) ||
(strcmp(datctype, "POSIX") == 0);
- make_libc_collator(datcollate, datctype, &default_locale);
+ default_locale.info.lt = make_libc_collator(datcollate, datctype);
}
default_locale.provider = dbform->datlocprovider;
@@ -1572,7 +1603,7 @@ pg_newlocale_from_collation(Oid collid)
result.ctype_is_c = (strcmp(collctype, "C") == 0) ||
(strcmp(collctype, "POSIX") == 0);
- make_libc_collator(collcollate, collctype, &result);
+ result.info.lt = make_libc_collator(collcollate, collctype);
}
else if (collform->collprovider == COLLPROVIDER_ICU)
{
@@ -1591,7 +1622,8 @@ pg_newlocale_from_collation(Oid collid)
else
icurules = NULL;
- make_icu_collator(iculocstr, icurules, &result);
+ result.info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr);
+ result.info.icu.ucol = make_icu_collator(iculocstr, icurules);
}
datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion,
@@ -2500,6 +2532,8 @@ builtin_validate_locale(int encoding, const char *locale)
/*
* Wrapper around ucol_open() to handle API differences for older ICU
* versions.
+ *
+ * Ensure that no path leaks a UCollator.
*/
static UCollator *
pg_ucol_open(const char *loc_str)
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index faae868bfcc..c2d95411e0a 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -104,10 +104,6 @@ struct pg_locale_struct
typedef struct pg_locale_struct *pg_locale_t;
-extern void make_icu_collator(const char *iculocstr,
- const char *icurules,
- struct pg_locale_struct *resultp);
-
extern void init_database_collation(void);
extern pg_locale_t pg_newlocale_from_collation(Oid collid);
--
2.34.1
v4-0007-Use-method-table-for-collation.patch (text/x-patch; charset=UTF-8; name=v4-0007-Use-method-table-for-collation.patch) [Download]
From c9ace91726c2889fe96dec28fd9f3c655e13afd7 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Thu, 19 Sep 2024 11:12:41 -0700
Subject: [PATCH v4 7/7] Use method table for collation.
---
src/backend/regex/regc_pg_locale.c | 376 ++-----
src/backend/utils/adt/Makefile | 2 +
src/backend/utils/adt/meson.build | 2 +
src/backend/utils/adt/pg_locale.c | 1338 +++---------------------
src/backend/utils/adt/pg_locale_icu.c | 873 ++++++++++++++++
src/backend/utils/adt/pg_locale_libc.c | 604 +++++++++++
src/include/utils/pg_locale.h | 44 +-
7 files changed, 1727 insertions(+), 1512 deletions(-)
create mode 100644 src/backend/utils/adt/pg_locale_icu.c
create mode 100644 src/backend/utils/adt/pg_locale_libc.c
diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c
index b75784b6ce5..f7cd3f1787c 100644
--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
@@ -63,32 +63,17 @@
* NB: the coding here assumes pg_wchar is an unsigned type.
*/
-typedef enum
-{
- PG_REGEX_STRATEGY_C, /* C locale (encoding independent) */
- PG_REGEX_STRATEGY_BUILTIN, /* built-in Unicode semantics */
- PG_REGEX_STRATEGY_LIBC_WIDE, /* Use locale_t <wctype.h> functions */
- PG_REGEX_STRATEGY_LIBC_1BYTE, /* Use locale_t <ctype.h> functions */
- PG_REGEX_STRATEGY_ICU, /* Use ICU uchar.h functions */
-} PG_Locale_Strategy;
-
-static PG_Locale_Strategy pg_regex_strategy;
static pg_locale_t pg_regex_locale;
static Oid pg_regex_collation;
+static struct pg_locale_struct dummy_c_locale = {
+ .collate_is_c = true,
+ .ctype_is_c = true,
+};
+
/*
* Hard-wired character properties for C locale
*/
-#define PG_ISDIGIT 0x01
-#define PG_ISALPHA 0x02
-#define PG_ISALNUM (PG_ISDIGIT | PG_ISALPHA)
-#define PG_ISUPPER 0x04
-#define PG_ISLOWER 0x08
-#define PG_ISGRAPH 0x10
-#define PG_ISPRINT 0x20
-#define PG_ISPUNCT 0x40
-#define PG_ISSPACE 0x80
-
static const unsigned char pg_char_properties[128] = {
/* NUL */ 0,
/* ^A */ 0,
@@ -232,7 +217,6 @@ void
pg_set_regex_collation(Oid collation)
{
pg_locale_t locale = 0;
- PG_Locale_Strategy strategy;
if (!OidIsValid(collation))
{
@@ -253,8 +237,8 @@ pg_set_regex_collation(Oid collation)
* catalog access is available, so we can't call
* pg_newlocale_from_collation().
*/
- strategy = PG_REGEX_STRATEGY_C;
collation = C_COLLATION_OID;
+ locale = &dummy_c_locale;
}
else
{
@@ -271,32 +255,11 @@ pg_set_regex_collation(Oid collation)
* C/POSIX collations use this path regardless of database
* encoding
*/
- strategy = PG_REGEX_STRATEGY_C;
- locale = 0;
+ locale = &dummy_c_locale;
collation = C_COLLATION_OID;
}
- else if (locale->provider == COLLPROVIDER_BUILTIN)
- {
- Assert(GetDatabaseEncoding() == PG_UTF8);
- strategy = PG_REGEX_STRATEGY_BUILTIN;
- }
-#ifdef USE_ICU
- else if (locale->provider == COLLPROVIDER_ICU)
- {
- strategy = PG_REGEX_STRATEGY_ICU;
- }
-#endif
- else
- {
- Assert(locale->provider == COLLPROVIDER_LIBC);
- if (GetDatabaseEncoding() == PG_UTF8)
- strategy = PG_REGEX_STRATEGY_LIBC_WIDE;
- else
- strategy = PG_REGEX_STRATEGY_LIBC_1BYTE;
- }
}
- pg_regex_strategy = strategy;
pg_regex_locale = locale;
pg_regex_collation = collation;
}
@@ -304,82 +267,31 @@ pg_set_regex_collation(Oid collation)
static int
pg_wc_isdigit(pg_wchar c)
{
- switch (pg_regex_strategy)
- {
- case PG_REGEX_STRATEGY_C:
- return (c <= (pg_wchar) 127 &&
- (pg_char_properties[c] & PG_ISDIGIT));
- case PG_REGEX_STRATEGY_BUILTIN:
- return pg_u_isdigit(c, true);
- case PG_REGEX_STRATEGY_LIBC_WIDE:
- if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
- return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
- /* FALL THRU */
- case PG_REGEX_STRATEGY_LIBC_1BYTE:
- return (c <= (pg_wchar) UCHAR_MAX &&
- isdigit_l((unsigned char) c, pg_regex_locale->info.lt));
- break;
- case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
- return u_isdigit(c);
-#endif
- break;
- }
- return 0; /* can't get here, but keep compiler quiet */
+ if (pg_regex_locale->ctype_is_c)
+ return (c <= (pg_wchar) 127 &&
+ (pg_char_properties[c] & PG_ISDIGIT));
+ else
+ return char_props(c, PG_ISDIGIT, pg_regex_locale) != 0;
}
static int
pg_wc_isalpha(pg_wchar c)
{
- switch (pg_regex_strategy)
- {
- case PG_REGEX_STRATEGY_C:
- return (c <= (pg_wchar) 127 &&
- (pg_char_properties[c] & PG_ISALPHA));
- case PG_REGEX_STRATEGY_BUILTIN:
- return pg_u_isalpha(c);
- case PG_REGEX_STRATEGY_LIBC_WIDE:
- if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
- return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
- /* FALL THRU */
- case PG_REGEX_STRATEGY_LIBC_1BYTE:
- return (c <= (pg_wchar) UCHAR_MAX &&
- isalpha_l((unsigned char) c, pg_regex_locale->info.lt));
- break;
- case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
- return u_isalpha(c);
-#endif
- break;
- }
- return 0; /* can't get here, but keep compiler quiet */
+ if (pg_regex_locale->ctype_is_c)
+ return (c <= (pg_wchar) 127 &&
+ (pg_char_properties[c] & PG_ISALPHA));
+ else
+ return char_props(c, PG_ISALPHA, pg_regex_locale) != 0;
}
static int
pg_wc_isalnum(pg_wchar c)
{
- switch (pg_regex_strategy)
- {
- case PG_REGEX_STRATEGY_C:
- return (c <= (pg_wchar) 127 &&
- (pg_char_properties[c] & PG_ISALNUM));
- case PG_REGEX_STRATEGY_BUILTIN:
- return pg_u_isalnum(c, true);
- case PG_REGEX_STRATEGY_LIBC_WIDE:
- if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
- return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
- /* FALL THRU */
- case PG_REGEX_STRATEGY_LIBC_1BYTE:
- return (c <= (pg_wchar) UCHAR_MAX &&
- isalnum_l((unsigned char) c, pg_regex_locale->info.lt));
- break;
- case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
- return u_isalnum(c);
-#endif
- break;
- }
- return 0; /* can't get here, but keep compiler quiet */
+ if (pg_regex_locale->ctype_is_c)
+ return (c <= (pg_wchar) 127 &&
+ (pg_char_properties[c] & PG_ISALNUM));
+ else
+ return char_props(c, PG_ISDIGIT|PG_ISALPHA, pg_regex_locale) != 0;
}
static int
@@ -394,219 +306,87 @@ pg_wc_isword(pg_wchar c)
static int
pg_wc_isupper(pg_wchar c)
{
- switch (pg_regex_strategy)
- {
- case PG_REGEX_STRATEGY_C:
- return (c <= (pg_wchar) 127 &&
- (pg_char_properties[c] & PG_ISUPPER));
- case PG_REGEX_STRATEGY_BUILTIN:
- return pg_u_isupper(c);
- case PG_REGEX_STRATEGY_LIBC_WIDE:
- if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
- return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
- /* FALL THRU */
- case PG_REGEX_STRATEGY_LIBC_1BYTE:
- return (c <= (pg_wchar) UCHAR_MAX &&
- isupper_l((unsigned char) c, pg_regex_locale->info.lt));
- break;
- case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
- return u_isupper(c);
-#endif
- break;
- }
- return 0; /* can't get here, but keep compiler quiet */
+ if (pg_regex_locale->ctype_is_c)
+ return (c <= (pg_wchar) 127 &&
+ (pg_char_properties[c] & PG_ISUPPER));
+ else
+ return char_props(c, PG_ISUPPER, pg_regex_locale) != 0;
}
static int
pg_wc_islower(pg_wchar c)
{
- switch (pg_regex_strategy)
- {
- case PG_REGEX_STRATEGY_C:
- return (c <= (pg_wchar) 127 &&
- (pg_char_properties[c] & PG_ISLOWER));
- case PG_REGEX_STRATEGY_BUILTIN:
- return pg_u_islower(c);
- case PG_REGEX_STRATEGY_LIBC_WIDE:
- if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
- return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
- /* FALL THRU */
- case PG_REGEX_STRATEGY_LIBC_1BYTE:
- return (c <= (pg_wchar) UCHAR_MAX &&
- islower_l((unsigned char) c, pg_regex_locale->info.lt));
- break;
- case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
- return u_islower(c);
-#endif
- break;
- }
- return 0; /* can't get here, but keep compiler quiet */
+ if (pg_regex_locale->ctype_is_c)
+ return (c <= (pg_wchar) 127 &&
+ (pg_char_properties[c] & PG_ISLOWER));
+ else
+ return char_props(c, PG_ISLOWER, pg_regex_locale) != 0;
}
static int
pg_wc_isgraph(pg_wchar c)
{
- switch (pg_regex_strategy)
- {
- case PG_REGEX_STRATEGY_C:
- return (c <= (pg_wchar) 127 &&
- (pg_char_properties[c] & PG_ISGRAPH));
- case PG_REGEX_STRATEGY_BUILTIN:
- return pg_u_isgraph(c);
- case PG_REGEX_STRATEGY_LIBC_WIDE:
- if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
- return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
- /* FALL THRU */
- case PG_REGEX_STRATEGY_LIBC_1BYTE:
- return (c <= (pg_wchar) UCHAR_MAX &&
- isgraph_l((unsigned char) c, pg_regex_locale->info.lt));
- break;
- case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
- return u_isgraph(c);
-#endif
- break;
- }
- return 0; /* can't get here, but keep compiler quiet */
+ if (pg_regex_locale->ctype_is_c)
+ return (c <= (pg_wchar) 127 &&
+ (pg_char_properties[c] & PG_ISGRAPH));
+ else
+ return char_props(c, PG_ISGRAPH, pg_regex_locale) != 0;
}
static int
pg_wc_isprint(pg_wchar c)
{
- switch (pg_regex_strategy)
- {
- case PG_REGEX_STRATEGY_C:
- return (c <= (pg_wchar) 127 &&
- (pg_char_properties[c] & PG_ISPRINT));
- case PG_REGEX_STRATEGY_BUILTIN:
- return pg_u_isprint(c);
- case PG_REGEX_STRATEGY_LIBC_WIDE:
- if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
- return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
- /* FALL THRU */
- case PG_REGEX_STRATEGY_LIBC_1BYTE:
- return (c <= (pg_wchar) UCHAR_MAX &&
- isprint_l((unsigned char) c, pg_regex_locale->info.lt));
- break;
- case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
- return u_isprint(c);
-#endif
- break;
- }
- return 0; /* can't get here, but keep compiler quiet */
+ if (pg_regex_locale->ctype_is_c)
+ return (c <= (pg_wchar) 127 &&
+ (pg_char_properties[c] & PG_ISPRINT));
+ else
+ return char_props(c, PG_ISPRINT, pg_regex_locale) != 0;
}
static int
pg_wc_ispunct(pg_wchar c)
{
- switch (pg_regex_strategy)
- {
- case PG_REGEX_STRATEGY_C:
- return (c <= (pg_wchar) 127 &&
- (pg_char_properties[c] & PG_ISPUNCT));
- case PG_REGEX_STRATEGY_BUILTIN:
- return pg_u_ispunct(c, true);
- case PG_REGEX_STRATEGY_LIBC_WIDE:
- if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
- return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
- /* FALL THRU */
- case PG_REGEX_STRATEGY_LIBC_1BYTE:
- return (c <= (pg_wchar) UCHAR_MAX &&
- ispunct_l((unsigned char) c, pg_regex_locale->info.lt));
- break;
- case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
- return u_ispunct(c);
-#endif
- break;
- }
- return 0; /* can't get here, but keep compiler quiet */
+ if (pg_regex_locale->ctype_is_c)
+ return (c <= (pg_wchar) 127 &&
+ (pg_char_properties[c] & PG_ISPUNCT));
+ else
+ return char_props(c, PG_ISPUNCT, pg_regex_locale) != 0;
}
static int
pg_wc_isspace(pg_wchar c)
{
- switch (pg_regex_strategy)
- {
- case PG_REGEX_STRATEGY_C:
- return (c <= (pg_wchar) 127 &&
- (pg_char_properties[c] & PG_ISSPACE));
- case PG_REGEX_STRATEGY_BUILTIN:
- return pg_u_isspace(c);
- case PG_REGEX_STRATEGY_LIBC_WIDE:
- if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
- return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
- /* FALL THRU */
- case PG_REGEX_STRATEGY_LIBC_1BYTE:
- return (c <= (pg_wchar) UCHAR_MAX &&
- isspace_l((unsigned char) c, pg_regex_locale->info.lt));
- break;
- case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
- return u_isspace(c);
-#endif
- break;
- }
- return 0; /* can't get here, but keep compiler quiet */
+ if (pg_regex_locale->ctype_is_c)
+ return (c <= (pg_wchar) 127 &&
+ (pg_char_properties[c] & PG_ISSPACE));
+ else
+ return char_props(c, PG_ISSPACE, pg_regex_locale) != 0;
}
static pg_wchar
pg_wc_toupper(pg_wchar c)
{
- switch (pg_regex_strategy)
+ if (pg_regex_locale->ctype_is_c)
{
- case PG_REGEX_STRATEGY_C:
- if (c <= (pg_wchar) 127)
- return pg_ascii_toupper((unsigned char) c);
- return c;
- case PG_REGEX_STRATEGY_BUILTIN:
- return unicode_uppercase_simple(c);
- case PG_REGEX_STRATEGY_LIBC_WIDE:
- if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
- return towupper_l((wint_t) c, pg_regex_locale->info.lt);
- /* FALL THRU */
- case PG_REGEX_STRATEGY_LIBC_1BYTE:
- if (c <= (pg_wchar) UCHAR_MAX)
- return toupper_l((unsigned char) c, pg_regex_locale->info.lt);
- return c;
- case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
- return u_toupper(c);
-#endif
- break;
+ if (c <= (pg_wchar) 127)
+ return pg_ascii_toupper((unsigned char) c);
+ return c;
}
- return 0; /* can't get here, but keep compiler quiet */
+ else
+ return pg_regex_locale->ctype->wc_toupper(c, pg_regex_locale);
}
static pg_wchar
pg_wc_tolower(pg_wchar c)
{
- switch (pg_regex_strategy)
+ if (pg_regex_locale->ctype_is_c)
{
- case PG_REGEX_STRATEGY_C:
- if (c <= (pg_wchar) 127)
- return pg_ascii_tolower((unsigned char) c);
- return c;
- case PG_REGEX_STRATEGY_BUILTIN:
- return unicode_lowercase_simple(c);
- case PG_REGEX_STRATEGY_LIBC_WIDE:
- if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
- return towlower_l((wint_t) c, pg_regex_locale->info.lt);
- /* FALL THRU */
- case PG_REGEX_STRATEGY_LIBC_1BYTE:
- if (c <= (pg_wchar) UCHAR_MAX)
- return tolower_l((unsigned char) c, pg_regex_locale->info.lt);
- return c;
- case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
- return u_tolower(c);
-#endif
- break;
+ if (c <= (pg_wchar) 127)
+ return pg_ascii_tolower((unsigned char) c);
+ return c;
}
- return 0; /* can't get here, but keep compiler quiet */
+ else
+ return pg_regex_locale->ctype->wc_tolower(c, pg_regex_locale);
}
@@ -732,37 +512,27 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
* would always be true for production values of MAX_SIMPLE_CHR, but it's
* useful to allow it to be small for testing purposes.)
*/
- switch (pg_regex_strategy)
+ if (pg_regex_locale->ctype_is_c)
{
- case PG_REGEX_STRATEGY_C:
#if MAX_SIMPLE_CHR >= 127
max_chr = (pg_wchar) 127;
pcc->cv.cclasscode = -1;
#else
max_chr = (pg_wchar) MAX_SIMPLE_CHR;
#endif
- break;
- case PG_REGEX_STRATEGY_BUILTIN:
- max_chr = (pg_wchar) MAX_SIMPLE_CHR;
- break;
- case PG_REGEX_STRATEGY_LIBC_WIDE:
- max_chr = (pg_wchar) MAX_SIMPLE_CHR;
- break;
- case PG_REGEX_STRATEGY_LIBC_1BYTE:
+ }
+ else
+ {
#if MAX_SIMPLE_CHR >= UCHAR_MAX
+ if (pg_regex_locale->provider == COLLPROVIDER_LIBC &&
+ GetDatabaseEncoding() != PG_UTF8)
+ {
max_chr = (pg_wchar) UCHAR_MAX;
pcc->cv.cclasscode = -1;
-#else
- max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+ }
+ else
#endif
- break;
- case PG_REGEX_STRATEGY_ICU:
max_chr = (pg_wchar) MAX_SIMPLE_CHR;
- break;
- default:
- Assert(false);
- max_chr = 0; /* can't get here, but keep compiler quiet */
- break;
}
/*
diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile
index edb09d4e356..85e5eaf32eb 100644
--- a/src/backend/utils/adt/Makefile
+++ b/src/backend/utils/adt/Makefile
@@ -79,6 +79,8 @@ OBJS = \
orderedsetaggs.o \
partitionfuncs.o \
pg_locale.o \
+ pg_locale_icu.o \
+ pg_locale_libc.o \
pg_lsn.o \
pg_upgrade_support.o \
pgstatfuncs.o \
diff --git a/src/backend/utils/adt/meson.build b/src/backend/utils/adt/meson.build
index 8c6fc80c373..f73f294b8f5 100644
--- a/src/backend/utils/adt/meson.build
+++ b/src/backend/utils/adt/meson.build
@@ -66,6 +66,8 @@ backend_sources += files(
'orderedsetaggs.c',
'partitionfuncs.c',
'pg_locale.c',
+ 'pg_locale_icu.c',
+ 'pg_locale_libc.c',
'pg_lsn.c',
'pg_upgrade_support.c',
'pgstatfuncs.c',
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index cfba55a6e31..1802b7a1589 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -58,6 +58,8 @@
#include "catalog/pg_collation.h"
#include "catalog/pg_database.h"
#include "common/hashfn.h"
+#include "common/unicode_case.h"
+#include "common/unicode_category.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "utils/builtins.h"
@@ -87,12 +89,6 @@
#define PGLOCALE_SUPPORT_ERROR(provider) \
elog(ERROR, "unsupported collprovider for %s: %c", __func__, provider)
-/*
- * This should be large enough that most strings will fit, but small enough
- * that we feel comfortable putting it on the stack
- */
-#define TEXTBUFLEN 1024
-
#define MAX_L10N_DATA 80
@@ -119,7 +115,21 @@ char *localized_full_months[12 + 1];
/* is the databases's LC_CTYPE the C locale? */
bool database_ctype_is_c = false;
-static struct pg_locale_struct default_locale;
+#ifdef USE_ICU
+extern pg_locale_t icu_dat_create_locale(HeapTuple dattuple);
+extern pg_locale_t icu_coll_create_locale(MemoryContext context,
+ ResourceOwner resowner,
+ HeapTuple colltuple);
+extern UCollator *pg_ucol_open(const char *loc_str);
+#endif
+
+
+extern pg_locale_t libc_dat_create_locale(HeapTuple dattuple);
+extern pg_locale_t libc_coll_create_locale(MemoryContext context,
+ ResourceOwner resowner,
+ HeapTuple colltuple);
+
+static pg_locale_t default_locale = NULL;
/* indicates whether locale information cache is valid */
static bool CurrentLocaleConvValid = false;
@@ -170,51 +180,48 @@ static pg_locale_t last_collation_cache_locale = NULL;
static char *IsoLocaleName(const char *);
#endif
-#ifdef USE_ICU
-/*
- * Converter object for converting between ICU's UChar strings and C strings
- * in database encoding. Since the database encoding doesn't change, we only
- * need one of these per session.
- */
-static UConverter *icu_converter = NULL;
-
-static UCollator *pg_ucol_open(const char *loc_str);
-static void init_icu_converter(void);
-static size_t uchar_length(UConverter *converter,
- const char *str, int32_t len);
-static int32_t uchar_convert(UConverter *converter,
- UChar *dest, int32_t destlen,
- const char *src, int32_t srclen);
-static void icu_set_collation_attributes(UCollator *collator, const char *loc,
- UErrorCode *status);
-
-static void ResourceOwnerRememberUCollator(ResourceOwner owner,
- UCollator *collator);
-static void ResOwnerReleaseUCollator(Datum val);
-
-static const ResourceOwnerDesc UCollatorResourceKind =
-{
- .name = "UCollator reference",
- .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
- .release_priority = RELEASE_PRIO_LAST,
- .ReleaseResource = ResOwnerReleaseUCollator,
- .DebugPrint = NULL /* the default message is fine */
-};
-#endif
+static int
+char_props_builtin(pg_wchar wc, int mask, pg_locale_t locale)
+{
+ int result = 0;
+
+ if ((mask & PG_ISDIGIT) && pg_u_isdigit(wc, true))
+ result |= PG_ISDIGIT;
+ if ((mask & PG_ISALPHA) && pg_u_isalpha(wc))
+ result |= PG_ISALPHA;
+ if ((mask & PG_ISUPPER) && pg_u_isupper(wc))
+ result |= PG_ISUPPER;
+ if ((mask & PG_ISLOWER) && pg_u_islower(wc))
+ result |= PG_ISLOWER;
+ if ((mask & PG_ISGRAPH) && pg_u_isgraph(wc))
+ result |= PG_ISGRAPH;
+ if ((mask & PG_ISPRINT) && pg_u_isprint(wc))
+ result |= PG_ISPRINT;
+ if ((mask & PG_ISPUNCT) && pg_u_ispunct(wc, true))
+ result |= PG_ISPUNCT;
+ if ((mask & PG_ISSPACE) && pg_u_isspace(wc))
+ result |= PG_ISSPACE;
+
+ return result;
+}
-static void ResourceOwnerRememberLocaleT(ResourceOwner owner,
- locale_t locale);
-static void ResOwnerReleaseLocaleT(Datum val);
+static pg_wchar
+toupper_builtin(pg_wchar wc, pg_locale_t locale)
+{
+ return unicode_uppercase_simple(wc);
+}
-static const ResourceOwnerDesc LocaleTResourceKind =
+static pg_wchar
+tolower_builtin(pg_wchar wc, pg_locale_t locale)
{
- .name = "locale_t reference",
- .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
- .release_priority = RELEASE_PRIO_LAST,
- .ReleaseResource = ResOwnerReleaseLocaleT,
- .DebugPrint = NULL /* the default message is fine */
-};
+ return unicode_lowercase_simple(wc);
+}
+struct ctype_methods builtin_ctype_methods = {
+ .char_props = char_props_builtin,
+ .wc_toupper = toupper_builtin,
+ .wc_tolower = tolower_builtin,
+};
/*
* POSIX doesn't define _l-variants of these functions, but several systems
@@ -1262,206 +1269,6 @@ IsoLocaleName(const char *winlocname)
#endif /* WIN32 && LC_MESSAGES */
-/* simple subroutine for reporting errors from newlocale() */
-static void
-report_newlocale_failure(const char *localename)
-{
- int save_errno;
-
- /*
- * Windows doesn't provide any useful error indication from
- * _create_locale(), and BSD-derived platforms don't seem to feel they
- * need to set errno either (even though POSIX is pretty clear that
- * newlocale should do so). So, if errno hasn't been set, assume ENOENT
- * is what to report.
- */
- if (errno == 0)
- errno = ENOENT;
-
- /*
- * ENOENT means "no such locale", not "no such file", so clarify that
- * errno with an errdetail message.
- */
- save_errno = errno; /* auxiliary funcs might change errno */
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("could not create locale \"%s\": %m",
- localename),
- (save_errno == ENOENT ?
- errdetail("The operating system could not find any locale data for the locale name \"%s\".",
- localename) : 0)));
-}
-
-static void
-ResourceOwnerRememberLocaleT(ResourceOwner owner, locale_t locale)
-{
- ResourceOwnerRemember(owner, PointerGetDatum(locale),
- &LocaleTResourceKind);
-}
-
-static void
-ResOwnerReleaseLocaleT(Datum val)
-{
- locale_t locale = (locale_t) DatumGetPointer(val);
- freelocale(locale);
-}
-
-/*
- * Create a locale_t with the given collation and ctype.
- *
- * The "C" and "POSIX" locales are not actually handled by libc, so return
- * NULL.
- *
- * Ensure that no path leaks a locale_t.
- */
-static locale_t
-make_libc_collator(const char *collate, const char *ctype)
-{
- locale_t loc = 0;
-
- if (strcmp(collate, ctype) == 0)
- {
- if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
- {
- /* Normal case where they're the same */
- errno = 0;
-#ifndef WIN32
- loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
- NULL);
-#else
- loc = _create_locale(LC_ALL, collate);
-#endif
- if (!loc)
- report_newlocale_failure(collate);
- }
- }
- else
- {
-#ifndef WIN32
- /* We need two newlocale() steps */
- locale_t loc1 = 0;
-
- if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
- {
- errno = 0;
- loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
- if (!loc1)
- report_newlocale_failure(collate);
- }
-
- if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
- {
- errno = 0;
- loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
- if (!loc)
- {
- if (loc1)
- freelocale(loc1);
- report_newlocale_failure(ctype);
- }
- }
- else
- loc = loc1;
-#else
-
- /*
- * XXX The _create_locale() API doesn't appear to support this. Could
- * perhaps be worked around by changing pg_locale_t to contain two
- * separate fields.
- */
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("collations with different collate and ctype values are not supported on this platform")));
-#endif
- }
-
- return loc;
-}
-
-/*
- * Create a UCollator with the given locale string and rules.
- *
- * Ensure that no path leaks a UCollator.
- */
-#ifdef USE_ICU
-static void
-ResourceOwnerRememberUCollator(ResourceOwner owner, UCollator *collator)
-{
- ResourceOwnerRemember(owner, PointerGetDatum(collator),
- &UCollatorResourceKind);
-}
-
-static void
-ResOwnerReleaseUCollator(Datum val)
-{
- UCollator *collator = (UCollator *) DatumGetPointer(val);
- ucol_close(collator);
-}
-
-static UCollator *
-make_icu_collator(const char *iculocstr, const char *icurules)
-{
- if (!icurules)
- {
- /* simple case without rules */
- return pg_ucol_open(iculocstr);
- }
- else
- {
- UCollator *collator_std_rules;
- UCollator *collator_all_rules;
- const UChar *std_rules;
- UChar *my_rules;
- UChar *all_rules;
- int32_t length;
- int32_t total;
- UErrorCode status;
-
- /*
- * If rules are specified, we extract the rules of the standard
- * collation, add our own rules, and make a new collator with the
- * combined rules.
- */
- icu_to_uchar(&my_rules, icurules, strlen(icurules));
-
- collator_std_rules = pg_ucol_open(iculocstr);
-
- std_rules = ucol_getRules(collator_std_rules, &length);
-
- total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
-
- /* avoid leaking collator on OOM */
- all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
- if (!all_rules)
- {
- ucol_close(collator_std_rules);
- ereport(ERROR,
- (errcode(ERRCODE_OUT_OF_MEMORY),
- errmsg("out of memory")));
- }
-
- u_strcpy(all_rules, std_rules);
- u_strcat(all_rules, my_rules);
-
- ucol_close(collator_std_rules);
-
- status = U_ZERO_ERROR;
- collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
- UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
- NULL, &status);
- if (U_FAILURE(status))
- {
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
- iculocstr, icurules, u_errorName(status))));
- }
-
- return collator_all_rules;
- }
-}
-#endif /* not USE_ICU */
-
/*
* Initialize default_locale with database locale settings.
*/
@@ -1471,6 +1278,7 @@ init_database_collation(void)
HeapTuple tup;
Form_pg_database dbform;
Datum datum;
+ pg_locale_t result = NULL;
/* Fetch our pg_database row normally, via syscache */
tup = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
@@ -1487,70 +1295,38 @@ init_database_collation(void)
builtin_validate_locale(dbform->encoding, datlocale);
- default_locale.collate_is_c = true;
- default_locale.ctype_is_c = (strcmp(datlocale, "C") == 0);
-
- default_locale.info.builtin.locale = MemoryContextStrdup(
+ result = MemoryContextAllocZero(TopMemoryContext,
+ sizeof(struct pg_locale_struct));
+ result->info.builtin.locale = MemoryContextStrdup(
TopMemoryContext, datlocale);
- }
- else if (dbform->datlocprovider == COLLPROVIDER_ICU)
- {
-#ifdef USE_ICU
- char *datlocale;
- char *icurules;
- bool isnull;
-
- datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datlocale);
- datlocale = TextDatumGetCString(datum);
-
- default_locale.collate_is_c = false;
- default_locale.ctype_is_c = false;
+ result->collate_is_c = true;
+ result->ctype_is_c = (strcmp(datlocale, "C") == 0);
- datum = SysCacheGetAttr(DATABASEOID, tup, Anum_pg_database_daticurules, &isnull);
- if (!isnull)
- icurules = TextDatumGetCString(datum);
- else
- icurules = NULL;
+ if (!result->ctype_is_c)
+ result->ctype = &builtin_ctype_methods;
- default_locale.info.icu.locale = MemoryContextStrdup(TopMemoryContext, datlocale);
- default_locale.info.icu.ucol = make_icu_collator(datlocale, icurules);
-#else /* not USE_ICU */
- /* could get here if a collation was created by a build with ICU */
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("ICU is not supported in this build")));
-#endif /* not USE_ICU */
}
+#ifdef USE_ICU
+ else if (dbform->datlocprovider == COLLPROVIDER_ICU)
+ result = icu_dat_create_locale(tup);
+#endif /* USE_ICU */
+ else if (dbform->datlocprovider == COLLPROVIDER_LIBC)
+ result = libc_dat_create_locale(tup);
else
- {
- const char *datcollate;
- const char *datctype;
-
- Assert(dbform->datlocprovider == COLLPROVIDER_LIBC);
-
- datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datcollate);
- datcollate = TextDatumGetCString(datum);
- datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datctype);
- datctype = TextDatumGetCString(datum);
-
- default_locale.collate_is_c = (strcmp(datcollate, "C") == 0) ||
- (strcmp(datcollate, "POSIX") == 0);
- default_locale.ctype_is_c = (strcmp(datctype, "C") == 0) ||
- (strcmp(datctype, "POSIX") == 0);
+ PGLOCALE_SUPPORT_ERROR(dbform->datlocprovider);
- default_locale.info.lt = make_libc_collator(datcollate, datctype);
- }
-
- default_locale.provider = dbform->datlocprovider;
+ result->provider = dbform->datlocprovider;
/*
* Default locale is currently always deterministic. Nondeterministic
* locales currently don't support pattern matching, which would break a
* lot of things if applied globally.
*/
- default_locale.deterministic = true;
+ result->deterministic = true;
ReleaseSysCache(tup);
+
+ default_locale = result;
}
/*
@@ -1558,12 +1334,12 @@ init_database_collation(void)
* allocating memory.
*/
static pg_locale_t
-create_pg_locale(MemoryContext context, ResourceOwner owner, Oid collid)
+create_pg_locale(MemoryContext context, ResourceOwner resowner, Oid collid)
{
/* We haven't computed this yet in this session, so do it */
HeapTuple tp;
Form_pg_collation collform;
- pg_locale_t result;
+ pg_locale_t result = NULL;
Datum datum;
bool isnull;
@@ -1631,65 +1407,19 @@ create_pg_locale(MemoryContext context, ResourceOwner owner, Oid collid)
result->deterministic = collform->collisdeterministic;
result->collate_is_c = true;
result->ctype_is_c = (strcmp(locstr, "C") == 0);
+ if (!result->ctype_is_c)
+ result->ctype = &builtin_ctype_methods;
result->info.builtin.locale = MemoryContextStrdup(context,
locstr);
}
- else if (collform->collprovider == COLLPROVIDER_LIBC)
- {
- const char *collcollate;
- const char *collctype;
- locale_t locale;
-
- datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
- collcollate = TextDatumGetCString(datum);
- datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype);
- collctype = TextDatumGetCString(datum);
-
- ResourceOwnerEnlarge(owner);
- locale = make_libc_collator(collcollate, collctype);
- if (locale)
- ResourceOwnerRememberLocaleT(owner, locale);
-
- result = MemoryContextAllocZero(context,
- sizeof(struct pg_locale_struct));
-
- result->provider = collform->collprovider;
- result->deterministic = collform->collisdeterministic;
- result->collate_is_c = (strcmp(collcollate, "C") == 0) ||
- (strcmp(collcollate, "POSIX") == 0);
- result->ctype_is_c = (strcmp(collctype, "C") == 0) ||
- (strcmp(collctype, "POSIX") == 0);
- result->info.lt = locale;
- }
+#ifdef USE_ICU
else if (collform->collprovider == COLLPROVIDER_ICU)
- {
- const char *iculocstr;
- const char *icurules;
- UCollator *collator;
-
- datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
- iculocstr = TextDatumGetCString(datum);
-
- datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull);
- if (!isnull)
- icurules = TextDatumGetCString(datum);
- else
- icurules = NULL;
-
- ResourceOwnerEnlarge(owner);
- collator = make_icu_collator(iculocstr, icurules);
- ResourceOwnerRememberUCollator(owner, collator);
-
- result = MemoryContextAllocZero(context,
- sizeof(struct pg_locale_struct));
-
- result->provider = collform->collprovider;
- result->deterministic = collform->collisdeterministic;
- result->collate_is_c = false;
- result->ctype_is_c = false;
- result->info.icu.locale = MemoryContextStrdup(context, iculocstr);
- result->info.icu.ucol = collator;
- }
+ result = icu_coll_create_locale(context, resowner, tp);
+#endif
+ else if (collform->collprovider == COLLPROVIDER_LIBC)
+ result = libc_coll_create_locale(context, resowner, tp);
+ else
+ PGLOCALE_SUPPORT_ERROR(collform->collprovider);
ReleaseSysCache(tp);
@@ -1735,7 +1465,7 @@ pg_newlocale_from_collation(Oid collid)
bool found;
if (collid == DEFAULT_COLLATION_OID)
- return &default_locale;
+ return default_locale;
if (!OidIsValid(collid))
elog(ERROR, "cache lookup failed for collation %u", collid);
@@ -1886,483 +1616,48 @@ get_collation_actual_version(char collprovider, const char *collcollate)
}
/*
- * strncoll_libc_win32_utf8
- *
- * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
- * invoke wcscoll_l().
+ * pg_strcoll
*
- * An input string length of -1 means that it's NUL-terminated.
+ * Like pg_strncoll for NUL-terminated input strings.
*/
-#ifdef WIN32
-static int
-strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
- ssize_t len2, pg_locale_t locale)
+int
+pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
{
- char sbuf[TEXTBUFLEN];
- char *buf = sbuf;
- char *a1p,
- *a2p;
- int a1len;
- int a2len;
- int r;
- int result;
-
- Assert(locale->provider == COLLPROVIDER_LIBC);
- Assert(GetDatabaseEncoding() == PG_UTF8);
-#ifndef WIN32
- Assert(false);
-#endif
-
- if (len1 == -1)
- len1 = strlen(arg1);
- if (len2 == -1)
- len2 = strlen(arg2);
-
- a1len = len1 * 2 + 2;
- a2len = len2 * 2 + 2;
-
- if (a1len + a2len > TEXTBUFLEN)
- buf = palloc(a1len + a2len);
-
- a1p = buf;
- a2p = buf + a1len;
-
- /* API does not work for zero-length input */
- if (len1 == 0)
- r = 0;
- else
- {
- r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
- (LPWSTR) a1p, a1len / 2);
- if (!r)
- ereport(ERROR,
- (errmsg("could not convert string to UTF-16: error code %lu",
- GetLastError())));
- }
- ((LPWSTR) a1p)[r] = 0;
-
- if (len2 == 0)
- r = 0;
- else
- {
- r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
- (LPWSTR) a2p, a2len / 2);
- if (!r)
- ereport(ERROR,
- (errmsg("could not convert string to UTF-16: error code %lu",
- GetLastError())));
- }
- ((LPWSTR) a2p)[r] = 0;
-
- errno = 0;
- result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt);
- if (result == 2147483647) /* _NLSCMPERROR; missing from mingw headers */
- ereport(ERROR,
- (errmsg("could not compare Unicode strings: %m")));
-
- if (buf != sbuf)
- pfree(buf);
-
- return result;
+ return locale->collate->strncoll(arg1, -1, arg2, -1, locale);
}
-#endif /* WIN32 */
/*
- * strncoll_libc
+ * pg_strncoll
+ *
+ * Dispatch to the locale's strncoll method, which calls ucol_strcollUTF8(),
+ * ucol_strcoll(), strcoll_l() or wcscoll_l() as appropriate for the given
+ * provider, platform, and database encoding. The locale must not be NULL.
+ *
+ * The input strings must be encoded in the database encoding. If an input
+ * string is NUL-terminated, its length may be specified as -1.
*
- * An input string length of -1 means that it's NUL-terminated.
+ * The caller is responsible for breaking ties if the collation is
+ * deterministic; this maintains consistency with pg_strnxfrm(), which cannot
+ * easily account for deterministic collations.
*/
-static int
-strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
- pg_locale_t locale)
+int
+pg_strncoll(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
+ pg_locale_t locale)
{
- char sbuf[TEXTBUFLEN];
- char *buf = sbuf;
- size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1;
- size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1;
- const char *arg1n;
- const char *arg2n;
- int result;
-
- Assert(locale->provider == COLLPROVIDER_LIBC);
-
-#ifdef WIN32
- /* check for this case before doing the work for nul-termination */
- if (GetDatabaseEncoding() == PG_UTF8)
- return strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
-#endif /* WIN32 */
-
- if (bufsize1 + bufsize2 > TEXTBUFLEN)
- buf = palloc(bufsize1 + bufsize2);
-
- /* nul-terminate arguments if necessary */
- if (len1 == -1)
- {
- arg1n = arg1;
- }
- else
- {
- char *buf1 = buf;
- memcpy(buf1, arg1, len1);
- buf1[len1] = '\0';
- arg1n = buf1;
- }
-
- if (len2 == -1)
- {
- arg2n = arg2;
- }
- else
- {
- char *buf2 = buf + bufsize1;
- memcpy(buf2, arg2, len2);
- buf2[len2] = '\0';
- arg2n = buf2;
- }
-
- result = strcoll_l(arg1n, arg2n, locale->info.lt);
-
- if (buf != sbuf)
- pfree(buf);
-
- return result;
+ return locale->collate->strncoll(arg1, len1, arg2, len2, locale);
}
-#ifdef USE_ICU
-
/*
- * strncoll_icu_no_utf8
+ * Return true if the collation provider supports pg_strxfrm() and
+ * pg_strnxfrm(); otherwise false.
*
- * Convert the arguments from the database encoding to UChar strings, then
- * call ucol_strcoll(). An argument length of -1 means that the string is
- * NUL-terminated.
- *
- * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
- * caller should call that instead.
- */
-static int
-strncoll_icu_no_utf8(const char *arg1, ssize_t len1,
- const char *arg2, ssize_t len2, pg_locale_t locale)
-{
- char sbuf[TEXTBUFLEN];
- char *buf = sbuf;
- int32_t ulen1;
- int32_t ulen2;
- size_t bufsize1;
- size_t bufsize2;
- UChar *uchar1,
- *uchar2;
- int result;
-
- Assert(locale->provider == COLLPROVIDER_ICU);
-#ifdef HAVE_UCOL_STRCOLLUTF8
- Assert(GetDatabaseEncoding() != PG_UTF8);
-#endif
-
- init_icu_converter();
-
- ulen1 = uchar_length(icu_converter, arg1, len1);
- ulen2 = uchar_length(icu_converter, arg2, len2);
-
- bufsize1 = (ulen1 + 1) * sizeof(UChar);
- bufsize2 = (ulen2 + 1) * sizeof(UChar);
-
- if (bufsize1 + bufsize2 > TEXTBUFLEN)
- buf = palloc(bufsize1 + bufsize2);
-
- uchar1 = (UChar *) buf;
- uchar2 = (UChar *) (buf + bufsize1);
-
- ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
- ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
-
- result = ucol_strcoll(locale->info.icu.ucol,
- uchar1, ulen1,
- uchar2, ulen2);
-
- if (buf != sbuf)
- pfree(buf);
-
- return result;
-}
-
-/*
- * strncoll_icu
- *
- * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given
- * database encoding. An argument length of -1 means the string is
- * NUL-terminated.
- */
-static int
-strncoll_icu(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
- pg_locale_t locale)
-{
- int result;
-
- Assert(locale->provider == COLLPROVIDER_ICU);
-
-#ifdef HAVE_UCOL_STRCOLLUTF8
- if (GetDatabaseEncoding() == PG_UTF8)
- {
- UErrorCode status;
-
- status = U_ZERO_ERROR;
- result = ucol_strcollUTF8(locale->info.icu.ucol,
- arg1, len1,
- arg2, len2,
- &status);
- if (U_FAILURE(status))
- ereport(ERROR,
- (errmsg("collation failed: %s", u_errorName(status))));
- }
- else
-#endif
- {
- result = strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale);
- }
-
- return result;
-}
-
-#endif /* USE_ICU */
-
-/*
- * pg_strcoll
- *
- * Like pg_strncoll for NUL-terminated input strings.
- */
-int
-pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
-{
- int result;
-
- if (locale->provider == COLLPROVIDER_LIBC)
- result = strncoll_libc(arg1, -1, arg2, -1, locale);
-#ifdef USE_ICU
- else if (locale->provider == COLLPROVIDER_ICU)
- result = strncoll_icu(arg1, -1, arg2, -1, locale);
-#endif
- else
- /* shouldn't happen */
- PGLOCALE_SUPPORT_ERROR(locale->provider);
-
- return result;
-}
-
-/*
- * pg_strncoll
- *
- * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll_l() or wcscoll_l() as
- * appropriate for the given locale, platform, and database encoding. If the
- * locale is not specified, use the database collation.
- *
- * The input strings must be encoded in the database encoding. If an input
- * string is NUL-terminated, its length may be specified as -1.
- *
- * The caller is responsible for breaking ties if the collation is
- * deterministic; this maintains consistency with pg_strnxfrm(), which cannot
- * easily account for deterministic collations.
- */
-int
-pg_strncoll(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
- pg_locale_t locale)
-{
- int result;
-
- if (locale->provider == COLLPROVIDER_LIBC)
- result = strncoll_libc(arg1, len1, arg2, len2, locale);
-#ifdef USE_ICU
- else if (locale->provider == COLLPROVIDER_ICU)
- result = strncoll_icu(arg1, len1, arg2, len2, locale);
-#endif
- else
- /* shouldn't happen */
- PGLOCALE_SUPPORT_ERROR(locale->provider);
-
- return result;
-}
-
-
-static size_t
-strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
- pg_locale_t locale)
-{
- char sbuf[TEXTBUFLEN];
- char *buf = sbuf;
- size_t bufsize = srclen + 1;
- size_t result;
-
- Assert(locale->provider == COLLPROVIDER_LIBC);
-
- if (srclen == -1)
- return strxfrm_l(dest, src, destsize, locale->info.lt);
-
- if (bufsize > TEXTBUFLEN)
- buf = palloc(bufsize);
-
- /* nul-terminate argument */
- memcpy(buf, src, srclen);
- buf[srclen] = '\0';
-
- result = strxfrm_l(dest, buf, destsize, locale->info.lt);
-
- if (buf != sbuf)
- pfree(buf);
-
- /* if dest is defined, it should be nul-terminated */
- Assert(result >= destsize || dest[result] == '\0');
-
- return result;
-}
-
-#ifdef USE_ICU
-
-/* 'srclen' of -1 means the strings are NUL-terminated */
-static size_t
-strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
- pg_locale_t locale)
-{
- char sbuf[TEXTBUFLEN];
- char *buf = sbuf;
- UChar *uchar;
- int32_t ulen;
- size_t uchar_bsize;
- Size result_bsize;
-
- Assert(locale->provider == COLLPROVIDER_ICU);
-
- init_icu_converter();
-
- ulen = uchar_length(icu_converter, src, srclen);
-
- uchar_bsize = (ulen + 1) * sizeof(UChar);
-
- if (uchar_bsize > TEXTBUFLEN)
- buf = palloc(uchar_bsize);
-
- uchar = (UChar *) buf;
-
- ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
-
- result_bsize = ucol_getSortKey(locale->info.icu.ucol,
- uchar, ulen,
- (uint8_t *) dest, destsize);
-
- /*
- * ucol_getSortKey() counts the nul-terminator in the result length, but
- * this function should not.
- */
- Assert(result_bsize > 0);
- result_bsize--;
-
- if (buf != sbuf)
- pfree(buf);
-
- /* if dest is defined, it should be nul-terminated */
- Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
-
- return result_bsize;
-}
-
-/* 'srclen' of -1 means the strings are NUL-terminated */
-static size_t
-strnxfrm_prefix_icu_no_utf8(char *dest,size_t destsize,
- const char *src, ssize_t srclen,
- pg_locale_t locale)
-{
- char sbuf[TEXTBUFLEN];
- char *buf = sbuf;
- UCharIterator iter;
- uint32_t state[2];
- UErrorCode status;
- int32_t ulen = -1;
- UChar *uchar = NULL;
- size_t uchar_bsize;
- Size result_bsize;
-
- Assert(locale->provider == COLLPROVIDER_ICU);
- Assert(GetDatabaseEncoding() != PG_UTF8);
-
- init_icu_converter();
-
- ulen = uchar_length(icu_converter, src, srclen);
-
- uchar_bsize = (ulen + 1) * sizeof(UChar);
-
- if (uchar_bsize > TEXTBUFLEN)
- buf = palloc(uchar_bsize);
-
- uchar = (UChar *) buf;
-
- ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
-
- uiter_setString(&iter, uchar, ulen);
- state[0] = state[1] = 0; /* won't need that again */
- status = U_ZERO_ERROR;
- result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
- &iter,
- state,
- (uint8_t *) dest,
- destsize,
- &status);
- if (U_FAILURE(status))
- ereport(ERROR,
- (errmsg("sort key generation failed: %s",
- u_errorName(status))));
-
- return result_bsize;
-}
-
-/* 'srclen' of -1 means the strings are NUL-terminated */
-static size_t
-strnxfrm_prefix_icu(char *dest, size_t destsize,
- const char *src, ssize_t srclen,
- pg_locale_t locale)
-{
- size_t result;
-
- Assert(locale->provider == COLLPROVIDER_ICU);
-
- if (GetDatabaseEncoding() == PG_UTF8)
- {
- UCharIterator iter;
- uint32_t state[2];
- UErrorCode status;
-
- uiter_setUTF8(&iter, src, srclen);
- state[0] = state[1] = 0; /* won't need that again */
- status = U_ZERO_ERROR;
- result = ucol_nextSortKeyPart(locale->info.icu.ucol,
- &iter,
- state,
- (uint8_t *) dest,
- destsize,
- &status);
- if (U_FAILURE(status))
- ereport(ERROR,
- (errmsg("sort key generation failed: %s",
- u_errorName(status))));
- }
- else
- result = strnxfrm_prefix_icu_no_utf8(dest, destsize, src, srclen,
- locale);
-
- return result;
-}
-
-#endif
-
-/*
- * Return true if the collation provider supports pg_strxfrm() and
- * pg_strnxfrm(); otherwise false.
- *
- * Unfortunately, it seems that strxfrm() for non-C collations is broken on
- * many common platforms; testing of multiple versions of glibc reveals that,
- * for many locales, strcoll() and strxfrm() do not return consistent
- * results. While no other libc other than Cygwin has so far been shown to
- * have a problem, we take the conservative course of action for right now and
- * disable this categorically. (Users who are certain this isn't a problem on
- * their system can define TRUST_STRXFRM.)
+ * Unfortunately, it seems that strxfrm() for non-C collations is broken on
+ * many common platforms; testing of multiple versions of glibc reveals that,
+ * for many locales, strcoll() and strxfrm() do not return consistent
+ * results. While no other libc other than Cygwin has so far been shown to
+ * have a problem, we take the conservative course of action for right now and
+ * disable this categorically. (Users who are certain this isn't a problem on
+ * their system can define TRUST_STRXFRM.)
*
* No similar problem is known for the ICU provider.
*/
@@ -2392,19 +1687,7 @@ pg_strxfrm_enabled(pg_locale_t locale)
size_t
pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
{
- size_t result = 0; /* keep compiler quiet */
-
- if (locale->provider == COLLPROVIDER_LIBC)
- result = strnxfrm_libc(dest, destsize, src, -1, locale);
-#ifdef USE_ICU
- else if (locale->provider == COLLPROVIDER_ICU)
- result = strnxfrm_icu(dest, destsize, src, -1, locale);
-#endif
- else
- /* shouldn't happen */
- PGLOCALE_SUPPORT_ERROR(locale->provider);
-
- return result;
+ return locale->collate->strnxfrm(dest, destsize, src, -1, locale);
}
/*
@@ -2430,19 +1713,7 @@ size_t
pg_strnxfrm(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
- size_t result = 0; /* keep compiler quiet */
-
- if (locale->provider == COLLPROVIDER_LIBC)
- result = strnxfrm_libc(dest, src, srclen, destsize, locale);
-#ifdef USE_ICU
- else if (locale->provider == COLLPROVIDER_ICU)
- result = strnxfrm_icu(dest, src, srclen, destsize, locale);
-#endif
- else
- /* shouldn't happen */
- PGLOCALE_SUPPORT_ERROR(locale->provider);
-
- return result;
+ return locale->collate->strnxfrm(dest, destsize, src, srclen, locale);
}
/*
@@ -2472,7 +1743,7 @@ size_t
pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
pg_locale_t locale)
{
- return pg_strnxfrm_prefix(dest, destsize, src, -1, locale);
+ return locale->collate->strnxfrm_prefix(dest, destsize, src, -1, locale);
}
/*
@@ -2497,16 +1768,9 @@ size_t
pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
ssize_t srclen, pg_locale_t locale)
{
- size_t result = 0; /* keep compiler quiet */
-
-#ifdef USE_ICU
- if (locale->provider == COLLPROVIDER_ICU)
- result = strnxfrm_prefix_icu(dest, src, -1, destsize, locale);
- else
-#endif
- PGLOCALE_SUPPORT_ERROR(locale->provider);
-
- return result;
+ return locale->collate->strnxfrm_prefix(dest, destsize,
+ src, srclen,
+ locale);
}
/*
@@ -2561,356 +1825,6 @@ builtin_validate_locale(int encoding, const char *locale)
return canonical_name;
}
-
-#ifdef USE_ICU
-
-/*
- * Wrapper around ucol_open() to handle API differences for older ICU
- * versions.
- *
- * Ensure that no path leaks a UCollator.
- */
-static UCollator *
-pg_ucol_open(const char *loc_str)
-{
- UCollator *collator;
- UErrorCode status;
- const char *orig_str = loc_str;
- char *fixed_str = NULL;
-
- /*
- * Must never open default collator, because it depends on the environment
- * and may change at any time. Should not happen, but check here to catch
- * bugs that might be hard to catch otherwise.
- *
- * NB: the default collator is not the same as the collator for the root
- * locale. The root locale may be specified as the empty string, "und", or
- * "root". The default collator is opened by passing NULL to ucol_open().
- */
- if (loc_str == NULL)
- elog(ERROR, "opening default collator is not supported");
-
- /*
- * In ICU versions 54 and earlier, "und" is not a recognized spelling of
- * the root locale. If the first component of the locale is "und", replace
- * with "root" before opening.
- */
- if (U_ICU_VERSION_MAJOR_NUM < 55)
- {
- char lang[ULOC_LANG_CAPACITY];
-
- status = U_ZERO_ERROR;
- uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
- if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
- {
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("could not get language from locale \"%s\": %s",
- loc_str, u_errorName(status))));
- }
-
- if (strcmp(lang, "und") == 0)
- {
- const char *remainder = loc_str + strlen("und");
-
- fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
- strcpy(fixed_str, "root");
- strcat(fixed_str, remainder);
-
- loc_str = fixed_str;
- }
- }
-
- status = U_ZERO_ERROR;
- collator = ucol_open(loc_str, &status);
- if (U_FAILURE(status))
- ereport(ERROR,
- /* use original string for error report */
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("could not open collator for locale \"%s\": %s",
- orig_str, u_errorName(status))));
-
- if (U_ICU_VERSION_MAJOR_NUM < 54)
- {
- status = U_ZERO_ERROR;
- icu_set_collation_attributes(collator, loc_str, &status);
-
- /*
- * Pretend the error came from ucol_open(), for consistent error
- * message across ICU versions.
- */
- if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
- {
- ucol_close(collator);
- ereport(ERROR,
- (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- errmsg("could not open collator for locale \"%s\": %s",
- orig_str, u_errorName(status))));
- }
- }
-
- if (fixed_str != NULL)
- pfree(fixed_str);
-
- return collator;
-}
-
-static void
-init_icu_converter(void)
-{
- const char *icu_encoding_name;
- UErrorCode status;
- UConverter *conv;
-
- if (icu_converter)
- return; /* already done */
-
- icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
- if (!icu_encoding_name)
- ereport(ERROR,
- (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
- errmsg("encoding \"%s\" not supported by ICU",
- pg_encoding_to_char(GetDatabaseEncoding()))));
-
- status = U_ZERO_ERROR;
- conv = ucnv_open(icu_encoding_name, &status);
- if (U_FAILURE(status))
- ereport(ERROR,
- (errmsg("could not open ICU converter for encoding \"%s\": %s",
- icu_encoding_name, u_errorName(status))));
-
- icu_converter = conv;
-}
-
-/*
- * Find length, in UChars, of given string if converted to UChar string.
- *
- * A length of -1 indicates that the input string is NUL-terminated.
- */
-static size_t
-uchar_length(UConverter *converter, const char *str, int32_t len)
-{
- UErrorCode status = U_ZERO_ERROR;
- int32_t ulen;
-
- ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
- if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
- ereport(ERROR,
- (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
- return ulen;
-}
-
-/*
- * Convert the given source string into a UChar string, stored in dest, and
- * return the length (in UChars).
- *
- * A srclen of -1 indicates that the input string is NUL-terminated.
- */
-static int32_t
-uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
- const char *src, int32_t srclen)
-{
- UErrorCode status = U_ZERO_ERROR;
- int32_t ulen;
-
- status = U_ZERO_ERROR;
- ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
- if (U_FAILURE(status))
- ereport(ERROR,
- (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
- return ulen;
-}
-
-/*
- * Convert a string in the database encoding into a string of UChars.
- *
- * The source string at buff is of length nbytes
- * (it needn't be nul-terminated)
- *
- * *buff_uchar receives a pointer to the palloc'd result string, and
- * the function's result is the number of UChars generated.
- *
- * The result string is nul-terminated, though most callers rely on the
- * result length instead.
- */
-int32_t
-icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
-{
- int32_t len_uchar;
-
- init_icu_converter();
-
- len_uchar = uchar_length(icu_converter, buff, nbytes);
-
- *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
- len_uchar = uchar_convert(icu_converter,
- *buff_uchar, len_uchar + 1, buff, nbytes);
-
- return len_uchar;
-}
-
-/*
- * Convert a string of UChars into the database encoding.
- *
- * The source string at buff_uchar is of length len_uchar
- * (it needn't be nul-terminated)
- *
- * *result receives a pointer to the palloc'd result string, and the
- * function's result is the number of bytes generated (not counting nul).
- *
- * The result string is nul-terminated.
- */
-int32_t
-icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar)
-{
- UErrorCode status;
- int32_t len_result;
-
- init_icu_converter();
-
- status = U_ZERO_ERROR;
- len_result = ucnv_fromUChars(icu_converter, NULL, 0,
- buff_uchar, len_uchar, &status);
- if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
- ereport(ERROR,
- (errmsg("%s failed: %s", "ucnv_fromUChars",
- u_errorName(status))));
-
- *result = palloc(len_result + 1);
-
- status = U_ZERO_ERROR;
- len_result = ucnv_fromUChars(icu_converter, *result, len_result + 1,
- buff_uchar, len_uchar, &status);
- if (U_FAILURE(status) ||
- status == U_STRING_NOT_TERMINATED_WARNING)
- ereport(ERROR,
- (errmsg("%s failed: %s", "ucnv_fromUChars",
- u_errorName(status))));
-
- return len_result;
-}
-
-/*
- * Parse collation attributes from the given locale string and apply them to
- * the open collator.
- *
- * First, the locale string is canonicalized to an ICU format locale ID such
- * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
- * the key-value arguments.
- *
- * Starting with ICU version 54, the attributes are processed automatically by
- * ucol_open(), so this is only necessary for emulating this behavior on older
- * versions.
- */
-pg_attribute_unused()
-static void
-icu_set_collation_attributes(UCollator *collator, const char *loc,
- UErrorCode *status)
-{
- int32_t len;
- char *icu_locale_id;
- char *lower_str;
- char *str;
- char *token;
-
- /*
- * The input locale may be a BCP 47 language tag, e.g.
- * "und-u-kc-ks-level1", which expresses the same attributes in a
- * different form. It will be converted to the equivalent ICU format
- * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
- * uloc_canonicalize().
- */
- *status = U_ZERO_ERROR;
- len = uloc_canonicalize(loc, NULL, 0, status);
- icu_locale_id = palloc(len + 1);
- *status = U_ZERO_ERROR;
- len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
- if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
- return;
-
- lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
-
- pfree(icu_locale_id);
-
- str = strchr(lower_str, '@');
- if (!str)
- return;
- str++;
-
- while ((token = strsep(&str, ";")))
- {
- char *e = strchr(token, '=');
-
- if (e)
- {
- char *name;
- char *value;
- UColAttribute uattr;
- UColAttributeValue uvalue;
-
- *status = U_ZERO_ERROR;
-
- *e = '\0';
- name = token;
- value = e + 1;
-
- /*
- * See attribute name and value lists in ICU i18n/coll.cpp
- */
- if (strcmp(name, "colstrength") == 0)
- uattr = UCOL_STRENGTH;
- else if (strcmp(name, "colbackwards") == 0)
- uattr = UCOL_FRENCH_COLLATION;
- else if (strcmp(name, "colcaselevel") == 0)
- uattr = UCOL_CASE_LEVEL;
- else if (strcmp(name, "colcasefirst") == 0)
- uattr = UCOL_CASE_FIRST;
- else if (strcmp(name, "colalternate") == 0)
- uattr = UCOL_ALTERNATE_HANDLING;
- else if (strcmp(name, "colnormalization") == 0)
- uattr = UCOL_NORMALIZATION_MODE;
- else if (strcmp(name, "colnumeric") == 0)
- uattr = UCOL_NUMERIC_COLLATION;
- else
- /* ignore if unknown */
- continue;
-
- if (strcmp(value, "primary") == 0)
- uvalue = UCOL_PRIMARY;
- else if (strcmp(value, "secondary") == 0)
- uvalue = UCOL_SECONDARY;
- else if (strcmp(value, "tertiary") == 0)
- uvalue = UCOL_TERTIARY;
- else if (strcmp(value, "quaternary") == 0)
- uvalue = UCOL_QUATERNARY;
- else if (strcmp(value, "identical") == 0)
- uvalue = UCOL_IDENTICAL;
- else if (strcmp(value, "no") == 0)
- uvalue = UCOL_OFF;
- else if (strcmp(value, "yes") == 0)
- uvalue = UCOL_ON;
- else if (strcmp(value, "shifted") == 0)
- uvalue = UCOL_SHIFTED;
- else if (strcmp(value, "non-ignorable") == 0)
- uvalue = UCOL_NON_IGNORABLE;
- else if (strcmp(value, "lower") == 0)
- uvalue = UCOL_LOWER_FIRST;
- else if (strcmp(value, "upper") == 0)
- uvalue = UCOL_UPPER_FIRST;
- else
- {
- *status = U_ILLEGAL_ARGUMENT_ERROR;
- break;
- }
-
- ucol_setAttribute(collator, uattr, uvalue, status);
- }
- }
-
- pfree(lower_str);
-}
-#endif
-
/*
* Return the BCP47 language tag representation of the requested locale.
*
@@ -3049,6 +1963,16 @@ icu_validate_locale(const char *loc_str)
#endif /* not USE_ICU */
}
+/*
+ * char_props
+ *
+ * Look up character properties of 'wc' by calling the char_props method of
+ * the locale's ctype method table, passing 'mask' through unchanged, and
+ * return that method's result.
+ *
+ * TODO: add caching?
+ */
+int
+char_props(pg_wchar wc, int mask, pg_locale_t locale)
+{
+ return locale->ctype->char_props(wc, mask, locale);
+}
+
/*
* These functions convert from/to libc's wchar_t, *not* pg_wchar_t.
* Therefore we keep them here rather than with the mbutils code.
diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c
new file mode 100644
index 00000000000..a9e8b4b642b
--- /dev/null
+++ b/src/backend/utils/adt/pg_locale_icu.c
@@ -0,0 +1,873 @@
+/*-----------------------------------------------------------------------
+ *
+ * PostgreSQL locale utilities for ICU
+ *
+ * Portions Copyright (c) 2002-2024, PostgreSQL Global Development Group
+ *
+ * src/backend/utils/adt/pg_locale_icu.c
+ *
+ *-----------------------------------------------------------------------
+ */
+
+
+#include "postgres.h"
+
+#ifdef USE_ICU
+
+#include <unicode/ucnv.h>
+#include <unicode/ustring.h>
+
+#include "access/htup_details.h"
+#include "catalog/pg_collation.h"
+#include "catalog/pg_database.h"
+#include "utils/builtins.h"
+#include "utils/formatting.h"
+#include "utils/memutils.h"
+#include "utils/pg_locale.h"
+#include "utils/resowner.h"
+#include "utils/syscache.h"
+
+/*
+ * This should be large enough that most strings will fit, but small enough
+ * that we feel comfortable putting it on the stack
+ */
+#define TEXTBUFLEN 1024
+
+extern pg_locale_t icu_dat_create_locale(HeapTuple dattuple);
+extern pg_locale_t icu_coll_create_locale(MemoryContext context,
+ ResourceOwner resowner,
+ HeapTuple colltuple);
+extern UCollator *pg_ucol_open(const char *loc_str);
+
+
+static UCollator * make_icu_collator(const char *iculocstr,
+ const char *icurules);
+
+static int strncoll_icu(const char *arg1, ssize_t len1,
+ const char *arg2, ssize_t len2,
+ pg_locale_t locale);
+static size_t strnxfrm_icu(char *dest, size_t destsize,
+ const char *src, ssize_t srclen,
+ pg_locale_t locale);
+static size_t strnxfrm_prefix_icu(char *dest, size_t destsize,
+ const char *src, ssize_t srclen,
+ pg_locale_t locale);
+
+static void ResourceOwnerRememberUCollator(ResourceOwner owner,
+ UCollator *collator);
+static void ResOwnerReleaseUCollator(Datum val);
+
+static void init_icu_converter(void);
+static size_t uchar_length(UConverter *converter,
+ const char *str, int32_t len);
+static int32_t uchar_convert(UConverter *converter,
+ UChar *dest, int32_t destlen,
+ const char *src, int32_t srclen);
+static void icu_set_collation_attributes(UCollator *collator, const char *loc,
+ UErrorCode *status);
+
+/*
+ * Converter object for converting between ICU's UChar strings and C strings
+ * in database encoding. Since the database encoding doesn't change, we only
+ * need one of these per session.
+ */
+static UConverter *icu_converter = NULL;
+
+/*
+ * ResourceOwner descriptor used when remembering UCollator objects (see
+ * ResourceOwnerRememberUCollator()). Releasing such a resource calls
+ * ResOwnerReleaseUCollator().
+ */
+static const ResourceOwnerDesc UCollatorResourceKind =
+{
+ .name = "UCollator reference",
+ .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
+ .release_priority = RELEASE_PRIO_LAST,
+ .ReleaseResource = ResOwnerReleaseUCollator,
+ .DebugPrint = NULL /* the default message is fine */
+};
+
+
+/*
+ * char_props_icu
+ *
+ * For each PG_IS* flag set in 'mask', test 'wc' with the corresponding ICU
+ * u_is*() function and return the subset of those flags for which the test
+ * succeeded. Flags that are not set in 'mask' are not tested at all.
+ *
+ * The 'locale' argument is not used by this implementation.
+ */
+static int
+char_props_icu(pg_wchar wc, int mask, pg_locale_t locale)
+{
+ int result = 0;
+
+ if ((mask & PG_ISDIGIT) && u_isdigit(wc))
+ result |= PG_ISDIGIT;
+ if ((mask & PG_ISALPHA) && u_isalpha(wc))
+ result |= PG_ISALPHA;
+ if ((mask & PG_ISUPPER) && u_isupper(wc))
+ result |= PG_ISUPPER;
+ if ((mask & PG_ISLOWER) && u_islower(wc))
+ result |= PG_ISLOWER;
+ if ((mask & PG_ISGRAPH) && u_isgraph(wc))
+ result |= PG_ISGRAPH;
+ if ((mask & PG_ISPRINT) && u_isprint(wc))
+ result |= PG_ISPRINT;
+ if ((mask & PG_ISPUNCT) && u_ispunct(wc))
+ result |= PG_ISPUNCT;
+ if ((mask & PG_ISSPACE) && u_isspace(wc))
+ result |= PG_ISSPACE;
+
+ return result;
+}
+
+/*
+ * toupper_icu
+ *
+ * Return the result of ICU's u_toupper() for 'wc'. The 'locale' argument is
+ * not used.
+ */
+static pg_wchar
+toupper_icu(pg_wchar wc, pg_locale_t locale)
+{
+ return u_toupper(wc);
+}
+
+/*
+ * tolower_icu
+ *
+ * Return the result of ICU's u_tolower() for 'wc'. The 'locale' argument is
+ * not used.
+ */
+static pg_wchar
+tolower_icu(pg_wchar wc, pg_locale_t locale)
+{
+ return u_tolower(wc);
+}
+
+/*
+ * Collation method table for ICU locales. icu_dat_create_locale() and
+ * icu_coll_create_locale() store its address in the 'collate' field of the
+ * pg_locale_t they build.
+ */
+struct collate_methods icu_collate_methods = {
+ .strncoll = strncoll_icu,
+ .strnxfrm = strnxfrm_icu,
+ .strnxfrm_prefix = strnxfrm_prefix_icu,
+};
+
+/*
+ * Character-classification and case-mapping method table for ICU locales.
+ * icu_dat_create_locale() and icu_coll_create_locale() store its address in
+ * the 'ctype' field of the pg_locale_t they build.
+ */
+struct ctype_methods icu_ctype_methods = {
+ .char_props = char_props_icu,
+ .wc_toupper = toupper_icu,
+ .wc_tolower = tolower_icu,
+};
+
+/*
+ * Remember 'collator' in 'owner' as a resource of kind
+ * UCollatorResourceKind.
+ *
+ * The caller in this file (icu_coll_create_locale()) calls
+ * ResourceOwnerEnlarge() on the owner before creating the collator.
+ */
+static void
+ResourceOwnerRememberUCollator(ResourceOwner owner, UCollator *collator)
+{
+ ResourceOwnerRemember(owner, PointerGetDatum(collator),
+ &UCollatorResourceKind);
+}
+
+/*
+ * ReleaseResource callback of UCollatorResourceKind: interpret 'val' as a
+ * UCollator pointer and close it with ucol_close().
+ */
+static void
+ResOwnerReleaseUCollator(Datum val)
+{
+ UCollator *collator = (UCollator *) DatumGetPointer(val);
+ ucol_close(collator);
+}
+
+/*
+ * icu_dat_create_locale
+ *
+ * Build a pg_locale_t from a pg_database tuple. The locale name is read
+ * from datlocale (which must not be null) and the optional rules from
+ * daticurules. The result struct and its copy of the locale name are
+ * allocated in TopMemoryContext, the ICU method tables are installed, and
+ * the collator is opened with make_icu_collator().
+ *
+ * The result is always marked deterministic, and neither collate_is_c nor
+ * ctype_is_c is set.
+ */
+pg_locale_t
+icu_dat_create_locale(HeapTuple dattuple)
+{
+ Form_pg_database dbform;
+ Datum datum;
+ char *datlocale;
+ char *icurules;
+ bool isnull;
+ pg_locale_t result;
+
+ dbform = (Form_pg_database) GETSTRUCT(dattuple);
+
+ datum = SysCacheGetAttrNotNull(DATABASEOID, dattuple, Anum_pg_database_datlocale);
+ datlocale = TextDatumGetCString(datum);
+
+ /* daticurules may be null, in which case no custom rules are applied */
+ datum = SysCacheGetAttr(DATABASEOID, dattuple, Anum_pg_database_daticurules, &isnull);
+ if (!isnull)
+ icurules = TextDatumGetCString(datum);
+ else
+ icurules = NULL;
+
+ result = MemoryContextAllocZero(TopMemoryContext,
+ sizeof(struct pg_locale_struct));
+
+ result->info.icu.locale = MemoryContextStrdup(TopMemoryContext,
+ datlocale);
+ result->provider = dbform->datlocprovider;
+ result->deterministic = true;
+ result->collate_is_c = false;
+ result->ctype_is_c = false;
+ result->collate = &icu_collate_methods;
+ result->ctype = &icu_ctype_methods;
+ /*
+ * NOTE(review): the TopMemoryContext allocations above happen before
+ * make_icu_collator() is called; if that call raises an error, nothing
+ * here releases them. Confirm this is acceptable for the callers.
+ */
+ result->info.icu.ucol = make_icu_collator(datlocale, icurules);
+
+ return result;
+}
+
+/*
+ * icu_coll_create_locale
+ *
+ * Build a pg_locale_t from a pg_collation tuple whose provider is ICU. The
+ * locale name is read from colllocale (which must not be null) and the
+ * optional rules from collicurules. The collator is opened first and
+ * remembered in 'resowner'; the result struct and its copy of the locale
+ * name are then allocated in 'context'.
+ *
+ * The deterministic flag is taken from the catalog tuple.
+ */
+pg_locale_t
+icu_coll_create_locale(MemoryContext context, ResourceOwner resowner,
+ HeapTuple colltuple)
+{
+ Form_pg_collation collform;
+ Datum datum;
+ bool isnull;
+ const char *iculocstr;
+ const char *icurules;
+ UCollator *collator;
+ pg_locale_t result;
+
+ collform = (Form_pg_collation) GETSTRUCT(colltuple);
+
+ Assert(collform->collprovider == COLLPROVIDER_ICU);
+
+ datum = SysCacheGetAttrNotNull(COLLOID, colltuple, Anum_pg_collation_colllocale);
+ iculocstr = TextDatumGetCString(datum);
+
+ /* collicurules may be null, in which case no custom rules are applied */
+ datum = SysCacheGetAttr(COLLOID, colltuple, Anum_pg_collation_collicurules, &isnull);
+ if (!isnull)
+ icurules = TextDatumGetCString(datum);
+ else
+ icurules = NULL;
+
+ /*
+ * Enlarge the resource owner before opening the collator, presumably so
+ * that remembering it afterwards cannot fail and leak it -- TODO confirm
+ * against the ResourceOwner API contract.
+ */
+ ResourceOwnerEnlarge(resowner);
+ collator = make_icu_collator(iculocstr, icurules);
+ ResourceOwnerRememberUCollator(resowner, collator);
+
+ result = MemoryContextAllocZero(context,
+ sizeof(struct pg_locale_struct));
+
+ result->info.icu.locale = MemoryContextStrdup(context, iculocstr);
+ result->provider = collform->collprovider;
+ result->deterministic = collform->collisdeterministic;
+ result->collate_is_c = false;
+ result->ctype_is_c = false;
+ result->collate = &icu_collate_methods;
+ result->ctype = &icu_ctype_methods;
+ result->info.icu.ucol = collator;
+
+ return result;
+}
+
+/*
+ * Create a UCollator with the given locale string and rules.
+ *
+ * If icurules is NULL, this simply opens the collator for iculocstr.
+ * Otherwise the rules of the standard collator for iculocstr are
+ * concatenated with icurules, and a new collator is opened from the
+ * combined rule string with ucol_openRules(); the standard collator is
+ * closed again before that.
+ *
+ * Ensure that no path leaks a UCollator.
+ */
+static UCollator *
+make_icu_collator(const char *iculocstr, const char *icurules)
+{
+ if (!icurules)
+ {
+ /* simple case without rules */
+ return pg_ucol_open(iculocstr);
+ }
+ else
+ {
+ UCollator *collator_std_rules;
+ UCollator *collator_all_rules;
+ const UChar *std_rules;
+ UChar *my_rules;
+ UChar *all_rules;
+ int32_t length;
+ int32_t total;
+ UErrorCode status;
+
+ /*
+ * If rules are specified, we extract the rules of the standard
+ * collation, add our own rules, and make a new collator with the
+ * combined rules.
+ */
+ icu_to_uchar(&my_rules, icurules, strlen(icurules));
+
+ collator_std_rules = pg_ucol_open(iculocstr);
+
+ /*
+ * 'length' is filled in by ucol_getRules() but not used afterwards;
+ * the lengths are recomputed with u_strlen() below.
+ */
+ std_rules = ucol_getRules(collator_std_rules, &length);
+
+ /* room for both rule strings plus the terminating NUL */
+ total = u_strlen(std_rules) + u_strlen(my_rules) + 1;
+
+ /* avoid leaking collator on OOM */
+ all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM);
+ if (!all_rules)
+ {
+ ucol_close(collator_std_rules);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ u_strcpy(all_rules, std_rules);
+ u_strcat(all_rules, my_rules);
+
+ /* std_rules has been copied, so the standard collator can go */
+ ucol_close(collator_std_rules);
+
+ status = U_ZERO_ERROR;
+ collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules),
+ UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,
+ NULL, &status);
+ if (U_FAILURE(status))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s",
+ iculocstr, icurules, u_errorName(status))));
+ }
+
+ return collator_all_rules;
+ }
+}
+
+
+/*
+ * strncoll_icu_no_utf8
+ *
+ * Convert the arguments from the database encoding to UChar strings, then
+ * call ucol_strcoll(). An argument length of -1 means that the string is
+ * NUL-terminated.
+ *
+ * Both converted strings share one buffer: the stack buffer if they fit in
+ * TEXTBUFLEN bytes, otherwise a palloc'd buffer that is freed before
+ * returning.
+ *
+ * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(),
+ * caller should call that instead.
+ */
+static int
+strncoll_icu_no_utf8(const char *arg1, ssize_t len1,
+ const char *arg2, ssize_t len2, pg_locale_t locale)
+{
+ char sbuf[TEXTBUFLEN];
+ char *buf = sbuf;
+ int32_t ulen1;
+ int32_t ulen2;
+ size_t bufsize1;
+ size_t bufsize2;
+ UChar *uchar1,
+ *uchar2;
+ int result;
+
+ Assert(locale->provider == COLLPROVIDER_ICU);
+#ifdef HAVE_UCOL_STRCOLLUTF8
+ Assert(GetDatabaseEncoding() != PG_UTF8);
+#endif
+
+ init_icu_converter();
+
+ /* first pass: find out how many UChars each argument needs */
+ ulen1 = uchar_length(icu_converter, arg1, len1);
+ ulen2 = uchar_length(icu_converter, arg2, len2);
+
+ /* one extra UChar per string for the terminator */
+ bufsize1 = (ulen1 + 1) * sizeof(UChar);
+ bufsize2 = (ulen2 + 1) * sizeof(UChar);
+
+ if (bufsize1 + bufsize2 > TEXTBUFLEN)
+ buf = palloc(bufsize1 + bufsize2);
+
+ /* the second string is stored directly after the first */
+ uchar1 = (UChar *) buf;
+ uchar2 = (UChar *) (buf + bufsize1);
+
+ ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1);
+ ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2);
+
+ result = ucol_strcoll(locale->info.icu.ucol,
+ uchar1, ulen1,
+ uchar2, ulen2);
+
+ if (buf != sbuf)
+ pfree(buf);
+
+ return result;
+}
+
+/*
+ * strncoll_icu
+ *
+ * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given
+ * database encoding. An argument length of -1 means the string is
+ * NUL-terminated.
+ *
+ * ucol_strcollUTF8() is used only when HAVE_UCOL_STRCOLLUTF8 is defined and
+ * the database encoding is UTF-8; every other case goes through
+ * strncoll_icu_no_utf8(). Returns the comparison result of whichever
+ * function was called; raises an error if ucol_strcollUTF8() reports a
+ * failure.
+ */
+static int
+strncoll_icu(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
+ pg_locale_t locale)
+{
+ int result;
+
+ Assert(locale->provider == COLLPROVIDER_ICU);
+
+#ifdef HAVE_UCOL_STRCOLLUTF8
+ if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ UErrorCode status;
+
+ status = U_ZERO_ERROR;
+ result = ucol_strcollUTF8(locale->info.icu.ucol,
+ arg1, len1,
+ arg2, len2,
+ &status);
+ if (U_FAILURE(status))
+ ereport(ERROR,
+ (errmsg("collation failed: %s", u_errorName(status))));
+ }
+ else
+#endif
+ {
+ result = strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale);
+ }
+
+ return result;
+}
+
+
+/*
+ * strnxfrm_icu
+ *
+ * Convert 'src' from the database encoding to UChars and generate its sort
+ * key with ucol_getSortKey() into 'dest' (capacity 'destsize'). Returns the
+ * length reported by ucol_getSortKey() minus one, i.e. not counting the
+ * nul-terminator.
+ *
+ * 'srclen' of -1 means the string is NUL-terminated.
+ */
+static size_t
+strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
+ pg_locale_t locale)
+{
+ char sbuf[TEXTBUFLEN];
+ char *buf = sbuf;
+ UChar *uchar;
+ int32_t ulen;
+ size_t uchar_bsize;
+ Size result_bsize;
+
+ Assert(locale->provider == COLLPROVIDER_ICU);
+
+ init_icu_converter();
+
+ /* first pass: find out how many UChars the source needs */
+ ulen = uchar_length(icu_converter, src, srclen);
+
+ uchar_bsize = (ulen + 1) * sizeof(UChar);
+
+ /* use the stack buffer unless the converted string does not fit */
+ if (uchar_bsize > TEXTBUFLEN)
+ buf = palloc(uchar_bsize);
+
+ uchar = (UChar *) buf;
+
+ ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
+
+ result_bsize = ucol_getSortKey(locale->info.icu.ucol,
+ uchar, ulen,
+ (uint8_t *) dest, destsize);
+
+ /*
+ * ucol_getSortKey() counts the nul-terminator in the result length, but
+ * this function should not.
+ */
+ Assert(result_bsize > 0);
+ result_bsize--;
+
+ if (buf != sbuf)
+ pfree(buf);
+
+ /* if dest is defined, it should be nul-terminated */
+ Assert(result_bsize >= destsize || dest[result_bsize] == '\0');
+
+ return result_bsize;
+}
+
+/*
+ * strnxfrm_prefix_icu_no_utf8
+ *
+ * Convert 'src' from the database encoding to UChars, then generate the
+ * leading part of its sort key with ucol_nextSortKeyPart() into 'dest'
+ * (capacity 'destsize'). Returns the value reported by
+ * ucol_nextSortKeyPart(); raises an error if ICU reports a failure.
+ *
+ * 'srclen' of -1 means the string is NUL-terminated.
+ *
+ * Must not be called when the database encoding is UTF-8.
+ */
+static size_t
+strnxfrm_prefix_icu_no_utf8(char *dest, size_t destsize,
+							const char *src, ssize_t srclen,
+							pg_locale_t locale)
+{
+	char		sbuf[TEXTBUFLEN];
+	char	   *buf = sbuf;
+	UCharIterator iter;
+	uint32_t	state[2];
+	UErrorCode	status;
+	int32_t		ulen;
+	UChar	   *uchar;
+	size_t		uchar_bsize;
+	Size		result_bsize;
+
+	Assert(locale->provider == COLLPROVIDER_ICU);
+	Assert(GetDatabaseEncoding() != PG_UTF8);
+
+	init_icu_converter();
+
+	/* first pass: find out how many UChars the source needs */
+	ulen = uchar_length(icu_converter, src, srclen);
+
+	uchar_bsize = (ulen + 1) * sizeof(UChar);
+
+	/* use the stack buffer unless the converted string does not fit */
+	if (uchar_bsize > TEXTBUFLEN)
+		buf = palloc(uchar_bsize);
+
+	uchar = (UChar *) buf;
+
+	ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen);
+
+	uiter_setString(&iter, uchar, ulen);
+	state[0] = state[1] = 0;	/* won't need that again */
+	status = U_ZERO_ERROR;
+	result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol,
+										&iter,
+										state,
+										(uint8_t *) dest,
+										destsize,
+										&status);
+	if (U_FAILURE(status))
+		ereport(ERROR,
+				(errmsg("sort key generation failed: %s",
+						u_errorName(status))));
+
+	/*
+	 * Release the conversion buffer if it was palloc'd, as the other
+	 * conversion-based functions in this file do.
+	 */
+	if (buf != sbuf)
+		pfree(buf);
+
+	return result_bsize;
+}
+
+/*
+ * strnxfrm_prefix_icu
+ *
+ * Generate the leading part of the sort key for 'src' into 'dest' (capacity
+ * 'destsize'). When the database encoding is UTF-8, ICU iterates over the
+ * source bytes directly; otherwise the work is delegated to
+ * strnxfrm_prefix_icu_no_utf8(). Returns the value reported by whichever
+ * path was taken.
+ *
+ * 'srclen' of -1 means the string is NUL-terminated.
+ */
+static size_t
+strnxfrm_prefix_icu(char *dest, size_t destsize,
+ const char *src, ssize_t srclen,
+ pg_locale_t locale)
+{
+ size_t result;
+
+ Assert(locale->provider == COLLPROVIDER_ICU);
+
+ if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ UCharIterator iter;
+ uint32_t state[2];
+ UErrorCode status;
+
+ uiter_setUTF8(&iter, src, srclen);
+ state[0] = state[1] = 0; /* won't need that again */
+ status = U_ZERO_ERROR;
+ result = ucol_nextSortKeyPart(locale->info.icu.ucol,
+ &iter,
+ state,
+ (uint8_t *) dest,
+ destsize,
+ &status);
+ if (U_FAILURE(status))
+ ereport(ERROR,
+ (errmsg("sort key generation failed: %s",
+ u_errorName(status))));
+ }
+ else
+ result = strnxfrm_prefix_icu_no_utf8(dest, destsize, src, srclen,
+ locale);
+
+ return result;
+}
+
+/*
+ * Wrapper around ucol_open() to handle API differences for older ICU
+ * versions.
+ *
+ * Raises an error if loc_str is NULL, if the language cannot be extracted
+ * from it (older ICU only), or if the collator cannot be opened. Error
+ * messages about opening the collator always quote the caller's original
+ * string. On success, returns the opened collator.
+ *
+ * Ensure that no path leaks a UCollator.
+ */
+UCollator *
+pg_ucol_open(const char *loc_str)
+{
+ UCollator *collator;
+ UErrorCode status;
+ const char *orig_str = loc_str;
+ char *fixed_str = NULL;
+
+ /*
+ * Must never open default collator, because it depends on the environment
+ * and may change at any time. Should not happen, but check here to catch
+ * bugs that might be hard to catch otherwise.
+ *
+ * NB: the default collator is not the same as the collator for the root
+ * locale. The root locale may be specified as the empty string, "und", or
+ * "root". The default collator is opened by passing NULL to ucol_open().
+ */
+ if (loc_str == NULL)
+ elog(ERROR, "opening default collator is not supported");
+
+ /*
+ * In ICU versions 54 and earlier, "und" is not a recognized spelling of
+ * the root locale. If the first component of the locale is "und", replace
+ * with "root" before opening.
+ */
+ if (U_ICU_VERSION_MAJOR_NUM < 55)
+ {
+ char lang[ULOC_LANG_CAPACITY];
+
+ status = U_ZERO_ERROR;
+ uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
+ if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("could not get language from locale \"%s\": %s",
+ loc_str, u_errorName(status))));
+ }
+
+ if (strcmp(lang, "und") == 0)
+ {
+ /* keep everything after the leading "und" unchanged */
+ const char *remainder = loc_str + strlen("und");
+
+ fixed_str = palloc(strlen("root") + strlen(remainder) + 1);
+ strcpy(fixed_str, "root");
+ strcat(fixed_str, remainder);
+
+ loc_str = fixed_str;
+ }
+ }
+
+ status = U_ZERO_ERROR;
+ collator = ucol_open(loc_str, &status);
+ if (U_FAILURE(status))
+ ereport(ERROR,
+ /* use original string for error report */
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("could not open collator for locale \"%s\": %s",
+ orig_str, u_errorName(status))));
+
+ /*
+ * On ICU versions before 54, apply the collation attributes found in
+ * the locale string by hand.
+ */
+ if (U_ICU_VERSION_MAJOR_NUM < 54)
+ {
+ status = U_ZERO_ERROR;
+ icu_set_collation_attributes(collator, loc_str, &status);
+
+ /*
+ * Pretend the error came from ucol_open(), for consistent error
+ * message across ICU versions.
+ */
+ if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
+ {
+ /* close the collator first so the error path does not leak it */
+ ucol_close(collator);
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("could not open collator for locale \"%s\": %s",
+ orig_str, u_errorName(status))));
+ }
+ }
+
+ if (fixed_str != NULL)
+ pfree(fixed_str);
+
+ return collator;
+}
+
+/*
+ * init_icu_converter
+ *
+ * Open an ICU converter for the database encoding and store it in the
+ * file-level variable icu_converter. If a converter is already stored, do
+ * nothing. Raises an error if the database encoding has no ICU encoding
+ * name or if ucnv_open() fails.
+ */
+static void
+init_icu_converter(void)
+{
+ const char *icu_encoding_name;
+ UErrorCode status;
+ UConverter *conv;
+
+ if (icu_converter)
+ return; /* already done */
+
+ icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding());
+ if (!icu_encoding_name)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("encoding \"%s\" not supported by ICU",
+ pg_encoding_to_char(GetDatabaseEncoding()))));
+
+ status = U_ZERO_ERROR;
+ conv = ucnv_open(icu_encoding_name, &status);
+ if (U_FAILURE(status))
+ ereport(ERROR,
+ (errmsg("could not open ICU converter for encoding \"%s\": %s",
+ icu_encoding_name, u_errorName(status))));
+
+ /* only publish the converter once it has been opened successfully */
+ icu_converter = conv;
+}
+
+/*
+ * Find length, in UChars, of given string if converted to UChar string.
+ *
+ * A length of -1 indicates that the input string is NUL-terminated.
+ *
+ * ucnv_toUChars() is called with no destination buffer, so a
+ * U_BUFFER_OVERFLOW_ERROR status is tolerated; any other failure raises an
+ * error.
+ *
+ * NOTE(review): the function is declared to return size_t, but the value
+ * comes from an int32_t and the callers in this file store it in int32_t
+ * variables.
+ */
+static size_t
+uchar_length(UConverter *converter, const char *str, int32_t len)
+{
+ UErrorCode status = U_ZERO_ERROR;
+ int32_t ulen;
+
+ ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status);
+ if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
+ ereport(ERROR,
+ (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
+ return ulen;
+}
+
+/*
+ * Convert the given source string into a UChar string, stored in dest, and
+ * return the length (in UChars).
+ *
+ * 'destlen' is the capacity of dest in UChars. Any failure status reported
+ * by ucnv_toUChars() raises an error.
+ *
+ * A srclen of -1 indicates that the input string is NUL-terminated.
+ */
+static int32_t
+uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
+			  const char *src, int32_t srclen)
+{
+	UErrorCode	status = U_ZERO_ERROR;
+	int32_t		ulen;
+
+	ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status);
+	if (U_FAILURE(status))
+		ereport(ERROR,
+				(errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status))));
+	return ulen;
+}
+
+/*
+ * Convert a string in the database encoding into a string of UChars.
+ *
+ * The source string at buff is of length nbytes
+ * (it needn't be nul-terminated)
+ *
+ * *buff_uchar receives a pointer to the palloc'd result string, and
+ * the function's result is the number of UChars generated.
+ *
+ * The result string is nul-terminated, though most callers rely on the
+ * result length instead.
+ *
+ * NOTE(review): nbytes is a size_t, but it is passed to uchar_length() and
+ * uchar_convert(), whose length parameters are int32_t.
+ */
+int32_t
+icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
+{
+ int32_t len_uchar;
+
+ init_icu_converter();
+
+ /* first pass: find out how many UChars are needed */
+ len_uchar = uchar_length(icu_converter, buff, nbytes);
+
+ /* one extra UChar for the terminator */
+ *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar));
+ len_uchar = uchar_convert(icu_converter,
+ *buff_uchar, len_uchar + 1, buff, nbytes);
+
+ return len_uchar;
+}
+
+/*
+ * Convert a string of UChars into the database encoding.
+ *
+ * The source string at buff_uchar is of length len_uchar
+ * (it needn't be nul-terminated)
+ *
+ * *result receives a pointer to the palloc'd result string, and the
+ * function's result is the number of bytes generated (not counting nul).
+ *
+ * The result string is nul-terminated.
+ */
+int32_t
+icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar)
+{
+ UErrorCode status;
+ int32_t len_result;
+
+ init_icu_converter();
+
+ /*
+ * First pass with no destination buffer, to learn the required length;
+ * a buffer-overflow status is tolerated here.
+ */
+ status = U_ZERO_ERROR;
+ len_result = ucnv_fromUChars(icu_converter, NULL, 0,
+ buff_uchar, len_uchar, &status);
+ if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR)
+ ereport(ERROR,
+ (errmsg("%s failed: %s", "ucnv_fromUChars",
+ u_errorName(status))));
+
+ /* one extra byte for the terminating nul */
+ *result = palloc(len_result + 1);
+
+ /*
+ * Second pass does the actual conversion; an unterminated result is
+ * treated as a failure too.
+ */
+ status = U_ZERO_ERROR;
+ len_result = ucnv_fromUChars(icu_converter, *result, len_result + 1,
+ buff_uchar, len_uchar, &status);
+ if (U_FAILURE(status) ||
+ status == U_STRING_NOT_TERMINATED_WARNING)
+ ereport(ERROR,
+ (errmsg("%s failed: %s", "ucnv_fromUChars",
+ u_errorName(status))));
+
+ return len_result;
+}
+
+/*
+ * Parse collation attributes from the given locale string and apply them to
+ * the open collator.
+ *
+ * First, the locale string is canonicalized to an ICU format locale ID such
+ * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies
+ * the key-value arguments.
+ *
+ * Unknown attribute names are ignored; an unknown attribute value sets
+ * *status to U_ILLEGAL_ARGUMENT_ERROR and stops processing. On return,
+ * *status holds the outcome of the last ICU operation attempted, and all
+ * temporary strings allocated here have been freed.
+ *
+ * Starting with ICU version 54, the attributes are processed automatically by
+ * ucol_open(), so this is only necessary for emulating this behavior on older
+ * versions.
+ */
+pg_attribute_unused()
+static void
+icu_set_collation_attributes(UCollator *collator, const char *loc,
+							 UErrorCode *status)
+{
+	int32_t		len;
+	char	   *icu_locale_id;
+	char	   *lower_str;
+	char	   *str;
+	char	   *token;
+
+	/*
+	 * The input locale may be a BCP 47 language tag, e.g.
+	 * "und-u-kc-ks-level1", which expresses the same attributes in a
+	 * different form. It will be converted to the equivalent ICU format
+	 * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by
+	 * uloc_canonicalize().
+	 */
+	*status = U_ZERO_ERROR;
+	len = uloc_canonicalize(loc, NULL, 0, status);
+	icu_locale_id = palloc(len + 1);
+	*status = U_ZERO_ERROR;
+	len = uloc_canonicalize(loc, icu_locale_id, len + 1, status);
+	if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING)
+	{
+		/* don't leak the canonicalization buffer on failure */
+		pfree(icu_locale_id);
+		return;
+	}
+
+	lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id));
+
+	pfree(icu_locale_id);
+
+	str = strchr(lower_str, '@');
+	if (!str)
+	{
+		/* no attributes present; don't leak the lower-cased copy */
+		pfree(lower_str);
+		return;
+	}
+	str++;
+
+	/* strsep() advances 'str' only, so lower_str stays valid for pfree() */
+	while ((token = strsep(&str, ";")))
+	{
+		char	   *e = strchr(token, '=');
+
+		if (e)
+		{
+			char	   *name;
+			char	   *value;
+			UColAttribute uattr;
+			UColAttributeValue uvalue;
+
+			*status = U_ZERO_ERROR;
+
+			/* split "name=value" in place */
+			*e = '\0';
+			name = token;
+			value = e + 1;
+
+			/*
+			 * See attribute name and value lists in ICU i18n/coll.cpp
+			 */
+			if (strcmp(name, "colstrength") == 0)
+				uattr = UCOL_STRENGTH;
+			else if (strcmp(name, "colbackwards") == 0)
+				uattr = UCOL_FRENCH_COLLATION;
+			else if (strcmp(name, "colcaselevel") == 0)
+				uattr = UCOL_CASE_LEVEL;
+			else if (strcmp(name, "colcasefirst") == 0)
+				uattr = UCOL_CASE_FIRST;
+			else if (strcmp(name, "colalternate") == 0)
+				uattr = UCOL_ALTERNATE_HANDLING;
+			else if (strcmp(name, "colnormalization") == 0)
+				uattr = UCOL_NORMALIZATION_MODE;
+			else if (strcmp(name, "colnumeric") == 0)
+				uattr = UCOL_NUMERIC_COLLATION;
+			else
+				/* ignore if unknown */
+				continue;
+
+			if (strcmp(value, "primary") == 0)
+				uvalue = UCOL_PRIMARY;
+			else if (strcmp(value, "secondary") == 0)
+				uvalue = UCOL_SECONDARY;
+			else if (strcmp(value, "tertiary") == 0)
+				uvalue = UCOL_TERTIARY;
+			else if (strcmp(value, "quaternary") == 0)
+				uvalue = UCOL_QUATERNARY;
+			else if (strcmp(value, "identical") == 0)
+				uvalue = UCOL_IDENTICAL;
+			else if (strcmp(value, "no") == 0)
+				uvalue = UCOL_OFF;
+			else if (strcmp(value, "yes") == 0)
+				uvalue = UCOL_ON;
+			else if (strcmp(value, "shifted") == 0)
+				uvalue = UCOL_SHIFTED;
+			else if (strcmp(value, "non-ignorable") == 0)
+				uvalue = UCOL_NON_IGNORABLE;
+			else if (strcmp(value, "lower") == 0)
+				uvalue = UCOL_LOWER_FIRST;
+			else if (strcmp(value, "upper") == 0)
+				uvalue = UCOL_UPPER_FIRST;
+			else
+			{
+				*status = U_ILLEGAL_ARGUMENT_ERROR;
+				break;
+			}
+
+			ucol_setAttribute(collator, uattr, uvalue, status);
+		}
+	}
+
+	pfree(lower_str);
+}
+
+#endif /* USE_ICU */
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
new file mode 100644
index 00000000000..6eb8b80fdf9
--- /dev/null
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -0,0 +1,604 @@
+/*-----------------------------------------------------------------------
+ *
+ * PostgreSQL locale utilities for libc
+ *
+ * Portions Copyright (c) 2002-2024, PostgreSQL Global Development Group
+ *
+ * src/backend/utils/adt/pg_locale_libc.c
+ *
+ *-----------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <limits.h>
+#include <wctype.h>
+
+#include "access/htup_details.h"
+#include "catalog/pg_collation.h"
+#include "catalog/pg_database.h"
+#include "utils/builtins.h"
+#include "utils/memutils.h"
+#include "utils/pg_locale.h"
+#include "utils/resowner.h"
+#include "utils/syscache.h"
+
+/*
+ * This should be large enough that most strings will fit, but small enough
+ * that we feel comfortable putting it on the stack
+ */
+#define TEXTBUFLEN 1024
+
+extern pg_locale_t libc_dat_create_locale(HeapTuple dattuple);
+extern pg_locale_t libc_coll_create_locale(MemoryContext context,
+ ResourceOwner resowner,
+ HeapTuple colltuple);
+
+static int strncoll_libc(const char *arg1, ssize_t len1,
+ const char *arg2, ssize_t len2,
+ pg_locale_t locale);
+static size_t strnxfrm_libc(char *dest, size_t destsize,
+ const char *src, ssize_t srclen,
+ pg_locale_t locale);
+
+static int char_props_libc(pg_wchar wc, int mask, pg_locale_t locale);
+static pg_wchar toupper_libc(pg_wchar wc, pg_locale_t locale);
+static pg_wchar tolower_libc(pg_wchar wc, pg_locale_t locale);
+
+static void ResourceOwnerRememberLocaleT(ResourceOwner resowner,
+ locale_t locale);
+static void ResOwnerReleaseLocaleT(Datum val);
+
+static locale_t make_libc_collator(const char *collate, const char *ctype);
+
+static void report_newlocale_failure(const char *localename);
+
+/*
+ * ResourceOwner descriptor for locale_t handles.  Handles are registered
+ * under this kind by ResourceOwnerRememberLocaleT(), and the resource owner
+ * releases them through ResOwnerReleaseLocaleT().
+ */
+static const ResourceOwnerDesc LocaleTResourceKind =
+{
+ .name = "locale_t reference",
+ .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
+ .release_priority = RELEASE_PRIO_LAST,
+ .ReleaseResource = ResOwnerReleaseLocaleT,
+ .DebugPrint = NULL /* the default message is fine */
+};
+
+/*
+ * Collation method table for the libc provider.  It is installed into
+ * pg_locale_struct.collate by the create functions below whenever the
+ * collation is not "C"/"POSIX".  libc offers no prefix transform here, so
+ * strnxfrm_prefix is left NULL.
+ */
+struct collate_methods libc_collate_methods = {
+ .strncoll = strncoll_libc,
+ .strnxfrm = strnxfrm_libc,
+ .strnxfrm_prefix = NULL,
+};
+
+/*
+ * Character-classification and case-mapping method table for the libc
+ * provider.  It is installed into pg_locale_struct.ctype by the create
+ * functions below whenever the ctype is not "C"/"POSIX".
+ */
+struct ctype_methods libc_ctype_methods = {
+ .char_props = char_props_libc,
+ .wc_toupper = toupper_libc,
+ .wc_tolower = tolower_libc,
+};
+
+/*
+ * libc_dat_create_locale
+ *
+ * Build the pg_locale_t for a database that uses the libc provider, from its
+ * pg_database tuple.  The result is allocated in TopMemoryContext and is
+ * always deterministic.  Method tables are attached only for the categories
+ * that are not "C"/"POSIX".
+ */
+pg_locale_t
+libc_dat_create_locale(HeapTuple dattuple)
+{
+	Form_pg_database db = (Form_pg_database) GETSTRUCT(dattuple);
+	const char *collate_name;
+	const char *ctype_name;
+	pg_locale_t loc;
+
+	Assert(db->datlocprovider == COLLPROVIDER_LIBC);
+
+	/* fetch the locale names as C strings */
+	collate_name = TextDatumGetCString(SysCacheGetAttrNotNull(DATABASEOID,
+															  dattuple,
+															  Anum_pg_database_datcollate));
+	ctype_name = TextDatumGetCString(SysCacheGetAttrNotNull(DATABASEOID,
+															dattuple,
+															Anum_pg_database_datctype));
+
+	loc = MemoryContextAllocZero(TopMemoryContext,
+								 sizeof(struct pg_locale_struct));
+
+	loc->provider = db->datlocprovider;
+	loc->deterministic = true;
+
+	loc->collate_is_c = (strcmp(collate_name, "C") == 0 ||
+						 strcmp(collate_name, "POSIX") == 0);
+	loc->ctype_is_c = (strcmp(ctype_name, "C") == 0 ||
+					   strcmp(ctype_name, "POSIX") == 0);
+
+	/* the zeroed struct leaves the method pointers NULL for "C"/"POSIX" */
+	if (!loc->collate_is_c)
+		loc->collate = &libc_collate_methods;
+	if (!loc->ctype_is_c)
+		loc->ctype = &libc_ctype_methods;
+
+	loc->info.lt = make_libc_collator(collate_name, ctype_name);
+
+	return loc;
+}
+
+/*
+ * libc_coll_create_locale
+ *
+ * Build the pg_locale_t for a pg_collation tuple that uses the libc
+ * provider.  The struct is allocated in 'context'.  The underlying locale_t,
+ * if one is created, is registered with 'resowner', which frees it on
+ * release.
+ */
+pg_locale_t
+libc_coll_create_locale(MemoryContext context, ResourceOwner resowner,
+						HeapTuple colltuple)
+{
+	Form_pg_collation coll = (Form_pg_collation) GETSTRUCT(colltuple);
+	const char *collate_name;
+	const char *ctype_name;
+	locale_t	lt;
+	pg_locale_t loc;
+
+	collate_name = TextDatumGetCString(SysCacheGetAttrNotNull(COLLOID,
+															  colltuple,
+															  Anum_pg_collation_collcollate));
+	ctype_name = TextDatumGetCString(SysCacheGetAttrNotNull(COLLOID,
+															colltuple,
+															Anum_pg_collation_collctype));
+
+	/*
+	 * Enlarge the resource owner before creating the handle, then remember
+	 * the handle right away.  NOTE(review): presumably this ordering ensures
+	 * the handle cannot be lost if a later step errors out -- confirm against
+	 * the ResourceOwner API contract.
+	 */
+	ResourceOwnerEnlarge(resowner);
+	lt = make_libc_collator(collate_name, ctype_name);
+	if (lt)
+		ResourceOwnerRememberLocaleT(resowner, lt);
+
+	loc = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct));
+
+	loc->provider = coll->collprovider;
+	loc->deterministic = coll->collisdeterministic;
+
+	loc->collate_is_c = (strcmp(collate_name, "C") == 0 ||
+						 strcmp(collate_name, "POSIX") == 0);
+	loc->ctype_is_c = (strcmp(ctype_name, "C") == 0 ||
+					   strcmp(ctype_name, "POSIX") == 0);
+
+	if (!loc->collate_is_c)
+		loc->collate = &libc_collate_methods;
+	if (!loc->ctype_is_c)
+		loc->ctype = &libc_ctype_methods;
+
+	loc->info.lt = lt;
+
+	return loc;
+}
+
+/*
+ * Register 'locale' with 'resowner' under LocaleTResourceKind.  The caller in
+ * this file invokes ResourceOwnerEnlarge() on the owner beforehand.
+ */
+static void
+ResourceOwnerRememberLocaleT(ResourceOwner resowner, locale_t locale)
+{
+ ResourceOwnerRemember(resowner, PointerGetDatum(locale),
+ &LocaleTResourceKind);
+}
+
+/*
+ * ReleaseResource callback for LocaleTResourceKind: free the locale_t stored
+ * in 'val', using the platform's release function.
+ */
+static void
+ResOwnerReleaseLocaleT(Datum val)
+{
+ locale_t locale = (locale_t) DatumGetPointer(val);
+#ifndef WIN32
+ freelocale(locale);
+#else
+ _free_locale(locale);
+#endif
+}
+
+/*
+ * char_props_libc
+ *
+ * Return the subset of the PG_IS* bits requested in 'mask' that hold for
+ * 'wc' under the libc locale.
+ *
+ * When the database encoding is UTF-8 and the code point fits in wchar_t,
+ * the wide-character classifiers are used.  Otherwise only code points that
+ * fit in an unsigned char are classified, with the single-byte classifiers;
+ * anything larger has no properties.
+ */
+static int
+char_props_libc(pg_wchar wc, int mask, pg_locale_t locale)
+{
+	locale_t	lt;
+	bool		use_wide;
+	bool		fits_byte;
+	int			props = 0;
+
+	Assert(!locale->ctype_is_c);
+
+	lt = locale->info.lt;
+
+	/* the same routing decision applies to every property, so make it once */
+	use_wide = (GetDatabaseEncoding() == PG_UTF8 &&
+				(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF));
+	fits_byte = (wc <= (pg_wchar) UCHAR_MAX);
+
+	if (mask & PG_ISDIGIT)
+	{
+		if (use_wide ? iswdigit_l((wint_t) wc, lt)
+			: (fits_byte && isdigit_l((unsigned char) wc, lt)))
+			props |= PG_ISDIGIT;
+	}
+
+	if (mask & PG_ISALPHA)
+	{
+		if (use_wide ? iswalpha_l((wint_t) wc, lt)
+			: (fits_byte && isalpha_l((unsigned char) wc, lt)))
+			props |= PG_ISALPHA;
+	}
+
+	if (mask & PG_ISUPPER)
+	{
+		if (use_wide ? iswupper_l((wint_t) wc, lt)
+			: (fits_byte && isupper_l((unsigned char) wc, lt)))
+			props |= PG_ISUPPER;
+	}
+
+	if (mask & PG_ISLOWER)
+	{
+		if (use_wide ? iswlower_l((wint_t) wc, lt)
+			: (fits_byte && islower_l((unsigned char) wc, lt)))
+			props |= PG_ISLOWER;
+	}
+
+	if (mask & PG_ISGRAPH)
+	{
+		if (use_wide ? iswgraph_l((wint_t) wc, lt)
+			: (fits_byte && isgraph_l((unsigned char) wc, lt)))
+			props |= PG_ISGRAPH;
+	}
+
+	if (mask & PG_ISPRINT)
+	{
+		if (use_wide ? iswprint_l((wint_t) wc, lt)
+			: (fits_byte && isprint_l((unsigned char) wc, lt)))
+			props |= PG_ISPRINT;
+	}
+
+	if (mask & PG_ISPUNCT)
+	{
+		if (use_wide ? iswpunct_l((wint_t) wc, lt)
+			: (fits_byte && ispunct_l((unsigned char) wc, lt)))
+			props |= PG_ISPUNCT;
+	}
+
+	if (mask & PG_ISSPACE)
+	{
+		if (use_wide ? iswspace_l((wint_t) wc, lt)
+			: (fits_byte && isspace_l((unsigned char) wc, lt)))
+			props |= PG_ISSPACE;
+	}
+
+	return props;
+}
+
+/*
+ * toupper_libc
+ *
+ * Upper-case 'wc' using the libc locale.  The wide-character mapping is used
+ * when the database encoding is UTF-8 and the code point fits in wchar_t;
+ * otherwise the single-byte mapping is used for values that fit in an
+ * unsigned char, and larger values are returned unchanged.
+ */
+static pg_wchar
+toupper_libc(pg_wchar wc, pg_locale_t locale)
+{
+	bool		use_wide;
+
+	use_wide = (GetDatabaseEncoding() == PG_UTF8 &&
+				(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF));
+
+	if (use_wide)
+		return towupper_l((wint_t) wc, locale->info.lt);
+
+	/* no single-byte mapping exists for values beyond one byte */
+	if (wc > (pg_wchar) UCHAR_MAX)
+		return wc;
+
+	return toupper_l((unsigned char) wc, locale->info.lt);
+}
+
+/*
+ * tolower_libc
+ *
+ * Lower-case 'wc' using the libc locale.  The wide-character mapping is used
+ * when the database encoding is UTF-8 and the code point fits in wchar_t;
+ * otherwise the single-byte mapping is used for values that fit in an
+ * unsigned char, and larger values are returned unchanged.
+ */
+static pg_wchar
+tolower_libc(pg_wchar wc, pg_locale_t locale)
+{
+	bool		use_wide;
+
+	use_wide = (GetDatabaseEncoding() == PG_UTF8 &&
+				(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF));
+
+	if (use_wide)
+		return towlower_l((wint_t) wc, locale->info.lt);
+
+	/* no single-byte mapping exists for values beyond one byte */
+	if (wc > (pg_wchar) UCHAR_MAX)
+		return wc;
+
+	return tolower_l((unsigned char) wc, locale->info.lt);
+}
+
+/*
+ * Create a locale_t with the given collation and ctype.
+ *
+ * The "C" and "POSIX" locales are not actually handled by libc, so return
+ * NULL for them.  If only one of the two categories is "C"/"POSIX", the
+ * returned locale_t covers just the other category.
+ *
+ * Failure to create a locale is reported with ereport(ERROR) via
+ * report_newlocale_failure(), which relies on errno.
+ *
+ * Ensure that no path leaks a locale_t.
+ */
+static locale_t
+make_libc_collator(const char *collate, const char *ctype)
+{
+	locale_t	loc = 0;
+
+	if (strcmp(collate, ctype) == 0)
+	{
+		if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
+		{
+			/* Normal case where they're the same */
+			errno = 0;
+#ifndef WIN32
+			loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate,
+							NULL);
+#else
+			loc = _create_locale(LC_ALL, collate);
+#endif
+			if (!loc)
+				report_newlocale_failure(collate);
+		}
+	}
+	else
+	{
+#ifndef WIN32
+		/* We need two newlocale() steps */
+		locale_t	loc1 = 0;
+
+		if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0)
+		{
+			errno = 0;
+			loc1 = newlocale(LC_COLLATE_MASK, collate, NULL);
+			if (!loc1)
+				report_newlocale_failure(collate);
+		}
+
+		if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0)
+		{
+			errno = 0;
+			loc = newlocale(LC_CTYPE_MASK, ctype, loc1);
+			if (!loc)
+			{
+				/*
+				 * Free the first-step locale so it isn't leaked, but don't
+				 * let freelocale() disturb the errno that the failure report
+				 * is about to inspect.
+				 */
+				int			save_errno = errno;
+
+				if (loc1)
+					freelocale(loc1);
+				errno = save_errno;
+				report_newlocale_failure(ctype);
+			}
+		}
+		else
+			loc = loc1;
+#else
+
+		/*
+		 * XXX The _create_locale() API doesn't appear to support this. Could
+		 * perhaps be worked around by changing pg_locale_t to contain two
+		 * separate fields.
+		 */
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("collations with different collate and ctype values are not supported on this platform")));
+#endif
+	}
+
+	return loc;
+}
+
+/*
+ * strncoll_libc_win32_utf8
+ *
+ * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
+ * invoke wcscoll_l().
+ *
+ * An input string length of -1 means that it's NUL-terminated.
+ *
+ * Both converted strings share one scratch buffer: the stack buffer when it
+ * is large enough, otherwise a palloc'd one that is freed before returning.
+ * Conversion or comparison failures are reported with ereport(ERROR).
+ */
+#ifdef WIN32
+static int
+strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
+						 ssize_t len2, pg_locale_t locale)
+{
+	char		sbuf[TEXTBUFLEN];
+	char	   *buf = sbuf;
+	char	   *a1p,
+			   *a2p;
+	int			a1len;
+	int			a2len;
+	int			r;
+	int			result;
+
+	Assert(locale->provider == COLLPROVIDER_LIBC);
+	Assert(GetDatabaseEncoding() == PG_UTF8);
+
+	if (len1 == -1)
+		len1 = strlen(arg1);
+	if (len2 == -1)
+		len2 = strlen(arg2);
+
+	/* room for len + 1 two-byte units per argument, in bytes */
+	a1len = len1 * 2 + 2;
+	a2len = len2 * 2 + 2;
+
+	if (a1len + a2len > TEXTBUFLEN)
+		buf = palloc(a1len + a2len);
+
+	a1p = buf;
+	a2p = buf + a1len;
+
+	/* API does not work for zero-length input */
+	if (len1 == 0)
+		r = 0;
+	else
+	{
+		r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
+								(LPWSTR) a1p, a1len / 2);
+		if (!r)
+			ereport(ERROR,
+					(errmsg("could not convert string to UTF-16: error code %lu",
+							GetLastError())));
+	}
+	/* r is the number of wide characters written; terminate after them */
+	((LPWSTR) a1p)[r] = 0;
+
+	if (len2 == 0)
+		r = 0;
+	else
+	{
+		r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
+								(LPWSTR) a2p, a2len / 2);
+		if (!r)
+			ereport(ERROR,
+					(errmsg("could not convert string to UTF-16: error code %lu",
+							GetLastError())));
+	}
+	((LPWSTR) a2p)[r] = 0;
+
+	errno = 0;
+	result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt);
+	if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw headers */
+		ereport(ERROR,
+				(errmsg("could not compare Unicode strings: %m")));
+
+	if (buf != sbuf)
+		pfree(buf);
+
+	return result;
+}
+#endif							/* WIN32 */
+
+/*
+ * strncoll_libc
+ *
+ * Compare two strings with strcoll_l() under the libc locale.  On Windows
+ * with a UTF-8 database the work is handed to strncoll_libc_win32_utf8().
+ *
+ * An input string length of -1 means that it's NUL-terminated; such an
+ * argument is passed through as is.  An argument with an explicit length is
+ * first copied into a scratch buffer and NUL-terminated there.
+ */
+static int
+strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
+			  pg_locale_t locale)
+{
+	char		stackbuf[TEXTBUFLEN];
+	char	   *scratch = stackbuf;
+	size_t		need1 = (len1 == -1) ? 0 : len1 + 1;
+	size_t		need2 = (len2 == -1) ? 0 : len2 + 1;
+	const char *lhs = arg1;
+	const char *rhs = arg2;
+	int			cmp;
+
+	Assert(locale->provider == COLLPROVIDER_LIBC);
+
+#ifdef WIN32
+	/* check for this case before doing the work for nul-termination */
+	if (GetDatabaseEncoding() == PG_UTF8)
+		return strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
+#endif							/* WIN32 */
+
+	/* fall back to the heap only when both copies don't fit on the stack */
+	if (need1 + need2 > TEXTBUFLEN)
+		scratch = palloc(need1 + need2);
+
+	/* first copy, if needed, sits at the start of the scratch space */
+	if (len1 != -1)
+	{
+		memcpy(scratch, arg1, len1);
+		scratch[len1] = '\0';
+		lhs = scratch;
+	}
+
+	/* second copy, if needed, follows the space reserved for the first */
+	if (len2 != -1)
+	{
+		char	   *copy2 = scratch + need1;
+
+		memcpy(copy2, arg2, len2);
+		copy2[len2] = '\0';
+		rhs = copy2;
+	}
+
+	cmp = strcoll_l(lhs, rhs, locale->info.lt);
+
+	if (scratch != stackbuf)
+		pfree(scratch);
+
+	return cmp;
+}
+
+/*
+ * strnxfrm_libc
+ *
+ * Transform 'src' with strxfrm_l() under the libc locale, storing at most
+ * 'destsize' bytes in 'dest', and return strxfrm_l()'s result.
+ *
+ * A 'srclen' of -1 means that 'src' is NUL-terminated and is used directly;
+ * otherwise a NUL-terminated copy of the first 'srclen' bytes is made first.
+ */
+static size_t
+strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
+			  pg_locale_t locale)
+{
+	char		sbuf[TEXTBUFLEN];
+	char	   *buf = sbuf;
+	const char *srcn = src;
+	size_t		result;
+
+	Assert(locale->provider == COLLPROVIDER_LIBC);
+
+	/* nul-terminate argument if necessary */
+	if (srclen != -1)
+	{
+		size_t		bufsize = (size_t) srclen + 1;
+
+		if (bufsize > TEXTBUFLEN)
+			buf = palloc(bufsize);
+
+		memcpy(buf, src, srclen);
+		buf[srclen] = '\0';
+		srcn = buf;
+	}
+
+	result = strxfrm_l(dest, srcn, destsize, locale->info.lt);
+
+	if (buf != sbuf)
+		pfree(buf);
+
+	/* if dest is defined, it should be nul-terminated */
+	Assert(result >= destsize || dest[result] == '\0');
+
+	return result;
+}
+
+/*
+ * Raise an ERROR for a failed newlocale()/_create_locale() call on
+ * 'localename'.  The caller is expected to have set errno to 0 before the
+ * failed call, so that a platform which leaves errno untouched can be
+ * recognized here.
+ */
+static void
+report_newlocale_failure(const char *localename)
+{
+	int			save_errno;
+	bool		no_such_locale;
+
+	/*
+	 * Windows gives no useful error indication from _create_locale(), and
+	 * BSD-derived platforms don't reliably set errno either (even though
+	 * POSIX says newlocale should).  Treat an unset errno as ENOENT.
+	 */
+	save_errno = (errno != 0) ? errno : ENOENT;
+
+	/* the %m in the message below is driven by errno */
+	errno = save_errno;
+
+	/*
+	 * ENOENT here means "no such locale", not "no such file"; an errdetail
+	 * spells that out.  Decide up front, since auxiliary functions might
+	 * change errno.
+	 */
+	no_such_locale = (save_errno == ENOENT);
+
+	ereport(ERROR,
+			(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+			 errmsg("could not create locale \"%s\": %m",
+					localename),
+			 (no_such_locale ?
+			  errdetail("The operating system could not find any locale data for the locale name \"%s\".",
+						localename) : 0)));
+}
+
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 3b443df8014..95ba7940b95 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -12,6 +12,7 @@
#ifndef _PG_LOCALE_
#define _PG_LOCALE_
+#include "mb/pg_wchar.h"
#if defined(LOCALE_T_IN_XLOCALE) || defined(WCSTOMBS_L_IN_XLOCALE)
#include <xlocale.h>
#endif
@@ -19,6 +20,19 @@
#include <unicode/ucol.h>
#endif
+/*
+ * Character properties for regular expressions.
+ */
+#define PG_ISDIGIT 0x01
+#define PG_ISALPHA 0x02
+#define PG_ISALNUM (PG_ISDIGIT | PG_ISALPHA)
+#define PG_ISUPPER 0x04
+#define PG_ISLOWER 0x08
+#define PG_ISGRAPH 0x10
+#define PG_ISPRINT 0x20
+#define PG_ISPUNCT 0x40
+#define PG_ISSPACE 0x80
+
#ifdef USE_ICU
/*
* ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
@@ -62,6 +76,28 @@ extern struct lconv *PGLC_localeconv(void);
extern void cache_locale_time(void);
+struct pg_locale_struct;
+typedef struct pg_locale_struct *pg_locale_t;
+
+/*
+ * Per-provider collation methods.  In each method, a source length of -1
+ * means that the corresponding input string is NUL-terminated.  A provider
+ * without a prefix transform leaves strnxfrm_prefix NULL (as the libc
+ * provider does).
+ */
+struct collate_methods
+{
+ int (*strncoll)(const char *arg1, ssize_t len1,
+ const char *arg2, ssize_t len2,
+ pg_locale_t locale);
+ size_t (*strnxfrm)(char *dest, size_t destsize,
+ const char *src, ssize_t srclen,
+ pg_locale_t locale);
+ size_t (*strnxfrm_prefix)(char *dest, size_t destsize,
+ const char *src, ssize_t srclen,
+ pg_locale_t locale);
+};
+
+/*
+ * Per-provider character classification and case mapping methods.
+ * char_props takes a mask of PG_IS* bits; the libc implementation returns
+ * the subset of those bits that hold for 'wc'.
+ */
+struct ctype_methods
+{
+ int (*char_props)(pg_wchar wc, int mask, pg_locale_t locale);
+ pg_wchar (*wc_toupper)(pg_wchar wc, pg_locale_t locale);
+ pg_wchar (*wc_tolower)(pg_wchar wc, pg_locale_t locale);
+};
/*
* We use a discriminated union to hold either a locale_t or an ICU collator.
@@ -85,6 +121,10 @@ struct pg_locale_struct
bool deterministic;
bool collate_is_c;
bool ctype_is_c;
+
+ struct collate_methods *collate;
+ struct ctype_methods *ctype;
+
union
{
struct
@@ -102,8 +142,6 @@ struct pg_locale_struct
} info;
};
-typedef struct pg_locale_struct *pg_locale_t;
-
extern void init_database_collation(void);
extern pg_locale_t pg_newlocale_from_collation(Oid collid);
@@ -132,6 +170,8 @@ extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes)
extern int32_t icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar);
#endif
+extern int char_props(pg_wchar wc, int mask, pg_locale_t locale);
+
/* These functions convert from/to libc's wchar_t, *not* pg_wchar_t */
extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen,
pg_locale_t locale);
--
2.34.1
v4-0006-Allow-length-1-for-NUL-terminated-input-to-pg_str.patchtext/x-patch; charset=UTF-8; name=v4-0006-Allow-length-1-for-NUL-terminated-input-to-pg_str.patchDownload
From 56e4fbb3ccd2927e4cc92b4201632361e2b16abb Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Wed, 21 Aug 2024 10:59:28 -0700
Subject: [PATCH v4 6/7] Allow length=-1 for NUL-terminated input to
pg_strncoll(), etc.
Like ICU, allow a length of -1 to be specified for NUL-terminated
arguments to pg_strncoll(), pg_strnxfrm(), and pg_strnxfrm_prefix().
Simplifies the code and comments.
---
src/backend/utils/adt/pg_locale.c | 256 ++++++++++++------------------
src/include/utils/pg_locale.h | 8 +-
2 files changed, 104 insertions(+), 160 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index c89ac3b9e01..cfba55a6e31 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1886,22 +1886,24 @@ get_collation_actual_version(char collprovider, const char *collcollate)
}
/*
- * pg_strncoll_libc_win32_utf8
+ * strncoll_libc_win32_utf8
*
* Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and
* invoke wcscoll_l().
+ *
+ * An input string length of -1 means that it's NUL-terminated.
*/
#ifdef WIN32
static int
-pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
- size_t len2, pg_locale_t locale)
+strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2,
+ ssize_t len2, pg_locale_t locale)
{
char sbuf[TEXTBUFLEN];
char *buf = sbuf;
char *a1p,
*a2p;
- int a1len = len1 * 2 + 2;
- int a2len = len2 * 2 + 2;
+ int a1len;
+ int a2len;
int r;
int result;
@@ -1911,6 +1913,14 @@ pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
Assert(false);
#endif
+ if (len1 == -1)
+ len1 = strlen(arg1);
+ if (len2 == -1)
+ len2 = strlen(arg2);
+
+ a1len = len1 * 2 + 2;
+ a2len = len2 * 2 + 2;
+
if (a1len + a2len > TEXTBUFLEN)
buf = palloc(a1len + a2len);
@@ -1958,50 +1968,20 @@ pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2,
#endif /* WIN32 */
/*
- * pg_strcoll_libc
+ * strncoll_libc
*
- * Call strcoll_l() or wcscoll_l() as appropriate for the given locale,
- * platform, and database encoding. If the locale is NULL, use the database
- * collation.
- *
- * Arguments must be encoded in the database encoding and nul-terminated.
+ * An input string length of -1 means that it's NUL-terminated.
*/
static int
-pg_strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale)
-{
- int result;
-
- Assert(locale->provider == COLLPROVIDER_LIBC);
-#ifdef WIN32
- if (GetDatabaseEncoding() == PG_UTF8)
- {
- size_t len1 = strlen(arg1);
- size_t len2 = strlen(arg2);
-
- result = pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
- }
- else
-#endif /* WIN32 */
- result = strcoll_l(arg1, arg2, locale->info.lt);
-
- return result;
-}
-
-/*
- * pg_strncoll_libc
- *
- * Nul-terminate the arguments and call pg_strcoll_libc().
- */
-static int
-pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
- pg_locale_t locale)
+strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
+ pg_locale_t locale)
{
char sbuf[TEXTBUFLEN];
char *buf = sbuf;
- size_t bufsize1 = len1 + 1;
- size_t bufsize2 = len2 + 1;
- char *arg1n;
- char *arg2n;
+ size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1;
+ size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1;
+ const char *arg1n;
+ const char *arg2n;
int result;
Assert(locale->provider == COLLPROVIDER_LIBC);
@@ -2009,22 +1989,38 @@ pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
#ifdef WIN32
/* check for this case before doing the work for nul-termination */
if (GetDatabaseEncoding() == PG_UTF8)
- return pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
+ return strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale);
#endif /* WIN32 */
if (bufsize1 + bufsize2 > TEXTBUFLEN)
buf = palloc(bufsize1 + bufsize2);
- arg1n = buf;
- arg2n = buf + bufsize1;
+ /* nul-terminate arguments if necessary */
+ if (len1 == -1)
+ {
+ arg1n = arg1;
+ }
+ else
+ {
+ char *buf1 = buf;
+ memcpy(buf1, arg1, len1);
+ buf1[len1] = '\0';
+ arg1n = buf1;
+ }
- /* nul-terminate arguments */
- memcpy(arg1n, arg1, len1);
- arg1n[len1] = '\0';
- memcpy(arg2n, arg2, len2);
- arg2n[len2] = '\0';
+ if (len2 == -1)
+ {
+ arg2n = arg2;
+ }
+ else
+ {
+ char *buf2 = buf + bufsize1;
+ memcpy(buf2, arg2, len2);
+ buf2[len2] = '\0';
+ arg2n = buf2;
+ }
- result = pg_strcoll_libc(arg1n, arg2n, locale);
+ result = strcoll_l(arg1n, arg2n, locale->info.lt);
if (buf != sbuf)
pfree(buf);
@@ -2035,7 +2031,7 @@ pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
#ifdef USE_ICU
/*
- * pg_strncoll_icu_no_utf8
+ * strncoll_icu_no_utf8
*
* Convert the arguments from the database encoding to UChar strings, then
* call ucol_strcoll(). An argument length of -1 means that the string is
@@ -2045,8 +2041,8 @@ pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2,
* caller should call that instead.
*/
static int
-pg_strncoll_icu_no_utf8(const char *arg1, int32_t len1,
- const char *arg2, int32_t len2, pg_locale_t locale)
+strncoll_icu_no_utf8(const char *arg1, ssize_t len1,
+ const char *arg2, ssize_t len2, pg_locale_t locale)
{
char sbuf[TEXTBUFLEN];
char *buf = sbuf;
@@ -2091,17 +2087,15 @@ pg_strncoll_icu_no_utf8(const char *arg1, int32_t len1,
}
/*
- * pg_strncoll_icu
+ * strncoll_icu
*
* Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given
* database encoding. An argument length of -1 means the string is
* NUL-terminated.
- *
- * Arguments must be encoded in the database encoding.
*/
static int
-pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2,
- pg_locale_t locale)
+strncoll_icu(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
+ pg_locale_t locale)
{
int result;
@@ -2124,7 +2118,7 @@ pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2,
else
#endif
{
- result = pg_strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale);
+ result = strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale);
}
return result;
@@ -2135,15 +2129,7 @@ pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2,
/*
* pg_strcoll
*
- * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll_l() or wcscoll_l() as
- * appropriate for the given locale, platform, and database encoding. If the
- * locale is not specified, use the database collation.
- *
- * Arguments must be encoded in the database encoding and nul-terminated.
- *
- * The caller is responsible for breaking ties if the collation is
- * deterministic; this maintains consistency with pg_strxfrm(), which cannot
- * easily account for deterministic collations.
+ * Like pg_strncoll for NUL-terminated input strings.
*/
int
pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
@@ -2151,10 +2137,10 @@ pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
int result;
if (locale->provider == COLLPROVIDER_LIBC)
- result = pg_strcoll_libc(arg1, arg2, locale);
+ result = strncoll_libc(arg1, -1, arg2, -1, locale);
#ifdef USE_ICU
else if (locale->provider == COLLPROVIDER_ICU)
- result = pg_strncoll_icu(arg1, -1, arg2, -1, locale);
+ result = strncoll_icu(arg1, -1, arg2, -1, locale);
#endif
else
/* shouldn't happen */
@@ -2170,27 +2156,24 @@ pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale)
* appropriate for the given locale, platform, and database encoding. If the
* locale is not specified, use the database collation.
*
- * Arguments must be encoded in the database encoding.
- *
- * This function may need to nul-terminate the arguments for libc functions;
- * so if the caller already has nul-terminated strings, it should call
- * pg_strcoll() instead.
+ * The input strings must be encoded in the database encoding. If an input
+ * string is NUL-terminated, its length may be specified as -1.
*
* The caller is responsible for breaking ties if the collation is
* deterministic; this maintains consistency with pg_strnxfrm(), which cannot
* easily account for deterministic collations.
*/
int
-pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2,
+pg_strncoll(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2,
pg_locale_t locale)
{
int result;
if (locale->provider == COLLPROVIDER_LIBC)
- result = pg_strncoll_libc(arg1, len1, arg2, len2, locale);
+ result = strncoll_libc(arg1, len1, arg2, len2, locale);
#ifdef USE_ICU
else if (locale->provider == COLLPROVIDER_ICU)
- result = pg_strncoll_icu(arg1, len1, arg2, len2, locale);
+ result = strncoll_icu(arg1, len1, arg2, len2, locale);
#endif
else
/* shouldn't happen */
@@ -2201,16 +2184,8 @@ pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2,
static size_t
-pg_strxfrm_libc(char *dest, const char *src, size_t destsize,
- pg_locale_t locale)
-{
- Assert(locale->provider == COLLPROVIDER_LIBC);
- return strxfrm_l(dest, src, destsize, locale->info.lt);
-}
-
-static size_t
-pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize,
- pg_locale_t locale)
+strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
+ pg_locale_t locale)
{
char sbuf[TEXTBUFLEN];
char *buf = sbuf;
@@ -2219,14 +2194,17 @@ pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize,
Assert(locale->provider == COLLPROVIDER_LIBC);
+ if (srclen == -1)
+ return strxfrm_l(dest, src, destsize, locale->info.lt);
+
if (bufsize > TEXTBUFLEN)
buf = palloc(bufsize);
- /* nul-terminate arguments */
+ /* nul-terminate argument */
memcpy(buf, src, srclen);
buf[srclen] = '\0';
- result = pg_strxfrm_libc(dest, buf, destsize, locale);
+ result = strxfrm_l(dest, buf, destsize, locale->info.lt);
if (buf != sbuf)
pfree(buf);
@@ -2241,8 +2219,8 @@ pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize,
/* 'srclen' of -1 means the strings are NUL-terminated */
static size_t
-pg_strnxfrm_icu(char *dest, const char *src, int32_t srclen, int32_t destsize,
- pg_locale_t locale)
+strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen,
+ pg_locale_t locale)
{
char sbuf[TEXTBUFLEN];
char *buf = sbuf;
@@ -2288,8 +2266,9 @@ pg_strnxfrm_icu(char *dest, const char *src, int32_t srclen, int32_t destsize,
/* 'srclen' of -1 means the strings are NUL-terminated */
static size_t
-pg_strnxfrm_prefix_icu_no_utf8(char *dest, const char *src, int32_t srclen,
- int32_t destsize, pg_locale_t locale)
+strnxfrm_prefix_icu_no_utf8(char *dest,size_t destsize,
+ const char *src, ssize_t srclen,
+ pg_locale_t locale)
{
char sbuf[TEXTBUFLEN];
char *buf = sbuf;
@@ -2336,8 +2315,9 @@ pg_strnxfrm_prefix_icu_no_utf8(char *dest, const char *src, int32_t srclen,
/* 'srclen' of -1 means the strings are NUL-terminated */
static size_t
-pg_strnxfrm_prefix_icu(char *dest, const char *src, int32_t srclen,
- int32_t destsize, pg_locale_t locale)
+strnxfrm_prefix_icu(char *dest, size_t destsize,
+ const char *src, ssize_t srclen,
+ pg_locale_t locale)
{
size_t result;
@@ -2364,8 +2344,8 @@ pg_strnxfrm_prefix_icu(char *dest, const char *src, int32_t srclen,
u_errorName(status))));
}
else
- result = pg_strnxfrm_prefix_icu_no_utf8(dest, src, srclen, destsize,
- locale);
+ result = strnxfrm_prefix_icu_no_utf8(dest, destsize, src, srclen,
+ locale);
return result;
}
@@ -2407,20 +2387,7 @@ pg_strxfrm_enabled(pg_locale_t locale)
/*
* pg_strxfrm
*
- * Transforms 'src' to a nul-terminated string stored in 'dest' such that
- * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
- * untransformed strings.
- *
- * The provided 'src' must be nul-terminated. If 'destsize' is zero, 'dest'
- * may be NULL.
- *
- * Not all providers support pg_strxfrm() safely. The caller should check
- * pg_strxfrm_enabled() first, otherwise this function may return wrong
- * results or an error.
- *
- * Returns the number of bytes needed (or more) to store the transformed
- * string, excluding the terminating nul byte. If the value returned is
- * 'destsize' or greater, the resulting contents of 'dest' are undefined.
+ * Like pg_strnxfrm for a NUL-terminated input string.
*/
size_t
pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
@@ -2428,10 +2395,10 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
size_t result = 0; /* keep compiler quiet */
if (locale->provider == COLLPROVIDER_LIBC)
- result = pg_strxfrm_libc(dest, src, destsize, locale);
+ result = strnxfrm_libc(dest, destsize, src, -1, locale);
#ifdef USE_ICU
else if (locale->provider == COLLPROVIDER_ICU)
- result = pg_strnxfrm_icu(dest, src, -1, destsize, locale);
+ result = strnxfrm_icu(dest, destsize, src, -1, locale);
#endif
else
/* shouldn't happen */
@@ -2447,8 +2414,9 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
* ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on
* untransformed strings.
*
- * 'src' does not need to be nul-terminated. If 'destsize' is zero, 'dest' may
- * be NULL.
+ * The input string must be encoded in the database encoding. If the input
+ * string is NUL-terminated, its length may be specified as -1. If 'destsize'
+ * is zero, 'dest' may be NULL.
*
* Not all providers support pg_strnxfrm() safely. The caller should check
* pg_strxfrm_enabled() first, otherwise this function may return wrong
@@ -2457,22 +2425,18 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale)
* Returns the number of bytes needed (or more) to store the transformed
* string, excluding the terminating nul byte. If the value returned is
* 'destsize' or greater, the resulting contents of 'dest' are undefined.
- *
- * This function may need to nul-terminate the argument for libc functions;
- * so if the caller already has a nul-terminated string, it should call
- * pg_strxfrm() instead.
*/
size_t
-pg_strnxfrm(char *dest, size_t destsize, const char *src, size_t srclen,
+pg_strnxfrm(char *dest, size_t destsize, const char *src, ssize_t srclen,
pg_locale_t locale)
{
size_t result = 0; /* keep compiler quiet */
if (locale->provider == COLLPROVIDER_LIBC)
- result = pg_strnxfrm_libc(dest, src, srclen, destsize, locale);
+ result = strnxfrm_libc(dest, destsize, src, srclen, locale);
#ifdef USE_ICU
else if (locale->provider == COLLPROVIDER_ICU)
- result = pg_strnxfrm_icu(dest, src, srclen, destsize, locale);
+ result = strnxfrm_icu(dest, destsize, src, srclen, locale);
#endif
else
/* shouldn't happen */
@@ -2502,44 +2466,24 @@ pg_strxfrm_prefix_enabled(pg_locale_t locale)
/*
* pg_strxfrm_prefix
*
- * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
- * memcmp() on the byte sequence is equivalent to pg_strcoll() on
- * untransformed strings. The result is not nul-terminated.
- *
- * The provided 'src' must be nul-terminated.
- *
- * Not all providers support pg_strxfrm_prefix() safely. The caller should
- * check pg_strxfrm_prefix_enabled() first, otherwise this function may return
- * wrong results or an error.
- *
- * If destsize is not large enough to hold the resulting byte sequence, stores
- * only the first destsize bytes in 'dest'. Returns the number of bytes
- * actually copied to 'dest'.
+ * Like pg_strnxfrm_prefix for a NUL-terminated input string.
*/
size_t
pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
pg_locale_t locale)
{
- size_t result = 0; /* keep compiler quiet */
-
-#ifdef USE_ICU
- if (locale->provider == COLLPROVIDER_ICU)
- result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale);
- else
-#endif
- PGLOCALE_SUPPORT_ERROR(locale->provider);
-
- return result;
+ return pg_strnxfrm_prefix(dest, destsize, src, -1, locale);
}
/*
* pg_strnxfrm_prefix
*
* Transforms 'src' to a byte sequence stored in 'dest' such that ordinary
- * memcmp() on the byte sequence is equivalent to pg_strcoll() on
+ * memcmp() on the byte sequence is equivalent to pg_strncoll() on
* untransformed strings. The result is not nul-terminated.
*
- * The provided 'src' must be nul-terminated.
+ * The input string must be encoded in the database encoding. If the input
+ * string is NUL-terminated, its length may be specified as -1.
*
* Not all providers support pg_strnxfrm_prefix() safely. The caller should
* check pg_strxfrm_prefix_enabled() first, otherwise this function may return
@@ -2548,20 +2492,16 @@ pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
* If destsize is not large enough to hold the resulting byte sequence, stores
* only the first destsize bytes in 'dest'. Returns the number of bytes
* actually copied to 'dest'.
- *
- * This function may need to nul-terminate the argument for libc functions;
- * so if the caller already has a nul-terminated string, it should call
- * pg_strxfrm_prefix() instead.
*/
size_t
pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
- size_t srclen, pg_locale_t locale)
+ ssize_t srclen, pg_locale_t locale)
{
size_t result = 0; /* keep compiler quiet */
#ifdef USE_ICU
if (locale->provider == COLLPROVIDER_ICU)
- result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale);
+ result = strnxfrm_prefix_icu(dest, destsize, src, srclen, locale);
else
#endif
PGLOCALE_SUPPORT_ERROR(locale->provider);
@@ -2744,6 +2684,8 @@ init_icu_converter(void)
/*
* Find length, in UChars, of given string if converted to UChar string.
+ *
+ * A length of -1 indicates that the input string is NUL-terminated.
*/
static size_t
uchar_length(UConverter *converter, const char *str, int32_t len)
@@ -2761,6 +2703,8 @@ uchar_length(UConverter *converter, const char *str, int32_t len)
/*
* Convert the given source string into a UChar string, stored in dest, and
* return the length (in UChars).
+ *
+ * A srclen of -1 indicates that the input string is NUL-terminated.
*/
static int32_t
uchar_convert(UConverter *converter, UChar *dest, int32_t destlen,
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index c2d95411e0a..3b443df8014 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -109,18 +109,18 @@ extern pg_locale_t pg_newlocale_from_collation(Oid collid);
extern char *get_collation_actual_version(char collprovider, const char *collcollate);
extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale);
-extern int pg_strncoll(const char *arg1, size_t len1,
- const char *arg2, size_t len2, pg_locale_t locale);
+extern int pg_strncoll(const char *arg1, ssize_t len1,
+ const char *arg2, ssize_t len2, pg_locale_t locale);
extern bool pg_strxfrm_enabled(pg_locale_t locale);
extern size_t pg_strxfrm(char *dest, const char *src, size_t destsize,
pg_locale_t locale);
extern size_t pg_strnxfrm(char *dest, size_t destsize, const char *src,
- size_t srclen, pg_locale_t locale);
+ ssize_t srclen, pg_locale_t locale);
extern bool pg_strxfrm_prefix_enabled(pg_locale_t locale);
extern size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize,
pg_locale_t locale);
extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
- size_t srclen, pg_locale_t locale);
+ ssize_t srclen, pg_locale_t locale);
extern int builtin_locale_encoding(const char *locale);
extern const char *builtin_validate_locale(int encoding, const char *locale);
--
2.34.1
Attachment: v4-0005-invalidation.patch (text/x-patch; charset=UTF-8; name=v4-0005-invalidation.patch) [Download]
From 2f51247615a36dc257b700c2832f3d4aa32fce64 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Wed, 18 Sep 2024 17:49:57 -0700
Subject: [PATCH v4 5/7] invalidation
---
src/backend/utils/adt/pg_locale.c | 41 +++++++++++++++++++++++++------
1 file changed, 33 insertions(+), 8 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 9d1d71f1561..c89ac3b9e01 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -63,6 +63,7 @@
#include "utils/builtins.h"
#include "utils/formatting.h"
#include "utils/guc_hooks.h"
+#include "utils/inval.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_locale.h"
@@ -1695,6 +1696,34 @@ create_pg_locale(MemoryContext context, ResourceOwner owner, Oid collid)
return result;
}
+/*
+ * Syscache invalidation callback for COLLOID, registered from
+ * pg_newlocale_from_collation() the first time the cache is built.
+ *
+ * The arguments are ignored: any collation invalidation discards the whole
+ * cache. That means forgetting the one-entry "last lookup" shortcut,
+ * releasing everything tracked by CollationCacheOwner, and rebuilding an
+ * empty hash table in a freshly reset CollationCacheContext.
+ */
+static void
+CollationCacheInvalidate(Datum arg, int cacheid, uint32 hashvalue)
+{
+ /* the remembered "last lookup" result must not be handed out any more */
+ last_collation_cache_oid = InvalidOid;
+
+ /* cache was never built, so there is nothing to flush */
+ if (CollationCache == NULL)
+ return;
+
+ /* run all three release phases before deleting the owner */
+ ResourceOwnerRelease(CollationCacheOwner,
+ RESOURCE_RELEASE_BEFORE_LOCKS,
+ false, true);
+ ResourceOwnerRelease(CollationCacheOwner,
+ RESOURCE_RELEASE_LOCKS,
+ false, true);
+ ResourceOwnerRelease(CollationCacheOwner,
+ RESOURCE_RELEASE_AFTER_LOCKS,
+ false, true);
+ ResourceOwnerDelete(CollationCacheOwner);
+ /* fresh owner for whatever gets cached from now on */
+ CollationCacheOwner = ResourceOwnerCreate(NULL, "collation cache");
+
+ /* frees the old hash table and everything else allocated in the context */
+ MemoryContextReset(CollationCacheContext);
+
+ /* the old table is gone; create an empty replacement */
+ CollationCache = collation_cache_create(CollationCacheContext,
+ 16, NULL);
+}
+
+
/*
* Create or retrieve a pg_locale_t for the given collation OID. Results are
* cached for the lifetime of the backend.
@@ -1714,14 +1743,7 @@ pg_newlocale_from_collation(Oid collid)
if (last_collation_cache_oid == collid)
return last_collation_cache_locale;
- /*
- * Cache mechanism for collation information.
- *
- * Note that we currently lack any way to flush the cache. Since we don't
- * support ALTER COLLATION, this is OK. The worst case is that someone
- * drops a collation, and a useless cache entry hangs around in existing
- * backends.
- */
+ /* cache mechanism for collation information */
if (CollationCache == NULL)
{
CollationCacheOwner = ResourceOwnerCreate(NULL, "collation cache");
@@ -1730,6 +1752,9 @@ pg_newlocale_from_collation(Oid collid)
ALLOCSET_DEFAULT_SIZES);
CollationCache = collation_cache_create(CollationCacheContext,
16, NULL);
+ CacheRegisterSyscacheCallback(COLLOID,
+ CollationCacheInvalidate,
+ (Datum) 0);
}
cache_entry = collation_cache_insert(CollationCache, collid, &found);
--
2.34.1
Attachment: v4-0004-resource-owners.patch (text/x-patch; charset=UTF-8; name=v4-0004-resource-owners.patch) [Download]
From 5ae3b1be6489617a1639141749c31d2f4419a676 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Wed, 18 Sep 2024 16:55:42 -0700
Subject: [PATCH v4 4/7] resource owners
---
src/backend/utils/adt/pg_locale.c | 74 ++++++++++++++++++++++++++++++-
1 file changed, 72 insertions(+), 2 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index d3d9c3920e6..9d1d71f1561 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -66,6 +66,7 @@
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/pg_locale.h"
+#include "utils/resowner.h"
#include "utils/syscache.h"
#ifdef USE_ICU
@@ -148,6 +149,12 @@ typedef struct
#define SH_DEFINE
#include "lib/simplehash.h"
+/*
+ * Collator objects (UCollator for ICU or locale_t for libc) are allocated in
+ * an external library, so track them using a resource owner.
+ */
+static ResourceOwner CollationCacheOwner = NULL;
+
static MemoryContext CollationCacheContext = NULL;
static collation_cache_hash *CollationCache = NULL;
@@ -179,8 +186,35 @@ static int32_t uchar_convert(UConverter *converter,
const char *src, int32_t srclen);
static void icu_set_collation_attributes(UCollator *collator, const char *loc,
UErrorCode *status);
+
+static void ResourceOwnerRememberUCollator(ResourceOwner owner,
+ UCollator *collator);
+static void ResOwnerReleaseUCollator(Datum val);
+
+static const ResourceOwnerDesc UCollatorResourceKind =
+{
+ .name = "UCollator reference",
+ .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
+ .release_priority = RELEASE_PRIO_LAST,
+ .ReleaseResource = ResOwnerReleaseUCollator,
+ .DebugPrint = NULL /* the default message is fine */
+};
#endif
+static void ResourceOwnerRememberLocaleT(ResourceOwner owner,
+ locale_t locale);
+static void ResOwnerReleaseLocaleT(Datum val);
+
+static const ResourceOwnerDesc LocaleTResourceKind =
+{
+ .name = "locale_t reference",
+ .release_phase = RESOURCE_RELEASE_AFTER_LOCKS,
+ .release_priority = RELEASE_PRIO_LAST,
+ .ReleaseResource = ResOwnerReleaseLocaleT,
+ .DebugPrint = NULL /* the default message is fine */
+};
+
+
/*
* POSIX doesn't define _l-variants of these functions, but several systems
* have them. We provide our own replacements here.
@@ -1257,6 +1291,20 @@ report_newlocale_failure(const char *localename)
localename) : 0)));
}
+/*
+ * Record 'locale' in 'owner' as a LocaleTResourceKind resource, so that
+ * releasing the owner ends up calling ResOwnerReleaseLocaleT() on it.
+ */
+static void
+ResourceOwnerRememberLocaleT(ResourceOwner owner, locale_t locale)
+{
+ ResourceOwnerRemember(owner, PointerGetDatum(locale),
+ &LocaleTResourceKind);
+}
+
+/*
+ * ReleaseResource callback of LocaleTResourceKind: free the locale_t that
+ * ResourceOwnerRememberLocaleT() stored as a pointer Datum.
+ */
+static void
+ResOwnerReleaseLocaleT(Datum val)
+{
+ locale_t locale = (locale_t) DatumGetPointer(val);
+ freelocale(locale);
+}
+
/*
* Create a locale_t with the given collation and ctype.
*
@@ -1335,6 +1383,20 @@ make_libc_collator(const char *collate, const char *ctype)
* Ensure that no path leaks a UCollator.
*/
#ifdef USE_ICU
+/*
+ * Record 'collator' in 'owner' as a UCollatorResourceKind resource, so that
+ * releasing the owner ends up calling ResOwnerReleaseUCollator() on it.
+ */
+static void
+ResourceOwnerRememberUCollator(ResourceOwner owner, UCollator *collator)
+{
+ ResourceOwnerRemember(owner, PointerGetDatum(collator),
+ &UCollatorResourceKind);
+}
+
+/*
+ * ReleaseResource callback of UCollatorResourceKind: close the UCollator
+ * that ResourceOwnerRememberUCollator() stored as a pointer Datum.
+ */
+static void
+ResOwnerReleaseUCollator(Datum val)
+{
+ UCollator *collator = (UCollator *) DatumGetPointer(val);
+ ucol_close(collator);
+}
+
static UCollator *
make_icu_collator(const char *iculocstr, const char *icurules)
{
@@ -1495,7 +1557,7 @@ init_database_collation(void)
* allocating memory.
*/
static pg_locale_t
-create_pg_locale(MemoryContext context, Oid collid)
+create_pg_locale(MemoryContext context, ResourceOwner owner, Oid collid)
{
/* We haven't computed this yet in this session, so do it */
HeapTuple tp;
@@ -1582,7 +1644,10 @@ create_pg_locale(MemoryContext context, Oid collid)
datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype);
collctype = TextDatumGetCString(datum);
+ ResourceOwnerEnlarge(owner);
locale = make_libc_collator(collcollate, collctype);
+ if (locale)
+ ResourceOwnerRememberLocaleT(owner, locale);
result = MemoryContextAllocZero(context,
sizeof(struct pg_locale_struct));
@@ -1610,7 +1675,9 @@ create_pg_locale(MemoryContext context, Oid collid)
else
icurules = NULL;
+ ResourceOwnerEnlarge(owner);
collator = make_icu_collator(iculocstr, icurules);
+ ResourceOwnerRememberUCollator(owner, collator);
result = MemoryContextAllocZero(context,
sizeof(struct pg_locale_struct));
@@ -1657,6 +1724,7 @@ pg_newlocale_from_collation(Oid collid)
*/
if (CollationCache == NULL)
{
+ CollationCacheOwner = ResourceOwnerCreate(NULL, "collation cache");
CollationCacheContext = AllocSetContextCreate(TopMemoryContext,
"collation cache",
ALLOCSET_DEFAULT_SIZES);
@@ -1675,7 +1743,9 @@ pg_newlocale_from_collation(Oid collid)
}
if (cache_entry->locale == 0)
- cache_entry->locale = create_pg_locale(CollationCacheContext, collid);
+ cache_entry->locale = create_pg_locale(CollationCacheContext,
+ CollationCacheOwner,
+ collid);
last_collation_cache_oid = collid;
last_collation_cache_locale = cache_entry->locale;
--
2.34.1
Attachment: v4-0003-CollationCacheContext.patch (text/x-patch; charset=UTF-8; name=v4-0003-CollationCacheContext.patch) [Download]
From fca0efa184971f9780b356039aa3ed08a7445524 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Wed, 18 Sep 2024 15:55:37 -0700
Subject: [PATCH v4 3/7] CollationCacheContext
---
src/backend/utils/adt/pg_locale.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 1dec00b55ed..d3d9c3920e6 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1495,7 +1495,7 @@ init_database_collation(void)
* allocating memory.
*/
static pg_locale_t
-create_pg_locale(Oid collid)
+create_pg_locale(MemoryContext context, Oid collid)
{
/* We haven't computed this yet in this session, so do it */
HeapTuple tp;
@@ -1561,15 +1561,15 @@ create_pg_locale(Oid collid)
builtin_validate_locale(GetDatabaseEncoding(), locstr);
- result = MemoryContextAllocZero(TopMemoryContext,
+ result = MemoryContextAllocZero(context,
sizeof(struct pg_locale_struct));
result->provider = collform->collprovider;
result->deterministic = collform->collisdeterministic;
result->collate_is_c = true;
result->ctype_is_c = (strcmp(locstr, "C") == 0);
- result->info.builtin.locale = MemoryContextStrdup(TopMemoryContext,
- locstr);
+ result->info.builtin.locale = MemoryContextStrdup(context,
+ locstr);
}
else if (collform->collprovider == COLLPROVIDER_LIBC)
{
@@ -1584,7 +1584,7 @@ create_pg_locale(Oid collid)
locale = make_libc_collator(collcollate, collctype);
- result = MemoryContextAllocZero(TopMemoryContext,
+ result = MemoryContextAllocZero(context,
sizeof(struct pg_locale_struct));
result->provider = collform->collprovider;
@@ -1612,14 +1612,14 @@ create_pg_locale(Oid collid)
collator = make_icu_collator(iculocstr, icurules);
- result = MemoryContextAllocZero(TopMemoryContext,
+ result = MemoryContextAllocZero(context,
sizeof(struct pg_locale_struct));
result->provider = collform->collprovider;
result->deterministic = collform->collisdeterministic;
result->collate_is_c = false;
result->ctype_is_c = false;
- result->info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr);
+ result->info.icu.locale = MemoryContextStrdup(context, iculocstr);
result->info.icu.ucol = collator;
}
@@ -1675,7 +1675,7 @@ pg_newlocale_from_collation(Oid collid)
}
if (cache_entry->locale == 0)
- cache_entry->locale = create_pg_locale(collid);
+ cache_entry->locale = create_pg_locale(CollationCacheContext, collid);
last_collation_cache_oid = collid;
last_collation_cache_locale = cache_entry->locale;
--
2.34.1
Attachment: v4-0002-create_pg_locale.patch (text/x-patch; charset=UTF-8; name=v4-0002-create_pg_locale.patch) [Download]
From eccc4a4a83069c6a14465b4a9239a4d759aaa2a8 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Wed, 18 Sep 2024 15:53:56 -0700
Subject: [PATCH v4 2/7] create_pg_locale
---
src/backend/utils/adt/pg_locale.c | 310 +++++++++++++++---------------
1 file changed, 155 insertions(+), 155 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 12ba5726f77..1dec00b55ed 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1227,45 +1227,6 @@ IsoLocaleName(const char *winlocname)
#endif /* WIN32 && LC_MESSAGES */
-/*
- * Cache mechanism for collation information.
- *
- * Note that we currently lack any way to flush the cache. Since we don't
- * support ALTER COLLATION, this is OK. The worst case is that someone
- * drops a collation, and a useless cache entry hangs around in existing
- * backends.
- */
-static collation_cache_entry *
-lookup_collation_cache(Oid collation)
-{
- collation_cache_entry *cache_entry;
- bool found;
-
- Assert(OidIsValid(collation));
- Assert(collation != DEFAULT_COLLATION_OID);
-
- if (CollationCache == NULL)
- {
- CollationCacheContext = AllocSetContextCreate(TopMemoryContext,
- "collation cache",
- ALLOCSET_DEFAULT_SIZES);
- CollationCache = collation_cache_create(CollationCacheContext,
- 16, NULL);
- }
-
- cache_entry = collation_cache_insert(CollationCache, collation, &found);
- if (!found)
- {
- /*
- * Make sure cache entry is marked invalid, in case we fail before
- * setting things.
- */
- cache_entry->locale = 0;
- }
-
- return cache_entry;
-}
-
/* simple subroutine for reporting errors from newlocale() */
static void
report_newlocale_failure(const char *localename)
@@ -1530,153 +1491,192 @@ init_database_collation(void)
}
/*
- * Create a pg_locale_t from a collation OID. Results are cached for the
- * lifetime of the backend. Thus, do not free the result with freelocale().
- *
- * For simplicity, we always generate COLLATE + CTYPE even though we
- * might only need one of them. Since this is called only once per session,
- * it shouldn't cost much.
+ * Create and initialize a pg_locale_t. Be careful to check for errors before
+ * allocating memory.
*/
-pg_locale_t
-pg_newlocale_from_collation(Oid collid)
+static pg_locale_t
+create_pg_locale(Oid collid)
{
- collation_cache_entry *cache_entry;
-
- if (collid == DEFAULT_COLLATION_OID)
- return &default_locale;
+ /* We haven't computed this yet in this session, so do it */
+ HeapTuple tp;
+ Form_pg_collation collform;
+ pg_locale_t result;
+ Datum datum;
+ bool isnull;
- if (!OidIsValid(collid))
+ tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
+ if (!HeapTupleIsValid(tp))
elog(ERROR, "cache lookup failed for collation %u", collid);
+ collform = (Form_pg_collation) GETSTRUCT(tp);
- if (last_collation_cache_oid == collid)
- return last_collation_cache_locale;
-
- cache_entry = lookup_collation_cache(collid);
-
- if (cache_entry->locale == 0)
+ /* compare version in catalog to version from provider */
+ datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion,
+ &isnull);
+ if (!isnull)
{
- /* We haven't computed this yet in this session, so do it */
- HeapTuple tp;
- Form_pg_collation collform;
- struct pg_locale_struct result;
- pg_locale_t resultp;
- Datum datum;
- bool isnull;
+ char *actual_versionstr;
+ char *collversionstr;
- tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid));
- if (!HeapTupleIsValid(tp))
- elog(ERROR, "cache lookup failed for collation %u", collid);
- collform = (Form_pg_collation) GETSTRUCT(tp);
+ collversionstr = TextDatumGetCString(datum);
- /* We'll fill in the result struct locally before allocating memory */
- memset(&result, 0, sizeof(result));
- result.provider = collform->collprovider;
- result.deterministic = collform->collisdeterministic;
+ if (collform->collprovider == COLLPROVIDER_LIBC)
+ datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
+ else
+ datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
- if (collform->collprovider == COLLPROVIDER_BUILTIN)
+ actual_versionstr = get_collation_actual_version(collform->collprovider,
+ TextDatumGetCString(datum));
+ if (!actual_versionstr)
{
- const char *locstr;
+ /*
+ * This could happen when specifying a version in CREATE
+ * COLLATION but the provider does not support versioning, or
+ * manually creating a mess in the catalogs.
+ */
+ ereport(ERROR,
+ (errmsg("collation \"%s\" has no actual version, but a version was recorded",
+ NameStr(collform->collname))));
+ }
- datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
- locstr = TextDatumGetCString(datum);
+ if (strcmp(actual_versionstr, collversionstr) != 0)
+ ereport(WARNING,
+ (errmsg("collation \"%s\" has version mismatch",
+ NameStr(collform->collname)),
+ errdetail("The collation in the database was created using version %s, "
+ "but the operating system provides version %s.",
+ collversionstr, actual_versionstr),
+ errhint("Rebuild all objects affected by this collation and run "
+ "ALTER COLLATION %s REFRESH VERSION, "
+ "or build PostgreSQL with the right library version.",
+ quote_qualified_identifier(get_namespace_name(collform->collnamespace),
+ NameStr(collform->collname)))));
+ }
- result.collate_is_c = true;
- result.ctype_is_c = (strcmp(locstr, "C") == 0);
+ if (collform->collprovider == COLLPROVIDER_BUILTIN)
+ {
+ const char *locstr;
- builtin_validate_locale(GetDatabaseEncoding(), locstr);
+ datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
+ locstr = TextDatumGetCString(datum);
- result.info.builtin.locale = MemoryContextStrdup(TopMemoryContext,
- locstr);
- }
- else if (collform->collprovider == COLLPROVIDER_LIBC)
- {
- const char *collcollate;
- const char *collctype;
+ builtin_validate_locale(GetDatabaseEncoding(), locstr);
- datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
- collcollate = TextDatumGetCString(datum);
- datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype);
- collctype = TextDatumGetCString(datum);
+ result = MemoryContextAllocZero(TopMemoryContext,
+ sizeof(struct pg_locale_struct));
- result.collate_is_c = (strcmp(collcollate, "C") == 0) ||
- (strcmp(collcollate, "POSIX") == 0);
- result.ctype_is_c = (strcmp(collctype, "C") == 0) ||
- (strcmp(collctype, "POSIX") == 0);
+ result->provider = collform->collprovider;
+ result->deterministic = collform->collisdeterministic;
+ result->collate_is_c = true;
+ result->ctype_is_c = (strcmp(locstr, "C") == 0);
+ result->info.builtin.locale = MemoryContextStrdup(TopMemoryContext,
+ locstr);
+ }
+ else if (collform->collprovider == COLLPROVIDER_LIBC)
+ {
+ const char *collcollate;
+ const char *collctype;
+ locale_t locale;
+
+ datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
+ collcollate = TextDatumGetCString(datum);
+ datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype);
+ collctype = TextDatumGetCString(datum);
+
+ locale = make_libc_collator(collcollate, collctype);
+
+ result = MemoryContextAllocZero(TopMemoryContext,
+ sizeof(struct pg_locale_struct));
+
+ result->provider = collform->collprovider;
+ result->deterministic = collform->collisdeterministic;
+ result->collate_is_c = (strcmp(collcollate, "C") == 0) ||
+ (strcmp(collcollate, "POSIX") == 0);
+ result->ctype_is_c = (strcmp(collctype, "C") == 0) ||
+ (strcmp(collctype, "POSIX") == 0);
+ result->info.lt = locale;
+ }
+ else if (collform->collprovider == COLLPROVIDER_ICU)
+ {
+ const char *iculocstr;
+ const char *icurules;
+ UCollator *collator;
- result.info.lt = make_libc_collator(collcollate, collctype);
- }
- else if (collform->collprovider == COLLPROVIDER_ICU)
- {
- const char *iculocstr;
- const char *icurules;
+ datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
+ iculocstr = TextDatumGetCString(datum);
- datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
- iculocstr = TextDatumGetCString(datum);
+ datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull);
+ if (!isnull)
+ icurules = TextDatumGetCString(datum);
+ else
+ icurules = NULL;
- result.collate_is_c = false;
- result.ctype_is_c = false;
+ collator = make_icu_collator(iculocstr, icurules);
- datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull);
- if (!isnull)
- icurules = TextDatumGetCString(datum);
- else
- icurules = NULL;
+ result = MemoryContextAllocZero(TopMemoryContext,
+ sizeof(struct pg_locale_struct));
- result.info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr);
- result.info.icu.ucol = make_icu_collator(iculocstr, icurules);
- }
+ result->provider = collform->collprovider;
+ result->deterministic = collform->collisdeterministic;
+ result->collate_is_c = false;
+ result->ctype_is_c = false;
+ result->info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr);
+ result->info.icu.ucol = collator;
+ }
- datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion,
- &isnull);
- if (!isnull)
- {
- char *actual_versionstr;
- char *collversionstr;
+ ReleaseSysCache(tp);
- collversionstr = TextDatumGetCString(datum);
+ return result;
+}
- if (collform->collprovider == COLLPROVIDER_LIBC)
- datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate);
- else
- datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale);
+/*
+ * Create or retrieve a pg_locale_t for the given collation OID. Results are
+ * cached for the lifetime of the backend.
+ */
+pg_locale_t
+pg_newlocale_from_collation(Oid collid)
+{
+ collation_cache_entry *cache_entry;
+ bool found;
- actual_versionstr = get_collation_actual_version(collform->collprovider,
- TextDatumGetCString(datum));
- if (!actual_versionstr)
- {
- /*
- * This could happen when specifying a version in CREATE
- * COLLATION but the provider does not support versioning, or
- * manually creating a mess in the catalogs.
- */
- ereport(ERROR,
- (errmsg("collation \"%s\" has no actual version, but a version was recorded",
- NameStr(collform->collname))));
- }
+ if (collid == DEFAULT_COLLATION_OID)
+ return &default_locale;
- if (strcmp(actual_versionstr, collversionstr) != 0)
- ereport(WARNING,
- (errmsg("collation \"%s\" has version mismatch",
- NameStr(collform->collname)),
- errdetail("The collation in the database was created using version %s, "
- "but the operating system provides version %s.",
- collversionstr, actual_versionstr),
- errhint("Rebuild all objects affected by this collation and run "
- "ALTER COLLATION %s REFRESH VERSION, "
- "or build PostgreSQL with the right library version.",
- quote_qualified_identifier(get_namespace_name(collform->collnamespace),
- NameStr(collform->collname)))));
- }
+ if (!OidIsValid(collid))
+ elog(ERROR, "cache lookup failed for collation %u", collid);
- ReleaseSysCache(tp);
+ if (last_collation_cache_oid == collid)
+ return last_collation_cache_locale;
- /* We'll keep the pg_locale_t structures in TopMemoryContext */
- resultp = MemoryContextAlloc(TopMemoryContext, sizeof(*resultp));
- *resultp = result;
+ /*
+ * Cache mechanism for collation information.
+ *
+ * Note that we currently lack any way to flush the cache. Since we don't
+ * support ALTER COLLATION, this is OK. The worst case is that someone
+ * drops a collation, and a useless cache entry hangs around in existing
+ * backends.
+ */
+ if (CollationCache == NULL)
+ {
+ CollationCacheContext = AllocSetContextCreate(TopMemoryContext,
+ "collation cache",
+ ALLOCSET_DEFAULT_SIZES);
+ CollationCache = collation_cache_create(CollationCacheContext,
+ 16, NULL);
+ }
- cache_entry->locale = resultp;
+ cache_entry = collation_cache_insert(CollationCache, collid, &found);
+ if (!found)
+ {
+ /*
+ * Make sure cache entry is marked invalid, in case we fail before
+ * setting things.
+ */
+ cache_entry->locale = 0;
}
+ if (cache_entry->locale == 0)
+ cache_entry->locale = create_pg_locale(collid);
+
last_collation_cache_oid = collid;
last_collation_cache_locale = cache_entry->locale;
--
2.34.1