From c6721272200c14931ad757185a3aaeb615c432ed Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Mon, 24 Apr 2023 15:46:17 -0700
Subject: [PATCH 1/2] Interpret C locales consistently between ICU and libc.

Treat a locale named C, C.anything, POSIX, or POSIX.anything as
equivalent to the C locale; implemented with built-in semantics
(memcmp() for collation and pg_ascii_*() for ctype).

Such locales are not passed to the provider at all, so they behave
identically regardless of whether they are declared with provider ICU
or libc.

Previously, only C and POSIX locales had this behavior (not
e.g. "C.UTF-8"), and only if the provider was declared as libc. That
caused problems on libc for locales like C.UTF-8, which may have
subtly different behavior in some versions of libc; and it caused
problems on ICU because newer ICU versions don't recognize C locales.

Discussion: https://postgr.es/m/1559006.1685040536@sss.pgh.pa.us
Discussion: https://postgr.es/m/c840107b-4cb9-c8e9-abb7-1d8c5e0d51df%40enterprisedb.com
Discussion: https://postgr.es/m/87v8hoexdv.fsf@news-spur.riddles.org.uk
---
 doc/src/sgml/charset.sgml                     |   3 +-
 src/backend/commands/collationcmds.c          |  42 +++---
 src/backend/commands/dbcommands.c             |  41 +++---
 src/backend/utils/adt/pg_locale.c             | 126 +++++++++++++-----
 src/backend/utils/init/postinit.c             |   4 +-
 src/backend/utils/mb/mbutils.c                |   3 +-
 src/include/utils/pg_locale.h                 |   1 +
 .../regress/expected/collate.icu.utf8.out     |   6 +
 src/test/regress/expected/collate.out         |   5 +
 src/test/regress/sql/collate.icu.utf8.sql     |   4 +
 src/test/regress/sql/collate.sql              |   5 +
 11 files changed, 167 insertions(+), 73 deletions(-)

diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml
index ed84465996..8ba3117557 100644
--- a/doc/src/sgml/charset.sgml
+++ b/doc/src/sgml/charset.sgml
@@ -136,7 +136,8 @@ initdb --locale=sv_SE
    <para>
     If you want the system to behave as if it had no locale support,
     use the special locale name <literal>C</literal>, or equivalently
-    <literal>POSIX</literal>.
+    <literal>POSIX</literal>. An encoding may also be appended, for
+    example <literal>C.UTF-8</literal>.
    </para>
 
    <para>
diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c
index 2969a2bb21..a451ae8843 100644
--- a/src/backend/commands/collationcmds.c
+++ b/src/backend/commands/collationcmds.c
@@ -264,26 +264,38 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e
 						(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
 						 errmsg("parameter \"locale\" must be specified")));
 
-			/*
-			 * During binary upgrade, preserve the locale string. Otherwise,
-			 * canonicalize to a language tag.
-			 */
-			if (!IsBinaryUpgrade)
+			if (locale_name_is_c(colliculocale))
 			{
-				char	   *langtag = icu_language_tag(colliculocale,
-													   icu_validation_level);
-
-				if (langtag && strcmp(colliculocale, langtag) != 0)
+				if (!collisdeterministic)
+					ereport(ERROR,
+							(errmsg("nondeterministic collations not supported for C or POSIX locale")));
+				if (collicurules != NULL)
+					ereport(ERROR,
+							(errmsg("RULES not supported for C or POSIX locale")));
+			}
+			else
+			{
+				/*
+				 * During binary upgrade, preserve the locale
+				 * string. Otherwise, canonicalize to a language tag.
+				 */
+				if (!IsBinaryUpgrade)
 				{
-					ereport(NOTICE,
-							(errmsg("using standard form \"%s\" for locale \"%s\"",
-									langtag, colliculocale)));
+					char	   *langtag = icu_language_tag(colliculocale,
+														   icu_validation_level);
+
+					if (langtag && strcmp(colliculocale, langtag) != 0)
+					{
+						ereport(NOTICE,
+								(errmsg("using standard form \"%s\" for locale \"%s\"",
+										langtag, colliculocale)));
 
-					colliculocale = langtag;
+						colliculocale = langtag;
+					}
 				}
-			}
 
-			icu_validate_locale(colliculocale);
+				icu_validate_locale(colliculocale);
+			}
 		}
 
 		/*
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 99d4080ea9..601a08ef11 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1058,27 +1058,36 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 					 errmsg("ICU locale must be specified")));
 
-		/*
-		 * During binary upgrade, or when the locale came from the template
-		 * database, preserve locale string. Otherwise, canonicalize to a
-		 * language tag.
-		 */
-		if (!IsBinaryUpgrade && dbiculocale != src_iculocale)
+		if (locale_name_is_c(dbiculocale))
 		{
-			char	   *langtag = icu_language_tag(dbiculocale,
-												   icu_validation_level);
-
-			if (langtag && strcmp(dbiculocale, langtag) != 0)
+			if (dbicurules != NULL)
+				ereport(ERROR,
+						(errmsg("ICU_RULES not supported for C or POSIX locale")));
+		}
+		else
+		{
+			/*
+			 * During binary upgrade, or when the locale came from the
+			 * template database, preserve locale string. Otherwise,
+			 * canonicalize to a language tag.
+			 */
+			if (!IsBinaryUpgrade && dbiculocale != src_iculocale)
 			{
-				ereport(NOTICE,
-						(errmsg("using standard form \"%s\" for locale \"%s\"",
-								langtag, dbiculocale)));
+				char	   *langtag = icu_language_tag(dbiculocale,
+													   icu_validation_level);
+
+				if (langtag && strcmp(dbiculocale, langtag) != 0)
+				{
+					ereport(NOTICE,
+							(errmsg("using standard form \"%s\" for locale \"%s\"",
+									langtag, dbiculocale)));
 
-				dbiculocale = langtag;
+					dbiculocale = langtag;
+				}
 			}
-		}
 
-		icu_validate_locale(dbiculocale);
+			icu_validate_locale(dbiculocale);
+		}
 	}
 	else
 	{
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 31e3b16ae0..2f2734a405 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1239,15 +1239,19 @@ lookup_collation_cache(Oid collation, bool set_flags)
 			datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype);
 			collctype = TextDatumGetCString(datum);
 
-			cache_entry->collate_is_c = ((strcmp(collcollate, "C") == 0) ||
-										 (strcmp(collcollate, "POSIX") == 0));
-			cache_entry->ctype_is_c = ((strcmp(collctype, "C") == 0) ||
-									   (strcmp(collctype, "POSIX") == 0));
+			cache_entry->collate_is_c = locale_name_is_c(collcollate);
+			cache_entry->ctype_is_c = locale_name_is_c(collctype);
 		}
 		else
 		{
-			cache_entry->collate_is_c = false;
-			cache_entry->ctype_is_c = false;
+			Datum		datum;
+			const char *colliculocale;
+
+			datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colliculocale);
+			colliculocale = TextDatumGetCString(datum);
+
+			cache_entry->collate_is_c = locale_name_is_c(colliculocale);
+			cache_entry->ctype_is_c = cache_entry->collate_is_c;
 		}
 
 		cache_entry->flags_valid = true;
@@ -1258,6 +1262,22 @@ lookup_collation_cache(Oid collation, bool set_flags)
 	return cache_entry;
 }
 
+/*
+ * Check if the locale name should be handled like the C locale: exactly "C"
+ * or "POSIX", or either followed by ".anything" (e.g. "C.UTF-8"). Matching
+ * is case-sensitive; locale must not be NULL (it is passed to strcmp()).
+ *
+ * If so, use built-in memcmp() and pg_ascii_*(), not the collation provider.
+ */
+bool
+locale_name_is_c(const char *locale)
+{
+	if (strcmp(locale, "C") == 0 || strncmp(locale, "C.", 2) == 0 ||
+		strcmp(locale, "POSIX") == 0 || strncmp(locale, "POSIX.", 6) == 0)
+		return true;
+
+	return false;
+}
 
 /*
  * Detect whether collation's LC_COLLATE property is C
@@ -1279,23 +1299,30 @@ lc_collate_is_c(Oid collation)
 	if (collation == DEFAULT_COLLATION_OID)
 	{
 		static int	result = -1;
-		char	   *localeptr;
-
-		if (default_locale.provider == COLLPROVIDER_ICU)
-			return false;
+		const char *localeptr;
 
 		if (result >= 0)
 			return (bool) result;
-		localeptr = setlocale(LC_COLLATE, NULL);
-		if (!localeptr)
-			elog(ERROR, "invalid LC_COLLATE setting");
-
-		if (strcmp(localeptr, "C") == 0)
-			result = true;
-		else if (strcmp(localeptr, "POSIX") == 0)
-			result = true;
+
+		if (default_locale.provider == COLLPROVIDER_ICU)
+		{
+#ifdef USE_ICU
+			localeptr = default_locale.info.icu.locale;
+#else
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("ICU is not supported in this build")));
+#endif
+		}
 		else
-			result = false;
+		{
+			localeptr = setlocale(LC_COLLATE, NULL);
+			if (!localeptr)
+				elog(ERROR, "invalid LC_COLLATE setting");
+		}
+
+		result = locale_name_is_c(localeptr);
+
 		return (bool) result;
 	}
 
@@ -1332,23 +1359,30 @@ lc_ctype_is_c(Oid collation)
 	if (collation == DEFAULT_COLLATION_OID)
 	{
 		static int	result = -1;
-		char	   *localeptr;
-
-		if (default_locale.provider == COLLPROVIDER_ICU)
-			return false;
+		const char *localeptr;
 
 		if (result >= 0)
 			return (bool) result;
-		localeptr = setlocale(LC_CTYPE, NULL);
-		if (!localeptr)
-			elog(ERROR, "invalid LC_CTYPE setting");
-
-		if (strcmp(localeptr, "C") == 0)
-			result = true;
-		else if (strcmp(localeptr, "POSIX") == 0)
-			result = true;
+
+		if (default_locale.provider == COLLPROVIDER_ICU)
+		{
+#ifdef USE_ICU
+			localeptr = default_locale.info.icu.locale;
+#else
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("ICU is not supported in this build")));
+#endif
+		}
 		else
-			result = false;
+		{
+			localeptr = setlocale(LC_CTYPE, NULL);
+			if (!localeptr)
+				elog(ERROR, "invalid LC_CTYPE setting");
+		}
+
+		result = locale_name_is_c(localeptr);
+
 		return (bool) result;
 	}
 
@@ -1375,7 +1409,13 @@ make_icu_collator(const char *iculocstr,
 #ifdef USE_ICU
 	UCollator  *collator;
 
-	collator = pg_ucol_open(iculocstr);
+	if (locale_name_is_c(iculocstr))
+	{
+		Assert(icurules == NULL);
+		collator = NULL;
+	}
+	else
+		collator = pg_ucol_open(iculocstr);
 
 	/*
 	 * If rules are specified, we extract the rules of the standard collation,
@@ -1525,6 +1565,9 @@ pg_newlocale_from_collation(Oid collid)
 			datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype);
 			collctype = TextDatumGetCString(datum);
 
+			Assert(!locale_name_is_c(collcollate));
+			Assert(!locale_name_is_c(collctype));
+
 			if (strcmp(collcollate, collctype) == 0)
 			{
 				/* Normal case where they're the same */
@@ -1581,6 +1624,8 @@ pg_newlocale_from_collation(Oid collid)
 			datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colliculocale);
 			iculocstr = TextDatumGetCString(datum);
 
+			Assert(!locale_name_is_c(iculocstr));
+
 			datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull);
 			if (!isnull)
 				icurules = TextDatumGetCString(datum);
@@ -1650,6 +1695,9 @@ get_collation_actual_version(char collprovider, const char *collcollate)
 {
 	char	   *collversion = NULL;
 
+	if (locale_name_is_c(collcollate))
+		return NULL;
+
 #ifdef USE_ICU
 	if (collprovider == COLLPROVIDER_ICU)
 	{
@@ -1667,10 +1715,7 @@ get_collation_actual_version(char collprovider, const char *collcollate)
 	}
 	else
 #endif
-		if (collprovider == COLLPROVIDER_LIBC &&
-			pg_strcasecmp("C", collcollate) != 0 &&
-			pg_strncasecmp("C.", collcollate, 2) != 0 &&
-			pg_strcasecmp("POSIX", collcollate) != 0)
+		if (collprovider == COLLPROVIDER_LIBC)
 	{
 #if defined(__GLIBC__)
 		/* Use the glibc version because we don't have anything better. */
@@ -2457,6 +2502,13 @@ pg_ucol_open(const char *loc_str)
 	if (loc_str == NULL)
 		elog(ERROR, "opening default collator is not supported");
 
+	/*
+	 * Must never open special values C or POSIX, which are treated specially
+	 * and not passed to the provider.
+	 */
+	if (locale_name_is_c(loc_str))
+		elog(ERROR, "unexpected ICU locale string: %s", loc_str);
+
 	/*
 	 * In ICU versions 54 and earlier, "und" is not a recognized spelling of
 	 * the root locale. If the first component of the locale is "und", replace
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 6856ed99e7..92928133c0 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -419,9 +419,7 @@ CheckMyDatabase(const char *name, bool am_superuser, bool override_allow_connect
 						   " which is not recognized by setlocale().", ctype),
 				 errhint("Recreate the database with another locale or install the missing locale.")));
 
-	if (strcmp(ctype, "C") == 0 ||
-		strcmp(ctype, "POSIX") == 0)
-		database_ctype_is_c = true;
+	database_ctype_is_c = locale_name_is_c(ctype);
 
 	if (dbform->datlocprovider == COLLPROVIDER_ICU)
 	{
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index 67a1ab2ab2..9a54a952e0 100644
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -39,6 +39,7 @@
 #include "mb/pg_wchar.h"
 #include "utils/builtins.h"
 #include "utils/memutils.h"
+#include "utils/pg_locale.h"
 #include "utils/syscache.h"
 #include "varatt.h"
 
@@ -1239,7 +1240,7 @@ pg_bind_textdomain_codeset(const char *domainname)
 #ifndef WIN32
 	const char *ctype = setlocale(LC_CTYPE, NULL);
 
-	if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
+	if (locale_name_is_c(ctype))
 #endif
 		if (encoding != PG_SQL_ASCII &&
 			raw_pg_bind_textdomain_codeset(domainname, encoding))
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index e2a7243542..0e26346546 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -54,6 +54,7 @@ extern PGDLLIMPORT bool database_ctype_is_c;
 extern bool check_locale(int category, const char *locale, char **canonname);
 extern char *pg_perm_setlocale(int category, const char *locale);
 
+extern bool locale_name_is_c(const char *locale);
 extern bool lc_collate_is_c(Oid collation);
 extern bool lc_ctype_is_c(Oid collation);
 
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index c658ee1404..79ce33abbd 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1043,12 +1043,18 @@ ERROR:  ICU locale "nonsense-nowhere" has unknown language "nonsense"
 HINT:  To disable ICU locale validation, set parameter icu_validation_level to DISABLED.
 CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); -- fails
 ERROR:  could not convert locale name "@colStrength=primary;nonsense=yes" to language tag: U_ILLEGAL_ARGUMENT_ERROR
+CREATE COLLATION testx (provider = icu, locale = 'C', deterministic = false); -- fails
+ERROR:  nondeterministic collations not supported for C or POSIX locale
+CREATE COLLATION testx (provider = icu, locale = 'C', rules = '&V << w <<< W'); -- fails
+ERROR:  RULES not supported for C or POSIX locale
 RESET icu_validation_level;
 CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); DROP COLLATION testx;
 WARNING:  could not convert locale name "@colStrength=primary;nonsense=yes" to language tag: U_ILLEGAL_ARGUMENT_ERROR
 CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); DROP COLLATION testx;
 WARNING:  ICU locale "nonsense-nowhere" has unknown language "nonsense"
 HINT:  To disable ICU locale validation, set parameter icu_validation_level to DISABLED.
+CREATE COLLATION testx (provider = icu, locale = 'C.UTF-8'); DROP COLLATION testx;
+CREATE COLLATION testx (provider = icu, locale = 'POSIX'); DROP COLLATION testx;
 CREATE COLLATION test4 FROM nonsense;
 ERROR:  collation "nonsense" for encoding "UTF8" does not exist
 CREATE COLLATION test5 FROM test0;
diff --git a/src/test/regress/expected/collate.out b/src/test/regress/expected/collate.out
index 0649564485..e2d0a39732 100644
--- a/src/test/regress/expected/collate.out
+++ b/src/test/regress/expected/collate.out
@@ -649,6 +649,11 @@ EXPLAIN (COSTS OFF)
    ->  Seq Scan on collate_test10
 (3 rows)
 
+-- test alternate spellings of special locale C
+CREATE COLLATION coll_c_locale ( LOCALE = "C.something" );
+DROP COLLATION coll_c_locale;
+CREATE COLLATION coll_c_locale ( LOCALE = "POSIX.something" );
+DROP COLLATION coll_c_locale;
 -- CREATE/DROP COLLATION
 CREATE COLLATION mycoll1 FROM "C";
 CREATE COLLATION mycoll2 ( LC_COLLATE = "POSIX", LC_CTYPE = "POSIX" );
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index 7bd0901281..adc6b7deec 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -379,9 +379,13 @@ CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, nee
 SET icu_validation_level = ERROR;
 CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); -- fails
 CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); -- fails
+CREATE COLLATION testx (provider = icu, locale = 'C', deterministic = false); -- fails
+CREATE COLLATION testx (provider = icu, locale = 'C', rules = '&V << w <<< W'); -- fails
 RESET icu_validation_level;
 CREATE COLLATION testx (provider = icu, locale = '@colStrength=primary;nonsense=yes'); DROP COLLATION testx;
 CREATE COLLATION testx (provider = icu, locale = 'nonsense-nowhere'); DROP COLLATION testx;
+CREATE COLLATION testx (provider = icu, locale = 'C.UTF-8'); DROP COLLATION testx;
+CREATE COLLATION testx (provider = icu, locale = 'POSIX'); DROP COLLATION testx;
 
 CREATE COLLATION test4 FROM nonsense;
 CREATE COLLATION test5 FROM test0;
diff --git a/src/test/regress/sql/collate.sql b/src/test/regress/sql/collate.sql
index c3d40fc195..10ff532169 100644
--- a/src/test/regress/sql/collate.sql
+++ b/src/test/regress/sql/collate.sql
@@ -241,6 +241,11 @@ EXPLAIN (COSTS OFF)
 EXPLAIN (COSTS OFF)
   SELECT * FROM collate_test10 ORDER BY x DESC, y COLLATE "C" ASC NULLS FIRST;
 
+-- test alternate spellings of special locale C
+CREATE COLLATION coll_c_locale ( LOCALE = "C.something" );
+DROP COLLATION coll_c_locale;
+CREATE COLLATION coll_c_locale ( LOCALE = "POSIX.something" );
+DROP COLLATION coll_c_locale;
 
 -- CREATE/DROP COLLATION
 
-- 
2.34.1

