From e339123ae9c41a1161d516173e67c543d5542a60 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Thu, 15 Jun 2023 15:18:50 -0700
Subject: [PATCH v1 2/2] ICU: do not convert locale 'C' to 'en-US-u-va-posix'.

Older versions of ICU canonicalize "C" to "en-US-u-va-posix"; but
starting in ICU version 64, the "C" locale is considered
obsolete. Postgres commit ea1db8ae70 introduced code to always
canonicalize "C" to "en-US-u-va-posix" for consistency and
convenience, but it was deemed too confusing.

This commit removes that code, so that "C" is treated like other ICU
locale names: canonicalization is attempted, and if it fails, the
behavior is controlled by icu_validation_level.

A similar change was previously committed as f7faa9976c, then reverted
due to an ICU-version-dependent test failure. This commit un-reverts
it, omitting the test because we now expect the behavior to depend on
the version of ICU being used.

Discussion: https://postgr.es/m/3a200aca-4672-4b37-fc91-5d198a323503%40eisentraut.org
Discussion: https://postgr.es/m/f83f089ee1e9acd5dbbbf3353294d24e1f196e95.camel@j-davis.com
Discussion: https://postgr.es/m/37520ec1ae9591f83132f82dbd625f3fc2d69c16.camel@j-davis.com
---
 src/backend/utils/adt/pg_locale.c             | 27 ++++---------------
 src/bin/initdb/initdb.c                       | 25 ++++-------------
 .../regress/expected/collate.icu.utf8.out     |  2 ++
 src/test/regress/sql/collate.icu.utf8.sql     |  2 ++
 4 files changed, 14 insertions(+), 42 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 31e3b16ae0..69a2fb3376 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -2783,26 +2783,10 @@ char *
 icu_language_tag(const char *loc_str, int elevel)
 {
 #ifdef USE_ICU
-	UErrorCode	status;
-	char		lang[ULOC_LANG_CAPACITY];
-	char	   *langtag;
-	size_t		buflen = 32;	/* arbitrary starting buffer size */
-	const bool	strict = true;
-
-	status = U_ZERO_ERROR;
-	uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
-	if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
-	{
-		if (elevel > 0)
-			ereport(elevel,
-					(errmsg("could not get language from locale \"%s\": %s",
-							loc_str, u_errorName(status))));
-		return NULL;
-	}
-
-	/* C/POSIX locales aren't handled by uloc_getLanguageTag() */
-	if (strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0)
-		return pstrdup("en-US-u-va-posix");
+	UErrorCode	 status;
+	char		*langtag;
+	size_t		 buflen = 32;	/* arbitrary starting buffer size */
+	const bool	 strict = true;
 
 	/*
 	 * A BCP47 language tag doesn't have a clearly-defined upper limit (cf.
@@ -2884,8 +2868,7 @@ icu_validate_locale(const char *loc_str)
 
 	/* check for special language name */
 	if (strcmp(lang, "") == 0 ||
-		strcmp(lang, "root") == 0 || strcmp(lang, "und") == 0 ||
-		strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0)
+		strcmp(lang, "root") == 0 || strcmp(lang, "und") == 0)
 		found = true;
 
 	/* search for matching language within ICU */
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index fa3af0d75c..a5ce1f66d5 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -2243,24 +2243,10 @@ static char *
 icu_language_tag(const char *loc_str)
 {
 #ifdef USE_ICU
-	UErrorCode	status;
-	char		lang[ULOC_LANG_CAPACITY];
-	char	   *langtag;
-	size_t		buflen = 32;	/* arbitrary starting buffer size */
-	const bool	strict = true;
-
-	status = U_ZERO_ERROR;
-	uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status);
-	if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING)
-	{
-		pg_fatal("could not get language from locale \"%s\": %s",
-				 loc_str, u_errorName(status));
-		return NULL;
-	}
-
-	/* C/POSIX locales aren't handled by uloc_getLanguageTag() */
-	if (strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0)
-		return pstrdup("en-US-u-va-posix");
+	UErrorCode	 status;
+	char		*langtag;
+	size_t		 buflen = 32;	/* arbitrary starting buffer size */
+	const bool	 strict = true;
 
 	/*
 	 * A BCP47 language tag doesn't have a clearly-defined upper limit (cf.
@@ -2326,8 +2312,7 @@ icu_validate_locale(const char *loc_str)
 
 	/* check for special language name */
 	if (strcmp(lang, "") == 0 ||
-		strcmp(lang, "root") == 0 || strcmp(lang, "und") == 0 ||
-		strcmp(lang, "c") == 0 || strcmp(lang, "posix") == 0)
+		strcmp(lang, "root") == 0 || strcmp(lang, "und") == 0)
 		found = true;
 
 	/* search for matching language within ICU */
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out
index b7fbee447f..78a9cb38fa 100644
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1020,6 +1020,7 @@ CREATE ROLE regress_test_role;
 CREATE SCHEMA test_schema;
 -- We need to do this this way to cope with varying names for encodings:
 SET client_min_messages TO WARNING;
+SET icu_validation_level = disabled;
 do $$
 BEGIN
   EXECUTE 'CREATE COLLATION test0 (provider = icu, locale = ' ||
@@ -1034,6 +1035,7 @@ BEGIN
           quote_literal((SELECT CASE WHEN datlocprovider='i' THEN daticulocale ELSE datcollate END FROM pg_database WHERE datname = current_database())) || ');';
 END
 $$;
+RESET icu_validation_level;
 RESET client_min_messages;
 CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale"
 ERROR:  parameter "locale" must be specified
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql
index 079d7ae39d..3db9e25913 100644
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -358,6 +358,7 @@ CREATE SCHEMA test_schema;
 
 -- We need to do this this way to cope with varying names for encodings:
 SET client_min_messages TO WARNING;
+SET icu_validation_level = disabled;
 
 do $$
 BEGIN
@@ -373,6 +374,7 @@ BEGIN
 END
 $$;
 
+RESET icu_validation_level;
 RESET client_min_messages;
 
 CREATE COLLATION test3 (provider = icu, lc_collate = 'en_US.utf8'); -- fail, needs "locale"
-- 
2.34.1

