From bb873845bf1c9bc019606a691d66c2d148a745e2 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Wed, 15 Feb 2023 23:05:08 -0800
Subject: [PATCH v1] For ICU collations, canonicalize locale names to language
 tags.

Before storing the locale name in the catalog, convert to a BCP47
language tag. The language tag should hold all of the necessary
information and also be an unambiguous representation of the locale.

During pg_upgrade, the previous locale string may need to be preserved
if the language tag resolves to a different actual locale.
---
 src/backend/commands/collationcmds.c |  39 ++++----
 src/backend/commands/dbcommands.c    |  58 +++++++++++
 src/backend/utils/adt/pg_locale.c    | 139 +++++++++++++++++++++++++--
 src/include/commands/dbcommands.h    |   1 +
 src/include/utils/pg_locale.h        |   3 +
 5 files changed, 211 insertions(+), 29 deletions(-)

diff --git a/src/backend/commands/collationcmds.c b/src/backend/commands/collationcmds.c
index eb62d285ea..3a98bbbd35 100644
--- a/src/backend/commands/collationcmds.c
+++ b/src/backend/commands/collationcmds.c
@@ -240,10 +240,23 @@ DefineCollation(ParseState *pstate, List *names, List *parameters, bool if_not_e
 		}
 		else if (collprovider == COLLPROVIDER_ICU)
 		{
+#ifdef USE_ICU
+			char *iculocale;
+
 			if (!colliculocale)
 				ereport(ERROR,
 						(errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
 						 errmsg("parameter \"locale\" must be specified")));
+
+			check_icu_locale(colliculocale);
+			iculocale = get_icu_locale(colliculocale);
+			if (iculocale)
+				colliculocale = iculocale;
+#else
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("ICU is not supported in this build")));
+#endif
 		}
 
 		/*
@@ -556,26 +569,6 @@ cmpaliases(const void *a, const void *b)
 
 
 #ifdef USE_ICU
-/*
- * Get the ICU language tag for a locale name.
- * The result is a palloc'd string.
- */
-static char *
-get_icu_language_tag(const char *localename)
-{
-	char		buf[ULOC_FULLNAME_CAPACITY];
-	UErrorCode	status;
-
-	status = U_ZERO_ERROR;
-	uloc_toLanguageTag(localename, buf, sizeof(buf), true, &status);
-	if (U_FAILURE(status))
-		ereport(ERROR,
-				(errmsg("could not convert locale name \"%s\" to language tag: %s",
-						localename, u_errorName(status))));
-
-	return pstrdup(buf);
-}
-
 /*
  * Get a comment (specifically, the display name) for an ICU locale.
  * The result is a palloc'd string, or NULL if we can't get a comment
@@ -938,7 +931,11 @@ pg_import_system_collations(PG_FUNCTION_ARGS)
 			else
 				name = uloc_getAvailable(i);
 
-			langtag = get_icu_language_tag(name);
+			langtag = icu_language_tag(name);
+			if (langtag == NULL)
+				ereport(ERROR,
+						(errmsg("could not convert locale name \"%s\" to language tag",
+								name)));
 			iculocstr = U_ICU_VERSION_MAJOR_NUM >= 54 ? langtag : name;
 
 			/*
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index ef05633bb0..ccb5304250 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1029,6 +1029,9 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 
 	if (dblocprovider == COLLPROVIDER_ICU)
 	{
+#ifdef USE_ICU
+		char *iculocale;
+
 		if (!(is_encoding_supported_by_icu(encoding)))
 			ereport(ERROR,
 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -1045,6 +1048,14 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 					 errmsg("ICU locale must be specified")));
 
 		check_icu_locale(dbiculocale);
+		iculocale = get_icu_locale(dbiculocale);
+		if (iculocale)
+			dbiculocale = iculocale;
+#else
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("ICU is not supported in this build")));
+#endif
 	}
 	else
 	{
@@ -1463,6 +1474,53 @@ check_encoding_locale_matches(int encoding, const char *collate, const char *cty
 						   pg_encoding_to_char(collate_encoding))));
 }
 
+/*
+ * Given the input ICU locale string, return a new string in a form suitable
+ * for storing in the catalog.
+ *
+ * Ordinarily this just converts to a language tag, but we need to make an
+ * allowance for invalid locale strings that come from earlier versions of
+ * Postgres while upgrading.
+ *
+ * Converting to a language tag performs "level 2 canonicalization". In
+ * addition to producing a consistent result format, level 2 canonicalization
+ * is able to more accurately interpret different input locale string formats,
+ * such as POSIX and .NET IDs. But prior to Postgres version 16, input locale
+ * strings were not canonicalized; the raw string provided by the user was
+ * stored in the catalog and passed directly to ucol_open().
+ *
+ * The raw string may resolve to the wrong actual collator when passed to
+ * directly ucol_open(), but indexes in older versions may depend on that
+ * actual collator. Therefore, during binary upgrade, we preserve the invalid
+ * raw string if it resolves to a different actual collator than the language
+ * tag. If it resolves to the same actual collator, then we proceed using the
+ * language tag.
+ */
+char *
+get_icu_locale(const char *requested_locale)
+{
+#ifdef USE_ICU
+	char *lang_tag = icu_language_tag(requested_locale);
+
+	if (lang_tag != NULL && IsBinaryUpgrade &&
+		!check_equivalent_icu_locales(requested_locale, lang_tag))
+	{
+		ereport(WARNING,
+				(errmsg("language tag \"%s\" resolves to different actual collator "
+						"than raw locale string \"%s\"",
+						lang_tag, requested_locale)));
+		pfree(lang_tag);
+		return pstrdup(requested_locale);
+	}
+
+	return lang_tag;
+#else
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("ICU is not supported in this build")));
+#endif
+}
+
 /* Error cleanup callback for createdb */
 static void
 createdb_failure_callback(int code, Datum arg)
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 059e4fd79f..52d9c8b5a6 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -1945,15 +1945,12 @@ icu_set_collation_attributes(UCollator *collator, const char *loc)
 	}
 }
 
-#endif							/* USE_ICU */
-
 /*
  * Check if the given locale ID is valid, and ereport(ERROR) if it isn't.
  */
 void
 check_icu_locale(const char *icu_locale)
 {
-#ifdef USE_ICU
 	UCollator  *collator;
 	UErrorCode	status;
 
@@ -1967,13 +1964,139 @@ check_icu_locale(const char *icu_locale)
 	if (U_ICU_VERSION_MAJOR_NUM < 54)
 		icu_set_collation_attributes(collator, icu_locale);
 	ucol_close(collator);
-#else
-	ereport(ERROR,
-			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-			 errmsg("ICU is not supported in this build")));
-#endif
 }
 
+/*
+ * Test if the given locales resolve to the same actual collator with the same
+ * attributes and version.
+ */
+bool
+check_equivalent_icu_locales(const char *locale1, const char *locale2)
+{
+	const UColAttribute collAtt[]	   = {
+		UCOL_FRENCH_COLLATION,
+		UCOL_ALTERNATE_HANDLING,
+		UCOL_CASE_FIRST,
+		UCOL_CASE_LEVEL,
+		UCOL_NORMALIZATION_MODE,
+		UCOL_DECOMPOSITION_MODE,
+		UCOL_STRENGTH,
+		UCOL_HIRAGANA_QUATERNARY_MODE,
+		UCOL_NUMERIC_COLLATION};
+	int              n_collAtt		   = sizeof(collAtt)/sizeof(*collAtt);
+	const char		*actual1, *actual2;
+	UVersionInfo     versionInfo1;
+	UVersionInfo	 versionInfo2;
+	char             version1[U_MAX_VERSION_STRING_LENGTH];
+	char             version2[U_MAX_VERSION_STRING_LENGTH];
+	UCollator		*collator1		   = NULL;
+	UCollator		*collator2		   = NULL;
+	UErrorCode		 status;
+	bool			 result			   = false;
+
+	/*
+	 * Be careful not to return without closing the collators.
+	 */
+
+	status = U_ZERO_ERROR;
+	collator1 = ucol_open(locale1, &status);
+	if (U_FAILURE(status))
+		goto cleanup;
+
+	status = U_ZERO_ERROR;
+	collator2 = ucol_open(locale2, &status);
+	if (U_FAILURE(status))
+		goto cleanup;
+
+	/* actual locale */
+	status = U_ZERO_ERROR;
+	actual1 = ucol_getLocaleByType(collator1, ULOC_ACTUAL_LOCALE, &status);
+	if (U_FAILURE(status))
+		goto cleanup;
+
+	status = U_ZERO_ERROR;
+	actual2 = ucol_getLocaleByType(collator2, ULOC_ACTUAL_LOCALE, &status);
+	if (U_FAILURE(status))
+		goto cleanup;
+
+	if (strcmp(actual1, actual2) != 0)
+		goto cleanup;
+
+	/* version */
+	ucol_getVersion(collator1, versionInfo1);
+	u_versionToString(versionInfo1, version1);
+	ucol_getVersion(collator2, versionInfo2);
+	u_versionToString(versionInfo2, version2);
+	if (strcmp(version1, version2) != 0)
+		goto cleanup;
+
+	/* attributes */
+	for (int i = 0; i < n_collAtt; i++)
+	{
+		UColAttributeValue val1, val2;
+
+		status = U_ZERO_ERROR;
+		val1 = ucol_getAttribute(collator1, collAtt[i], &status);
+		if (U_FAILURE(status))
+			goto cleanup;
+
+		status = U_ZERO_ERROR;
+		val2 = ucol_getAttribute(collator2, collAtt[i], &status);
+		if (U_FAILURE(status))
+			goto cleanup;
+
+		if (val1 != val2)
+			goto cleanup;
+	}
+
+	/* passed all the best-effort checks for equivalence */
+	result = true;
+
+cleanup:
+	if (collator2)
+		ucol_close(collator2);
+	if (collator1)
+		ucol_close(collator1);
+
+	return result;
+}
+
+/*
+ * Return the BCP47 language tag representation of the requested locale; or
+ * NULL if a problem is encountered.
+ *
+ * This function should be called before passing the string to ucol_open(),
+ * because conversion to a language tag also performs "level 2
+ * canonicalization". In addition to producing a consistent result format,
+ * level 2 canonicalization is able to more accurately interpret different
+ * input locale string formats, such as POSIX and .NET IDs.
+ */
+char *
+icu_language_tag(const char *requested_locale)
+{
+	UErrorCode	 status;
+	char		*result;
+	int32_t		 len;
+	const bool	 strict = true;
+
+	status = U_ZERO_ERROR;
+	len = uloc_toLanguageTag(requested_locale, NULL, 0, strict, &status);
+
+	result = palloc(len + 1);
+
+	status = U_ZERO_ERROR;
+	uloc_toLanguageTag(requested_locale, result, len + 1, strict, &status);
+	if (U_FAILURE(status))
+	{
+		pfree(result);
+		return NULL;
+	}
+
+	return result;
+}
+
+#endif							/* USE_ICU */
+
 /*
  * These functions convert from/to libc's wchar_t, *not* pg_wchar_t.
  * Therefore we keep them here rather than with the mbutils code.
diff --git a/src/include/commands/dbcommands.h b/src/include/commands/dbcommands.h
index 5fbc3ca752..0f0e827ff2 100644
--- a/src/include/commands/dbcommands.h
+++ b/src/include/commands/dbcommands.h
@@ -33,5 +33,6 @@ extern char *get_database_name(Oid dbid);
 extern bool have_createdb_privilege(void);
 
 extern void check_encoding_locale_matches(int encoding, const char *collate, const char *ctype);
+extern char *get_icu_locale(const char *requested_locale);
 
 #endif							/* DBCOMMANDS_H */
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index cede43440b..5f9626985f 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -104,8 +104,11 @@ extern char *get_collation_actual_version(char collprovider, const char *collcol
 #ifdef USE_ICU
 extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes);
 extern int32_t icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar);
+extern bool check_equivalent_icu_locales(const char *locale1,
+										 const char *locale2);
 #endif
 extern void check_icu_locale(const char *icu_locale);
+extern char *icu_language_tag(const char *requested_locale);
 
 /* These functions convert from/to libc's wchar_t, *not* pg_wchar_t */
 extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen,
-- 
2.34.1

