From ad80a4097ba76acbe1434208a060cb271f7e6155 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Thu, 15 Aug 2024 16:45:27 +1200
Subject: [PATCH v3 3/3] Remove setlocale() calls from check_locale().

Validate locale names with newlocale() or _create_locale() instead, to
avoid clobbering global state.

This removes the previous assumption that it's useful to canonicalize
locale names with setlocale(), which wasn't really true, at least on any
known Unix (thanks to Tom Lane for this observation).  Two kinds of name
transformation are still useful:

1.  "" means use the contents of LC_xxx environment variables, which we
can easily look up ourselves instead of asking setlocale() to do it.
(We set them ourselves in pg_perm_setlocale() early in main().)

2.  Windows setlocale() apparently does some transformations (for
example the EDB installer passes it "Language,Country" and it returns
"Language_Country.CodePage").  While all such locale names are an
unstable mess and deprecated in favor of BCP 47 on that OS since ~2007,
we want to keep supporting that for a bit longer, so create a
canonicalization function in win32setlocale.c that is careful to be
thread-safe, so that it doesn't get in the way of our plan to make the
backend potentially thread-safe.

Reviewed-by:
Discussion: https://postgr.es/m/CA%2BhUKGJqVe0%2BPv9dvC9dSums_PXxGo9SWcxYAMBguWJUGbWz-A%40mail.gmail.com
Discussion: https://postgr.es/m/CA%2BhUKGK57sgUYKO03jB4VarTsswfMyScFAyJpVnYD8c%2Bg12_mg%40mail.gmail.com
---
 src/backend/utils/adt/pg_locale.c | 193 ++++++++++++++++++++++--------
 src/include/port/win32_port.h     |   2 +
 src/port/win32setlocale.c         |  48 ++++++++
 3 files changed, 190 insertions(+), 53 deletions(-)

diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 1cf8efcc7b7..b11d3ac81e7 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -190,6 +190,65 @@ wcstombs_l(char *dest, const wchar_t *src, size_t n, locale_t loc)
 }
 #endif
 
+/*
+ * Returns the category's name, which is also the name of the environment
+ * variable that defines it for the server, or NULL if unrecognized.  We
+ * always unset LC_ALL, so we only need the actual categories.
+ */
+static const char *
+get_lc_category_name(int category)
+{
+	switch (category)
+	{
+		case LC_COLLATE:
+			return "LC_COLLATE";
+		case LC_CTYPE:
+			return "LC_CTYPE";
+#ifdef LC_MESSAGES
+		case LC_MESSAGES:
+			return "LC_MESSAGES";
+#endif
+		case LC_MONETARY:
+			return "LC_MONETARY";
+		case LC_NUMERIC:
+			return "LC_NUMERIC";
+		case LC_TIME:
+			return "LC_TIME";
+		default:
+			return NULL;
+	}
+}
+
+#ifndef WIN32
+/*
+ * newlocale() needs LC_xxx_MASK but sometimes we have LC_xxx, and POSIX has
+ * no way to translate.  Returns 0 if the category is not recognized.
+ */
+static int
+get_lc_category_mask(int category)
+{
+	switch (category)
+	{
+		case LC_COLLATE:
+			return LC_COLLATE_MASK;
+		case LC_CTYPE:
+			return LC_CTYPE_MASK;
+#ifdef LC_MESSAGES
+		case LC_MESSAGES:
+			return LC_MESSAGES_MASK;
+#endif
+		case LC_MONETARY:
+			return LC_MONETARY_MASK;
+		case LC_NUMERIC:
+			return LC_NUMERIC_MASK;
+		case LC_TIME:
+			return LC_TIME_MASK;
+		default:
+			return 0;
+	}
+}
+#endif
+
 /*
  * pg_perm_setlocale
  *
@@ -257,38 +316,9 @@ pg_perm_setlocale(int category, const char *locale)
 #endif
 	}
 
-	switch (category)
-	{
-		case LC_COLLATE:
-			envvar = "LC_COLLATE";
-			break;
-		case LC_CTYPE:
-			envvar = "LC_CTYPE";
-			break;
-#ifdef LC_MESSAGES
-		case LC_MESSAGES:
-			envvar = "LC_MESSAGES";
-#ifdef WIN32
-			result = IsoLocaleName(locale);
-			if (result == NULL)
-				result = (char *) locale;
-			elog(DEBUG3, "IsoLocaleName() executed; locale: \"%s\"", result);
-#endif							/* WIN32 */
-			break;
-#endif							/* LC_MESSAGES */
-		case LC_MONETARY:
-			envvar = "LC_MONETARY";
-			break;
-		case LC_NUMERIC:
-			envvar = "LC_NUMERIC";
-			break;
-		case LC_TIME:
-			envvar = "LC_TIME";
-			break;
-		default:
-			elog(FATAL, "unrecognized LC category: %d", category);
-			return NULL;		/* keep compiler quiet */
-	}
+	envvar = get_lc_category_name(category);
+	if (!envvar)
+		elog(FATAL, "unrecognized LC category: %d", category);
 
 	if (setenv(envvar, result, 1) != 0)
 		return NULL;
@@ -302,40 +332,97 @@ pg_perm_setlocale(int category, const char *locale)
  *
  * If successful, and canonname isn't NULL, a palloc'd copy of the locale's
  * canonical name is stored there.  This is especially useful for figuring out
- * what locale name "" means (ie, the server environment value).  (Actually,
- * it seems that on most implementations that's the only thing it's good for;
- * we could wish that setlocale gave back a canonically spelled version of
- * the locale name, but typically it doesn't.)
+ * what locale name "" means (ie, the server environment value).  On Windows,
+ * it also gives a canonically spelled version of the locale name, when using
+ * traditional Windows (pre-BCP 47) locale names.
  */
 bool
 check_locale(int category, const char *locale, char **canonname)
 {
-	char	   *save;
-	char	   *res;
+	locale_t	loc;
 
 	if (canonname)
 		*canonname = NULL;		/* in case of failure */
 
-	save = setlocale(category, NULL);
-	if (!save)
-		return false;			/* won't happen, we hope */
+	if (locale[0] == 0)
+	{
+		/*
+		 * Caller asked for "", meaning the "native environment" in POSIX
+		 * terminology.  This means the LC_XXX environment variables, which
+		 * pg_perm_setlocale() sets, so we can get them directly from there.
+		 * That is exactly what POSIX setlocale() is required to do for "",
+		 * except that it would also check "LC_ALL" and "LANG", and we unset
+		 * the former and the latter has lower priority than the category
+		 * names.
+		 */
+		const char *envvar = get_lc_category_name(category);
 
-	/* save may be pointing at a modifiable scratch variable, see above. */
-	save = pstrdup(save);
+		if (!envvar)
+			return false;
+
+		locale = getenv(envvar);
+		if (!locale)
+			return false;
+	}
+
+	/*
+	 * See if we can open it.  Unfortunately we can't always distinguish
+	 * out-of-memory from invalid locale name.
+	 */
+	errno = ENOENT;
+#ifdef WIN32
+	loc = _create_locale(category, locale);
+	if (loc == (locale_t) 0)
+		_dosmaperr(GetLastError());
+#else
+	loc = newlocale(get_lc_category_mask(category), locale, (locale_t) 0);
+#endif
+	if (loc == (locale_t) 0)
+	{
+		if (errno == ENOMEM)
+			elog(ERROR, "out of memory");
 
-	/* set the locale with setlocale, to see if it accepts it. */
-	res = setlocale(category, locale);
+		/* Otherwise assume the locale doesn't exist. */
+		return false;
+	}
+#ifdef WIN32
+	_free_locale(loc);
+#else
+	freelocale(loc);
+#endif
 
-	/* save canonical name if requested. */
-	if (res && canonname)
-		*canonname = pstrdup(res);
+	if (canonname)
+	{
+#ifdef WIN32
+		char	   *canonicalized;
 
-	/* restore old value. */
-	if (!setlocale(category, save))
-		elog(WARNING, "failed to restore old locale \"%s\"", save);
-	pfree(save);
+		/*
+		 * On Windows, we pass it through setlocale() in thread-local mode,
+		 * which gives a canonical version of the name.
+		 */
+		canonicalized = pgwin32_canonicalize_locale_name(category, locale);
+		if (!canonicalized)
+			return false;
+		/* Copy from malloc'd memory to palloc'd memory. */
+		*canonname = palloc_extended(strlen(canonicalized) + 1,
+									 MCXT_ALLOC_NO_OOM);
+		if (!*canonname)
+		{
+			free(canonicalized);
+			return false;
+		}
+		strcpy(*canonname, canonicalized);
+		free(canonicalized);
+#else
+		/* On Unix there is no such concept, so just copy the name verbatim. */
+		*canonname = palloc_extended(strlen(locale) + 1, MCXT_ALLOC_NO_OOM);
+		if (!*canonname)
+			return false;
+		strcpy(*canonname, locale);
+#endif
+	}
 
-	return (res != NULL);
+	return true;
 }
 
 
diff --git a/src/include/port/win32_port.h b/src/include/port/win32_port.h
index 7ffe5891c69..30bcf14812f 100644
--- a/src/include/port/win32_port.h
+++ b/src/include/port/win32_port.h
@@ -471,6 +471,8 @@ extern char *pgwin32_setlocale(int category, const char *locale);
 
 #define setlocale(a,b) pgwin32_setlocale(a,b)
 
+extern char *pgwin32_canonicalize_locale_name(int category,
+											  const char *locale);
 
 /* In backend/port/win32/signal.c */
 extern PGDLLIMPORT volatile int pg_signal_queue;
diff --git a/src/port/win32setlocale.c b/src/port/win32setlocale.c
index 9e2ab8cc3ad..e45b76c9fca 100644
--- a/src/port/win32setlocale.c
+++ b/src/port/win32setlocale.c
@@ -191,3 +191,51 @@ pgwin32_setlocale(int category, const char *locale)
 
 	return result;
 }
+
+/*
+ * Returns a malloc'd copy of the name that Windows returns when you set a
+ * locale, or NULL on error; the caller must free() the result.  As this is
+ * usually called after validating the name, failure suggests out-of-memory.
+ *
+ * Modern BCP 47 locale names such as "en-US" are not expected to be changed by
+ * this function, but for the older deprecated "English_United States.1252"
+ * format, several variations seem to be accepted and converted to that form.
+ *
+ * This function is thread-safe, because it puts setlocale() into thread-local
+ * mode temporarily.  It uses wchar_t for save-and-restore, to avoid problems
+ * restoring the old locale if the code page changes.
+ */
+char *
+pgwin32_canonicalize_locale_name(int category, const char *locale)
+{
+	wchar_t    *save_locale = NULL;
+	int			save_config_thread_locale;
+	char	   *canonical;
+	char	   *result = NULL;
+
+	save_config_thread_locale = _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
+
+	save_locale = _wsetlocale(category, NULL);
+	if (!save_locale || !(save_locale = wcsdup(save_locale)))
+		goto exit;
+
+	canonical = setlocale(category, locale);
+	if (!canonical)
+		goto exit;
+
+	result = malloc(strlen(canonical) + 1);
+	if (!result)
+		goto exit;
+	strcpy(result, canonical);
+
+exit:
+	/* Restore everything we changed. */
+	if (save_locale)
+	{
+		_wsetlocale(category, save_locale);
+		free(save_locale);
+	}
+	_configthreadlocale(save_config_thread_locale);
+
+	return result;
+}
-- 
2.46.0

