From 1e0b75b4c8958397a8e660fa0b8759f1da78a753 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 19 Jul 2022 08:53:08 +1200
Subject: [PATCH v2 2/2] Remove support for old Windows locale names.

We now use BCP 47 locale names by default and also advise those for
explicit use.  Remove support for munging the old unstable and
unsystematic English word-style locale names.

XXX When could we do this?
XXX How are you supposed to pg_upgrade a system with datcollate =
"English_Canada" to "en-CA"?
XXX There are more traces to remove in win32_langinfo()
---
 configure                         |   6 -
 configure.ac                      |   1 -
 doc/src/sgml/charset.sgml         |   2 +-
 src/backend/utils/adt/pg_locale.c | 235 ++----------------------------
 src/include/port/win32_port.h     |   9 --
 src/port/win32setlocale.c         | 193 ------------------------
 src/tools/msvc/Mkvcbuild.pm       |   2 +-
 7 files changed, 18 insertions(+), 430 deletions(-)
 delete mode 100644 src/port/win32setlocale.c

diff --git a/configure b/configure
index a4f4d321fb..5b7704352d 100755
--- a/configure
+++ b/configure
@@ -17174,12 +17174,6 @@ esac
  ;;
 esac
 
-  case " $LIBOBJS " in
-  *" win32setlocale.$ac_objext "* ) ;;
-  *) LIBOBJS="$LIBOBJS win32setlocale.$ac_objext"
- ;;
-esac
-
   case " $LIBOBJS " in
   *" win32stat.$ac_objext "* ) ;;
   *) LIBOBJS="$LIBOBJS win32stat.$ac_objext"
diff --git a/configure.ac b/configure.ac
index 5bd29a4d2f..358580e6d6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1995,7 +1995,6 @@ if test "$PORTNAME" = "win32"; then
   AC_LIBOBJ(win32error)
   AC_LIBOBJ(win32ntdll)
   AC_LIBOBJ(win32security)
-  AC_LIBOBJ(win32setlocale)
   AC_LIBOBJ(win32stat)
   AC_DEFINE([HAVE_SYMLINK], 1,
             [Define to 1 if you have the `symlink' function.])
diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml
index b656ca489f..3c12f3f344 100644
--- a/doc/src/sgml/charset.sgml
+++ b/doc/src/sgml/charset.sgml
@@ -90,7 +90,7 @@ initdb --locale=sv_SE
     For example, <literal>sv-SE</literal> represents Swedish as spoken in Sweden.
     Windows also supports more verbose locale names based on English words,
     such as <literal>German_Germany</literal> or <literal>Swedish_Sweden.1252</literal>,
-    but these are not recommended.
+    but these should not be used in PostgreSQL.
    </para>
 
    <para>
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index 607a4b7340..044f62de2b 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -118,7 +118,7 @@ static HTAB *collation_cache = NULL;
 
 
 #if defined(WIN32) && defined(LC_MESSAGES)
-static char *IsoLocaleName(const char *);	/* MSVC specific */
+static char *PosixLocaleName(const char *);
 #endif
 
 #ifdef USE_ICU
@@ -204,10 +204,7 @@ pg_perm_setlocale(int category, const char *locale)
 		case LC_MESSAGES:
 			envvar = "LC_MESSAGES";
 #ifdef WIN32
-			result = IsoLocaleName(locale);
-			if (result == NULL)
-				result = (char *) locale;
-			elog(DEBUG3, "IsoLocaleName() executed; locale: \"%s\"", result);
+			result = PosixLocaleName(locale);	/* converted name must land in result, as before */
 #endif							/* WIN32 */
 			break;
 #endif							/* LC_MESSAGES */
@@ -905,218 +902,35 @@ cache_locale_time(void)
 
 #if defined(WIN32) && defined(LC_MESSAGES)
 /*
- * Convert a Windows setlocale() argument to a Unix-style one.
+ * Convert a Windows BCP 47 locale name to a POSIX one.
  *
  * Regardless of platform, we install message catalogs under a Unix-style
  * LL[_CC][.ENCODING][@VARIANT] naming convention.  Only LC_MESSAGES settings
  * following that style will elicit localized interface strings.
  *
- * Before Visual Studio 2012 (msvcr110.dll), Windows setlocale() accepted "C"
- * (but not "c") and strings of the form <Language>[_<Country>][.<CodePage>],
- * case-insensitive.  setlocale() returns the fully-qualified form; for
- * example, setlocale("thaI") returns "Thai_Thailand.874".  Internally,
- * setlocale() and _create_locale() select a "locale identifier"[1] and store
- * it in an undocumented _locale_t field.  From that LCID, we can retrieve the
- * ISO 639 language and the ISO 3166 country.  Character encoding does not
- * matter, because the server and client encodings govern that.
- *
- * Windows Vista introduced the "locale name" concept[2], closely following
- * RFC 4646.  Locale identifiers are now deprecated.  Starting with Visual
- * Studio 2012, setlocale() accepts locale names in addition to the strings it
- * accepted historically.  It does not standardize them; setlocale("Th-tH")
- * returns "Th-tH".  setlocale(category, "") still returns a traditional
- * string.  Furthermore, msvcr110.dll changed the undocumented _locale_t
- * content to carry locale names instead of locale identifiers.
- *
- * Visual Studio 2015 should still be able to do the same as Visual Studio
- * 2012, but the declaration of locale_name is missing in _locale_t, causing
- * this code compilation to fail, hence this falls back instead on to
- * enumerating all system locales by using EnumSystemLocalesEx to find the
- * required locale name.  If the input argument is in Unix-style then we can
- * get ISO Locale name directly by using GetLocaleInfoEx() with LCType as
- * LOCALE_SNAME.
- *
- * MinGW headers declare _create_locale(), but msvcrt.dll lacks that symbol in
- * releases before Windows 8. IsoLocaleName() always fails in a MinGW-built
- * postgres.exe, so only Unix-style values of the lc_messages GUC can elicit
- * localized messages. In particular, every lc_messages setting that initdb
- * can select automatically will yield only C-locale messages. XXX This could
- * be fixed by running the fully-qualified locale name through a lookup table.
- *
- * This function returns a pointer to a static buffer bearing the converted
- * name or NULL if conversion fails.
- *
- * [1] https://docs.microsoft.com/en-us/windows/win32/intl/locale-identifiers
- * [2] https://docs.microsoft.com/en-us/windows/win32/intl/locale-names
- */
-
-/*
- * Callback function for EnumSystemLocalesEx() in get_iso_localename().
- *
- * This function enumerates all system locales, searching for one that matches
- * an input with the format: <Language>[_<Country>], e.g.
- * English[_United States]
- *
- * The input is a three wchar_t array as an LPARAM. The first element is the
- * locale_name we want to match, the second element is an allocated buffer
- * where the Unix-style locale is copied if a match is found, and the third
- * element is the search status, 1 if a match was found, 0 otherwise.
+ * Historically, verbose but unsystematic and unstable names such as
+ * "Thai_Thailand.874" were supported; now only BCP 47 input is expected, so
+ * we just map "C"/"POSIX" to "C" and convert names like "en-US" to "en_US".
  */
-static BOOL CALLBACK
-search_locale_enum(LPWSTR pStr, DWORD dwFlags, LPARAM lparam)
-{
-	wchar_t		test_locale[LOCALE_NAME_MAX_LENGTH];
-	wchar_t   **argv;
-
-	(void) (dwFlags);
-
-	argv = (wchar_t **) lparam;
-	*argv[2] = (wchar_t) 0;
-
-	memset(test_locale, 0, sizeof(test_locale));
-
-	/* Get the name of the <Language> in English */
-	if (GetLocaleInfoEx(pStr, LOCALE_SENGLISHLANGUAGENAME,
-						test_locale, LOCALE_NAME_MAX_LENGTH))
-	{
-		/*
-		 * If the enumerated locale does not have a hyphen ("en") OR the
-		 * lc_message input does not have an underscore ("English"), we only
-		 * need to compare the <Language> tags.
-		 */
-		if (wcsrchr(pStr, '-') == NULL || wcsrchr(argv[0], '_') == NULL)
-		{
-			if (_wcsicmp(argv[0], test_locale) == 0)
-			{
-				wcscpy(argv[1], pStr);
-				*argv[2] = (wchar_t) 1;
-				return FALSE;
-			}
-		}
-
-		/*
-		 * We have to compare a full <Language>_<Country> tag, so we append
-		 * the underscore and name of the country/region in English, e.g.
-		 * "English_United States".
-		 */
-		else
-		{
-			size_t		len;
-
-			wcscat(test_locale, L"_");
-			len = wcslen(test_locale);
-			if (GetLocaleInfoEx(pStr, LOCALE_SENGLISHCOUNTRYNAME,
-								test_locale + len,
-								LOCALE_NAME_MAX_LENGTH - len))
-			{
-				if (_wcsicmp(argv[0], test_locale) == 0)
-				{
-					wcscpy(argv[1], pStr);
-					*argv[2] = (wchar_t) 1;
-					return FALSE;
-				}
-			}
-		}
-	}
-
-	return TRUE;
-}
-
-/*
- * This function converts a Windows locale name to an ISO formatted version
- * for Visual Studio 2015 or greater.
- *
- * Returns NULL, if no valid conversion was found.
- */
-static char *
-get_iso_localename(const char *winlocname)
-{
-	wchar_t		wc_locale_name[LOCALE_NAME_MAX_LENGTH];
-	wchar_t		buffer[LOCALE_NAME_MAX_LENGTH];
-	static char iso_lc_messages[LOCALE_NAME_MAX_LENGTH];
-	char	   *period;
-	int			len;
-	int			ret_val;
-
-	/*
-	 * Valid locales have the following syntax:
-	 * <Language>[_<Country>[.<CodePage>]]
-	 *
-	 * GetLocaleInfoEx can only take locale name without code-page and for the
-	 * purpose of this API the code-page doesn't matter.
-	 */
-	period = strchr(winlocname, '.');
-	if (period != NULL)
-		len = period - winlocname;
-	else
-		len = pg_mbstrlen(winlocname);
-
-	memset(wc_locale_name, 0, sizeof(wc_locale_name));
-	memset(buffer, 0, sizeof(buffer));
-	MultiByteToWideChar(CP_ACP, 0, winlocname, len, wc_locale_name,
-						LOCALE_NAME_MAX_LENGTH);
-
-	/*
-	 * If the lc_messages is already a Unix-style string, we have a direct
-	 * match with LOCALE_SNAME, e.g. en-US, en_US.
-	 */
-	ret_val = GetLocaleInfoEx(wc_locale_name, LOCALE_SNAME, (LPWSTR) &buffer,
-							  LOCALE_NAME_MAX_LENGTH);
-	if (!ret_val)
-	{
-		/*
-		 * Search for a locale in the system that matches language and country
-		 * name.
-		 */
-		wchar_t    *argv[3];
-
-		argv[0] = wc_locale_name;
-		argv[1] = buffer;
-		argv[2] = (wchar_t *) &ret_val;
-		EnumSystemLocalesEx(search_locale_enum, LOCALE_WINDOWS, (LPARAM) argv,
-							NULL);
-	}
-
-	if (ret_val)
-	{
-		size_t		rc;
-		char	   *hyphen;
-
-		/* Locale names use only ASCII, any conversion locale suffices. */
-		rc = wchar2char(iso_lc_messages, buffer, sizeof(iso_lc_messages), NULL);
-		if (rc == -1 || rc == sizeof(iso_lc_messages))
-			return NULL;
-
-		/*
-		 * Simply replace the hyphen with an underscore.  See comments in
-		 * IsoLocaleName.
-		 */
-		hyphen = strchr(iso_lc_messages, '-');
-		if (hyphen)
-			*hyphen = '_';
-		return iso_lc_messages;
-	}
-
-	return NULL;
-}
-
 static char *
-IsoLocaleName(const char *winlocname)
+PosixLocaleName(const char *winlocname)
 {
-#if defined(_MSC_VER)
-	static char iso_lc_messages[LOCALE_NAME_MAX_LENGTH];
+	char		iso_lc_messages[LOCALE_NAME_MAX_LENGTH];	/* scratch copy we may edit */
+	char	   *hyphen;
 
 	if (pg_strcasecmp("c", winlocname) == 0 ||
 		pg_strcasecmp("posix", winlocname) == 0)
 	{
 		strcpy(iso_lc_messages, "C");
-		return iso_lc_messages;
 	}
 	else
-		return get_iso_localename(winlocname);
-
-#endif							/* defined(_MSC_VER) */
-	return NULL;				/* Not supported on this version of msvc/mingw */
+	{
+		strlcpy(iso_lc_messages, winlocname, sizeof(iso_lc_messages));
+		hyphen = strchr(iso_lc_messages, '-');	/* only the first '-' is rewritten */
+		if (hyphen)
+			*hyphen = '_';
+	}
+	return pstrdup(iso_lc_messages);	/* palloc'd copy; the local buffer dies on return */
 }
 #endif							/* WIN32 && LC_MESSAGES */
 
@@ -1680,33 +1494,16 @@ get_collation_actual_version(char collprovider, const char *collcollate)
 			ereport(ERROR,
 					(errmsg("could not load locale \"%s\"", collcollate)));
 #elif defined(WIN32)
-		/*
-		 * If we are targeting Windows Vista and above, we can ask for a name
-		 * given a collation name (earlier versions required a location code
-		 * that we don't have).
-		 */
 		NLSVERSIONINFOEX version = {sizeof(NLSVERSIONINFOEX)};
 		WCHAR		wide_collcollate[LOCALE_NAME_MAX_LENGTH];
 
 		MultiByteToWideChar(CP_ACP, 0, collcollate, -1, wide_collcollate,
 							LOCALE_NAME_MAX_LENGTH);
 		if (!GetNLSVersionEx(COMPARE_STRING, wide_collcollate, &version))
-		{
-			/*
-			 * GetNLSVersionEx() wants a language tag such as "en-US", not a
-			 * locale name like "English_United States.1252".  Until those
-			 * values can be prevented from entering the system, or 100%
-			 * reliably converted to the more useful tag format, tolerate the
-			 * resulting error and report that we have no version data.
-			 */
-			if (GetLastError() == ERROR_INVALID_PARAMETER)
-				return NULL;
-
 			ereport(ERROR,
 					(errmsg("could not get collation version for locale \"%s\": error code %lu",
 							collcollate,
 							GetLastError())));
-		}
 		collversion = psprintf("%ld.%ld,%ld.%ld",
 							   (version.dwNLSVersion >> 8) & 0xFFFF,
 							   version.dwNLSVersion & 0xFF,
diff --git a/src/include/port/win32_port.h b/src/include/port/win32_port.h
index 5121c0c626..13ed144a8f 100644
--- a/src/include/port/win32_port.h
+++ b/src/include/port/win32_port.h
@@ -437,15 +437,6 @@ extern int	_pgstat64(const char *name, struct stat *buf);
 #undef setlocale
 #endif
 
-/*
- * Define our own wrapper macro around setlocale() to work around bugs in
- * Windows' native setlocale() function.
- */
-extern char *pgwin32_setlocale(int category, const char *locale);
-
-#define setlocale(a,b) pgwin32_setlocale(a,b)
-
-
 /* In backend/port/win32/signal.c */
 extern PGDLLIMPORT volatile int pg_signal_queue;
 extern PGDLLIMPORT int pg_signal_mask;
diff --git a/src/port/win32setlocale.c b/src/port/win32setlocale.c
deleted file mode 100644
index aadd09a4e9..0000000000
--- a/src/port/win32setlocale.c
+++ /dev/null
@@ -1,193 +0,0 @@
-/*-------------------------------------------------------------------------
- *
- * win32setlocale.c
- *		Wrapper to work around bugs in Windows setlocale() implementation
- *
- * Copyright (c) 2011-2022, PostgreSQL Global Development Group
- *
- * IDENTIFICATION
- *	  src/port/win32setlocale.c
- *
- *
- * The setlocale() function in Windows is broken in two ways. First, it
- * has a problem with locale names that have a dot in the country name. For
- * example:
- *
- * "Chinese (Traditional)_Hong Kong S.A.R..950"
- *
- * For some reason, setlocale() doesn't accept that as argument, even though
- * setlocale(LC_ALL, NULL) returns exactly that. Fortunately, it accepts
- * various alternative names for such countries, so to work around the broken
- * setlocale() function, we map the troublemaking locale names to accepted
- * aliases, before calling setlocale().
- *
- * The second problem is that the locale name for "Norwegian (Bokm&aring;l)"
- * contains a non-ASCII character. That's problematic, because it's not clear
- * what encoding the locale name itself is supposed to be in, when you
- * haven't yet set a locale. Also, it causes problems when the cluster
- * contains databases with different encodings, as the locale name is stored
- * in the pg_database system catalog. To work around that, when setlocale()
- * returns that locale name, map it to a pure-ASCII alias for the same
- * locale.
- *-------------------------------------------------------------------------
- */
-
-#include "c.h"
-
-#undef setlocale
-
-struct locale_map
-{
-	/*
-	 * String in locale name to replace. Can be a single string (end is NULL),
-	 * or separate start and end strings. If two strings are given, the locale
-	 * name must contain both of them, and everything between them is
-	 * replaced. This is used for a poor-man's regexp search, allowing
-	 * replacement of "start.*end".
-	 */
-	const char *locale_name_start;
-	const char *locale_name_end;
-
-	const char *replacement;	/* string to replace the match with */
-};
-
-/*
- * Mappings applied before calling setlocale(), to the argument.
- */
-static const struct locale_map locale_map_argument[] = {
-	/*
-	 * "HKG" is listed here:
-	 * http://msdn.microsoft.com/en-us/library/cdax410z%28v=vs.71%29.aspx
-	 * (Country/Region Strings).
-	 *
-	 * "ARE" is the ISO-3166 three-letter code for U.A.E. It is not on the
-	 * above list, but seems to work anyway.
-	 */
-	{"Hong Kong S.A.R.", NULL, "HKG"},
-	{"U.A.E.", NULL, "ARE"},
-
-	/*
-	 * The ISO-3166 country code for Macau S.A.R. is MAC, but Windows doesn't
-	 * seem to recognize that. And Macau isn't listed in the table of accepted
-	 * abbreviations linked above. Fortunately, "ZHM" seems to be accepted as
-	 * an alias for "Chinese (Traditional)_Macau S.A.R..950". I'm not sure
-	 * where "ZHM" comes from, must be some legacy naming scheme. But hey, it
-	 * works.
-	 *
-	 * Note that unlike HKG and ARE, ZHM is an alias for the *whole* locale
-	 * name, not just the country part.
-	 *
-	 * Some versions of Windows spell it "Macau", others "Macao".
-	 */
-	{"Chinese (Traditional)_Macau S.A.R..950", NULL, "ZHM"},
-	{"Chinese_Macau S.A.R..950", NULL, "ZHM"},
-	{"Chinese (Traditional)_Macao S.A.R..950", NULL, "ZHM"},
-	{"Chinese_Macao S.A.R..950", NULL, "ZHM"},
-	{NULL, NULL, NULL}
-};
-
-/*
- * Mappings applied after calling setlocale(), to its return value.
- */
-static const struct locale_map locale_map_result[] = {
-	/*
-	 * "Norwegian (Bokm&aring;l)" locale name contains the a-ring character.
-	 * Map it to a pure-ASCII alias.
-	 *
-	 * It's not clear what encoding setlocale() uses when it returns the
-	 * locale name, so to play it safe, we search for "Norwegian (Bok*l)".
-	 *
-	 * Just to make life even more complicated, some versions of Windows spell
-	 * the locale name without parentheses.  Translate that too.
-	 */
-	{"Norwegian (Bokm", "l)_Norway", "Norwegian_Norway"},
-	{"Norwegian Bokm", "l_Norway", "Norwegian_Norway"},
-	{NULL, NULL, NULL}
-};
-
-#define MAX_LOCALE_NAME_LEN		100
-
-static const char *
-map_locale(const struct locale_map *map, const char *locale)
-{
-	static char aliasbuf[MAX_LOCALE_NAME_LEN];
-	int			i;
-
-	/* Check if the locale name matches any of the problematic ones. */
-	for (i = 0; map[i].locale_name_start != NULL; i++)
-	{
-		const char *needle_start = map[i].locale_name_start;
-		const char *needle_end = map[i].locale_name_end;
-		const char *replacement = map[i].replacement;
-		char	   *match;
-		char	   *match_start = NULL;
-		char	   *match_end = NULL;
-
-		match = strstr(locale, needle_start);
-		if (match)
-		{
-			/*
-			 * Found a match for the first part. If this was a two-part
-			 * replacement, find the second part.
-			 */
-			match_start = match;
-			if (needle_end)
-			{
-				match = strstr(match_start + strlen(needle_start), needle_end);
-				if (match)
-					match_end = match + strlen(needle_end);
-				else
-					match_start = NULL;
-			}
-			else
-				match_end = match_start + strlen(needle_start);
-		}
-
-		if (match_start)
-		{
-			/* Found a match. Replace the matched string. */
-			int			matchpos = match_start - locale;
-			int			replacementlen = strlen(replacement);
-			char	   *rest = match_end;
-			int			restlen = strlen(rest);
-
-			/* check that the result fits in the static buffer */
-			if (matchpos + replacementlen + restlen + 1 > MAX_LOCALE_NAME_LEN)
-				return NULL;
-
-			memcpy(&aliasbuf[0], &locale[0], matchpos);
-			memcpy(&aliasbuf[matchpos], replacement, replacementlen);
-			/* includes null terminator */
-			memcpy(&aliasbuf[matchpos + replacementlen], rest, restlen + 1);
-
-			return aliasbuf;
-		}
-	}
-
-	/* no match, just return the original string */
-	return locale;
-}
-
-char *
-pgwin32_setlocale(int category, const char *locale)
-{
-	const char *argument;
-	char	   *result;
-
-	if (locale == NULL)
-		argument = NULL;
-	else
-		argument = map_locale(locale_map_argument, locale);
-
-	/* Call the real setlocale() function */
-	result = setlocale(category, argument);
-
-	/*
-	 * setlocale() is specified to return a "char *" that the caller is
-	 * forbidden to modify, so casting away the "const" is innocuous.
-	 */
-	if (result)
-		result = unconstify(char *, map_locale(locale_map_result, result));
-
-	return result;
-}
diff --git a/src/tools/msvc/Mkvcbuild.pm b/src/tools/msvc/Mkvcbuild.pm
index e4feda10fd..1bc35ef926 100644
--- a/src/tools/msvc/Mkvcbuild.pm
+++ b/src/tools/msvc/Mkvcbuild.pm
@@ -108,7 +108,7 @@ sub mkvcbuild
 	  pqsignal.c mkdtemp.c qsort.c qsort_arg.c bsearch_arg.c quotes.c system.c
 	  strerror.c tar.c
 	  win32env.c win32error.c win32ntdll.c
-	  win32security.c win32setlocale.c win32stat.c);
+	  win32security.c win32stat.c);
 
 	push(@pgportfiles, 'strtof.c') if ($vsVersion < '14.00');
 
-- 
2.35.1

