From beba97b53fa9fc6d302fc84acaca2b267c991625 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Tue, 14 May 2024 16:46:27 +1200
Subject: [PATCH v1] Make win32setlocale.c's kludge table dynamic.

Historically, we transformed Windows' locale names on entry to and exit
from setlocale(), because they are unstable and we can't really handle
non-ASCII locale names.  We shouldn't be using these unstable display
names, but *setlocale() itself* gave them to us, and it can't even
understand everything it returns, leading to a series of historical
kludges.

Ideally we should soon switch to BCP47 codes for new clusters, but we'll
have to support the older names in ancient database clusters for now and
deal with the transition.

So, provide users with an emergency option for getting unstuck, since
they probably don't want to wait 3-6 months for a new binary with an
updated kludge table when their database fails to come up.  While there
are probably workarounds within Windows tools, it seems easy enough to
offer a text file of name mappings.  It might also have applications in
the transition to BCP47.

Discussion: https://postgr.es/m/PH8PR21MB3902F334A3174C54058F792CE5182%40PH8PR21MB3902.namprd21.prod.outlook.com
---
 src/port/win32setlocale.c        | 415 ++++++++++++++++++++++++-------
 src/tools/pgindent/typedefs.list |   1 +
 2 files changed, 332 insertions(+), 84 deletions(-)

diff --git a/src/port/win32setlocale.c b/src/port/win32setlocale.c
index 9e2ab8cc3ad..ee580311487 100644
--- a/src/port/win32setlocale.c
+++ b/src/port/win32setlocale.c
@@ -36,25 +36,36 @@
 
 #undef setlocale
 
-struct locale_map
-{
-	/*
-	 * String in locale name to replace. Can be a single string (end is NULL),
-	 * or separate start and end strings. If two strings are given, the locale
-	 * name must contain both of them, and everything between them is
-	 * replaced. This is used for a poor-man's regexp search, allowing
-	 * replacement of "start.*end".
-	 */
-	const char *locale_name_start;
-	const char *locale_name_end;
-
-	const char *replacement;	/* string to replace the match with */
-};
-
 /*
- * Mappings applied before calling setlocale(), to the argument.
+ * The path of a text file that can be created under PGDATA to override these
+ * rules.  It allows locale names to be overridden on input to setlocale, and
+ * on return from setlocale when querying the default.  This is intended as an
+ * option of last resort for users whose system becomes unstartable due to an
+ * operating system update that changes a locale name.
+ *
+ * # comments begin with a hash sign
+ * call pattern=replacement
+ * return pattern=replacement
+ *
+ * The pattern syntax supports ? for any character (really byte), and * for
+ * any sequence, though only one star can be used in the whole pattern.
+ *
+ * The encoding of the file is effectively undefined; setlocale() works with
+ * the current Windows ACP, and PostgreSQL thinks the strings should be ASCII
+ * or some undefined superset.  This could lead to some confusion if databases
+ * have different encodings, so it's likely that replacements should use BCP47
+ * tags if possible.
  */
-static const struct locale_map locale_map_argument[] = {
+#define LOCALE_MAP_PATH "win32setlocale.map"
+
+typedef struct Win32LocaleTableEntry
+{
+	char		direction;		/* 'c' = call argument, 'r' = return value */
+	const char *pattern;		/* name to match; may use '?' and one '*' */
+	const char *replacement;	/* name to use instead of a matching name */
+} Win32LocaleTableEntry;
+
+static const Win32LocaleTableEntry default_mapping_table[] = {
 	/*
 	 * "HKG" is listed here:
 	 * http://msdn.microsoft.com/en-us/library/cdax410z%28v=vs.71%29.aspx
@@ -63,8 +74,8 @@ static const struct locale_map locale_map_argument[] = {
 	 * "ARE" is the ISO-3166 three-letter code for U.A.E. It is not on the
 	 * above list, but seems to work anyway.
 	 */
-	{"Hong Kong S.A.R.", NULL, "HKG"},
-	{"U.A.E.", NULL, "ARE"},
+	{'c', "Hong Kong S.A.R.", "HKG"},
+	{'c', "U.A.E.", "ARE"},
 
 	/*
 	 * The ISO-3166 country code for Macau S.A.R. is MAC, but Windows doesn't
@@ -79,17 +90,9 @@ static const struct locale_map locale_map_argument[] = {
 	 *
 	 * Some versions of Windows spell it "Macau", others "Macao".
 	 */
-	{"Chinese (Traditional)_Macau S.A.R..950", NULL, "ZHM"},
-	{"Chinese_Macau S.A.R..950", NULL, "ZHM"},
-	{"Chinese (Traditional)_Macao S.A.R..950", NULL, "ZHM"},
-	{"Chinese_Macao S.A.R..950", NULL, "ZHM"},
-	{NULL, NULL, NULL}
-};
+	{'c', "Chinese (Traditional)_Maca? S.A.R..950", "ZHM"},
+	{'c', "Chinese_Maca? S.A.R..950", "ZHM"},
 
-/*
- * Mappings applied after calling setlocale(), to its return value.
- */
-static const struct locale_map locale_map_result[] = {
 	/*
 	 * "Norwegian (Bokm&aring;l)" locale name contains the a-ring character.
 	 * Map it to a pure-ASCII alias.
@@ -100,84 +103,324 @@ static const struct locale_map locale_map_result[] = {
 	 * Just to make life even more complicated, some versions of Windows spell
 	 * the locale name without parentheses.  Translate that too.
 	 */
-	{"Norwegian (Bokm", "l)_Norway", "Norwegian_Norway"},
-	{"Norwegian Bokm", "l_Norway", "Norwegian_Norway"},
-	{NULL, NULL, NULL}
+	{'r', "Norwegian (Bokm*l)_Norway", "Norwegian_Norway"},
+	{'r', "Norwegian Bokm*l_Norway", "Norwegian_Norway"},
 };
 
-#define MAX_LOCALE_NAME_LEN		100
+static bool mapping_table_initialized;
+static const Win32LocaleTableEntry *mapping_table;
+static size_t mapping_table_size;
 
-static const char *
-map_locale(const struct locale_map *map, const char *locale)
+/*
+ * Parse a line of the mapping file.  Returns 0 with *direction set to '\0'
+ * (blank/comment) or to 'c'/'r' plus malloc'd *pattern and *replacement.
+ * Returns -1 with errno set on failure; syntax errors also squawk to stderr.
+ */
+static int
+parse_line(char *line,
+		   int line_number,
+		   char *direction,
+		   char **pattern,
+		   char **replacement)
+{
+	const char *rest = NULL;
+	const char *delimiter;
+	size_t		len;
+
+	/* Strip line endings (this modifies the caller's buffer in place). */
+	while ((len = strlen(line)) > 0 &&
+		   (line[len - 1] == '\r' || line[len - 1] == '\n'))
+		line[len - 1] = '\0';
+
+	/* Skip empty lines and shell-style comments. */
+	if (line[0] == '\0' || line[0] == '#')
+	{
+		*direction = '\0';
+		return 0;
+	}
+
+	/* Is it "call ..." or "return ..."? */
+	if (strncmp(line, "call ", 5) == 0)
+		rest = line + 5;
+	else if (strncmp(line, "return ", 7) == 0)
+		rest = line + 7;
+	if (!rest)
+	{
+		fprintf(stderr,
+				"syntax error on line %d of " LOCALE_MAP_PATH "\n",
+				line_number);
+		errno = EINVAL;
+		return -1;
+	}
+
+	/* Grab 'c' or 'r', the first byte of the keyword matched above. */
+	*direction = line[0];
+
+	/* Need an equal sign followed by a non-empty replacement. */
+	delimiter = strchr(rest, '=');
+	if (!delimiter || delimiter[1] == '\0')
+	{
+		fprintf(stderr,
+				"syntax error on line %d of " LOCALE_MAP_PATH ", expected '='\n",
+				line_number);
+		errno = EINVAL;
+		return -1;
+	}
+
+	/* Copy the pattern: everything between the keyword and the '='. */
+	len = delimiter - rest;
+	*pattern = malloc(len + 1);
+	if (!*pattern)
+	{
+		errno = ENOMEM;
+		return -1;
+	}
+	memcpy(*pattern, rest, len);
+	(*pattern)[len] = '\0';
+
+	/* Copy the replacement: the rest of the line after the '='. */
+	*replacement = strdup(delimiter + 1);
+	if (!*replacement)
+	{
+		free(*pattern);
+		errno = ENOMEM;
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Free a malloc'd mapping table along with the strings of its 'size' entries.
+ * Only used for cleanup on failure; a finished table lives until process exit.
+ */
+static void
+free_mapping_table(Win32LocaleTableEntry *table, size_t size)
 {
-	static char aliasbuf[MAX_LOCALE_NAME_LEN];
-	int			i;
+	for (size_t i = 0; i < size; ++i)
+	{
+		free(unconstify(char *, table[i].pattern));
+		free(unconstify(char *, table[i].replacement));
+	}
+	free(table);
+}
+
+/* Install the built-in rules as this process's mapping table. */
+static void
+initialize_default_mapping_table(void)
+{
+	mapping_table = default_mapping_table;
+	mapping_table_size = lengthof(default_mapping_table);
+}
+
+/*
+ * Initialize mapping_table and mapping_table_size, from the mapping file if
+ * PGDATA is set and the file can be opened, else from the built-in defaults.
+ * Returns 0 on success, or -1 for failure (no table is installed then).
+ */
+static int
+initialize_mapping_table(void)
+{
+	FILE	   *file;
+	char		path[MAXPGPATH];
+	const char *pgdata;
+	char		line[128];
+	int			line_number = 1;	/* 1-based, for messages */
+	Win32LocaleTableEntry *table;
+	size_t		table_size;
+	size_t		table_capacity;
+
+	/* If there is no PGDATA, we're probably not in a backend so do nothing. */
+	pgdata = getenv("PGDATA");
+	if (!pgdata)
+	{
+		initialize_default_mapping_table();
+		return 0;
+	}
 
-	/* Check if the locale name matches any of the problematic ones. */
-	for (i = 0; map[i].locale_name_start != NULL; i++)
+	/* If there is no mapping file, do nothing. */
+	snprintf(path, sizeof(path), "%s/%s", pgdata, LOCALE_MAP_PATH);
+	file = fopen(path, "r");
+	if (!file)
 	{
-		const char *needle_start = map[i].locale_name_start;
-		const char *needle_end = map[i].locale_name_end;
-		const char *replacement = map[i].replacement;
-		char	   *match;
-		char	   *match_start = NULL;
-		char	   *match_end = NULL;
-
-		match = strstr(locale, needle_start);
-		if (match)
+		initialize_default_mapping_table();
+		return 0;
+	}
+
+	/* Initial guess at required space. */
+	table_size = 0;
+	table_capacity = 16;
+	table = malloc(sizeof(Win32LocaleTableEntry) * table_capacity);
+	if (table == NULL)
+	{
+		fclose(file);
+		errno = ENOMEM;
+		return -1;
+	}
+
+	/* Read the file line-by-line. */
+	while (fgets(line, sizeof(line), file))
+	{
+		char		direction;
+		char	   *pattern;
+		char	   *replacement;
+
+		if (parse_line(line, line_number++,
+					   &direction, &pattern, &replacement) != 0)
 		{
-			/*
-			 * Found a match for the first part. If this was a two-part
-			 * replacement, find the second part.
-			 */
-			match_start = match;
-			if (needle_end)
-			{
-				match = strstr(match_start + strlen(needle_start), needle_end);
-				if (match)
-					match_end = match + strlen(needle_end);
-				else
-					match_start = NULL;
-			}
-			else
-				match_end = match_start + strlen(needle_start);
+			/* errno already set */
+			free_mapping_table(table, table_size);
+			fclose(file);
+			return -1;
 		}
 
-		if (match_start)
+		/* Skip blank/comments. */
+		if (direction == '\0')
+			continue;
+
+		/* Grow by doubling on demand. */
+		if (table_size == table_capacity)
 		{
-			/* Found a match. Replace the matched string. */
-			int			matchpos = match_start - locale;
-			int			replacementlen = strlen(replacement);
-			char	   *rest = match_end;
-			int			restlen = strlen(rest);
-
-			/* check that the result fits in the static buffer */
-			if (matchpos + replacementlen + restlen + 1 > MAX_LOCALE_NAME_LEN)
-				return NULL;
-
-			memcpy(&aliasbuf[0], &locale[0], matchpos);
-			memcpy(&aliasbuf[matchpos], replacement, replacementlen);
-			/* includes null terminator */
-			memcpy(&aliasbuf[matchpos + replacementlen], rest, restlen + 1);
-
-			return aliasbuf;
+			Win32LocaleTableEntry *new_table;
+
+			new_table = realloc(table, sizeof(*table) * table_capacity * 2);
+			if (new_table == NULL)
+			{
+				free(pattern);	/* this line's strings aren't in the table yet */
+				free(replacement);
+				free_mapping_table(table, table_size);
+				fclose(file);
+				errno = ENOMEM;
+				return -1;
+			}
+			table = new_table;
+			table_capacity *= 2;
 		}
+
+		/* Fill in new entry. */
+		table[table_size].direction = direction;
+		table[table_size].pattern = pattern;
+		table[table_size].replacement = replacement;
+		table_size++;
+	}
+
+	fclose(file);
+
+	/* Mapping table established for this process. */
+	mapping_table = table;
+	mapping_table_size = table_size;
+
+	return 0;
+}
+
+/*
+ * Checks if the first n bytes of pattern and name match, or all of both if
+ * the pattern ends sooner.  '?' matches any byte; others must be identical.
+ */
+static bool
+subpattern_matches(const char *pattern, const char *name, size_t n)
+{
+	while (n > 0)
+	{
+		/* Pattern used up: name must end too.  Name used up first: no match. */
+		if (*pattern == '\0')
+			return *name == '\0';
+		else if (*name == '\0')
+			return false;
+
+		/* Otherwise matches wildcard or exact character. */
+		if (*pattern != '?' && *pattern != *name)
+			return false;
+
+		/* Next. */
+		n--;
+		pattern++;
+		name++;
+	}
+	return true;				/* all n bytes matched */
+}
+
+/*
+ * Checks if a whole name matches a pattern, with an extremely simple pattern
+ * logic: any number of '?' characters each match any one byte, and the first
+ * '*' (if any) matches any sequence of bytes; a later '*' is taken literally.
+ */
+static bool
+pattern_matches(const char *pattern, const char *name)
+{
+	const char *star;
+
+	if ((star = strchr(pattern, '*')))
+	{
+		size_t		len_pattern_before_star;
+		size_t		len_pattern_after_star;
+		size_t		len_name;
+
+		/* Does the name match the part before the star? */
+		len_pattern_before_star = star - pattern;
+		if (!subpattern_matches(pattern, name, len_pattern_before_star))
+			return false;
+
+		/* Step over the star in the pattern. */
+		pattern += len_pattern_before_star;
+		pattern++;
+		len_pattern_after_star = strlen(pattern);
+
+		/* Skip what the star absorbs, so name's tail is as long as pattern's. */
+		name += len_pattern_before_star;
+		len_name = strlen(name);
+		if (len_name < len_pattern_after_star)
+			return false;
+		name += len_name - len_pattern_after_star;
+	}
 
-	/* no match, just return the original string */
-	return locale;
+	return subpattern_matches(pattern, name, SIZE_MAX);
 }
 
+/*
+ * Convert a setlocale() locale name using the installed mapping table, where
+ * direction is 'c' for call arguments and 'r' for return values.  The first
+ * entry whose pattern matches the whole name supplies the result, else name.
+ */
+static const char *
+map_locale(char direction, const char *name)
+{
+	for (size_t i = 0; i < mapping_table_size; ++i)
+	{
+		if (mapping_table[i].direction == direction &&
+			pattern_matches(mapping_table[i].pattern, name))
+			return mapping_table[i].replacement;
+	}
+	return name;				/* no rule applies */
+}
+
+/*
+ * This implementation sets errno and writes messages to stderr for
+ * catastrophic internal failures, though the POSIX function defines no errors
+ * so callers shouldn't generally check errno.
+ */
 char *
 pgwin32_setlocale(int category, const char *locale)
 {
 	const char *argument;
 	char	   *result;
 
+	if (!mapping_table_initialized)
+	{
+		if (initialize_mapping_table() < 0)
+			return NULL;
+		mapping_table_initialized = true;
+	}
+
+	/*
+	 * XXX Call value transformation is relevant as long as we think there are
+	 * existing systems that were initdb'd with unstable and non-ASCII locale
+	 * names.
+	 */
 	if (locale == NULL)
 		argument = NULL;
 	else
-		argument = map_locale(locale_map_argument, locale);
+		argument = map_locale('c', locale);
 
 	/* Call the real setlocale() function */
 	result = setlocale(category, argument);
@@ -185,9 +428,13 @@ pgwin32_setlocale(int category, const char *locale)
 	/*
 	 * setlocale() is specified to return a "char *" that the caller is
 	 * forbidden to modify, so casting away the "const" is innocuous.
+	 *
+	 * XXX Return value transformation is only relevant as long as we continue
+	 * to use setlocale("") as a way to query the default locale names, which
+	 * is the source of the unstable and non-ASCII locale names.
 	 */
 	if (result)
-		result = unconstify(char *, map_locale(locale_map_result, result));
+		result = unconstify(char *, map_locale('r', result));
 
 	return result;
 }
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 34ec87a85eb..67451e57279 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -3109,6 +3109,7 @@ WalUsage
 WalWriteMethod
 WalWriteMethodOps
 Walfile
+Win32LocaleTableEntry
 WindowAgg
 WindowAggPath
 WindowAggState
-- 
2.44.0

