From 7ce735b1e85b9f3f9ab6d48588de5824667323d2 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Fri, 10 Oct 2025 10:49:05 -0700
Subject: [PATCH v3 1/2] initdb: add default locales for builtin and ICU
 providers.

Allows initdb to succeed with any provider even if no other locale
options are specified. Will be useful if the provider comes from
another source, like an environment variable, or if we ever change the
initdb default provider.

Discussion: https://postgr.es/m/7d424dc0b032b30a22220634d12377bf59524bdb.camel@j-davis.com
---
 src/bin/initdb/initdb.c           | 84 +++++++++++++++++++++++++++----
 src/bin/initdb/t/001_initdb.pl    | 11 ++--
 src/bin/scripts/t/020_createdb.pl | 69 ++++++++++++++-----------
 3 files changed, 120 insertions(+), 44 deletions(-)

diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 92fe2f531f7..e2960e5f17c 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -82,6 +82,8 @@
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 
+#define DEFAULT_BUILTIN_LOCALE		"C.UTF-8"
+#define DEFAULT_ICU_LOCALE			"und"
 
 /* Ideally this would be in a .h file, but it hardly seems worth the trouble */
 extern const char *select_default_timezone(const char *share_path);
@@ -2412,6 +2414,25 @@ icu_validate_locale(const char *loc_str)
 #endif
 }
 
+/*
+ * Is the given locale name UTF-8 compatible?
+ */
+static bool
+utf8_compatible(const char *localename)
+{
+#ifndef WIN32
+	int			ctype_enc;
+
+	Assert(localename != NULL);
+	ctype_enc = pg_get_encoding_from_locale(localename, false);
+
+	return (ctype_enc == PG_UTF8 || ctype_enc == PG_SQL_ASCII);
+#else
+	/* on Windows, all locales are compatible with UTF-8 */
+	return true;
+#endif
+}
+
 /*
  * set up the locale variables
  *
@@ -2420,6 +2441,8 @@ icu_validate_locale(const char *loc_str)
 static void
 setlocales(void)
 {
+	bool		ctype_from_env;
+	bool		collate_from_env;
 	char	   *canonname;
 
 	/* set empty lc_* and datlocale values to locale config if set */
@@ -2442,6 +2465,9 @@ setlocales(void)
 			datlocale = locale;
 	}
 
+	ctype_from_env = (lc_ctype == NULL);
+	collate_from_env = (lc_collate == NULL);
+
 	/*
 	 * canonicalize locale names, and obtain any missing values from our
 	 * current environment
@@ -2465,12 +2491,11 @@ setlocales(void)
 	lc_messages = canonname;
 #endif
 
-	if (locale_provider != COLLPROVIDER_LIBC && datlocale == NULL)
-		pg_fatal("locale must be specified if provider is %s",
-				 collprovider_name(locale_provider));
-
 	if (locale_provider == COLLPROVIDER_BUILTIN)
 	{
+		if (!datlocale)
+			datlocale = DEFAULT_BUILTIN_LOCALE;
+
 		if (strcmp(datlocale, "C") == 0)
 			canonname = "C";
 		else if (strcmp(datlocale, "C.UTF-8") == 0 ||
@@ -2488,11 +2513,13 @@ setlocales(void)
 	{
 		char	   *langtag;
 
+		if (!datlocale)
+			datlocale = DEFAULT_ICU_LOCALE;
+
 		/* canonicalize to a language tag */
 		langtag = icu_language_tag(datlocale);
 		printf(_("Using language tag \"%s\" for ICU locale \"%s\".\n"),
 			   langtag, datlocale);
-		pg_free(datlocale);
 		datlocale = langtag;
 
 		icu_validate_locale(datlocale);
@@ -2505,6 +2532,46 @@ setlocales(void)
 		pg_fatal("ICU is not supported in this build");
 #endif
 	}
+
+	/*
+	 * If using the builtin provider with a locale requiring UTF-8, avoid
+	 * taking incompatible settings from the environment.
+	 */
+	if (locale_provider == COLLPROVIDER_BUILTIN &&
+		strcmp(datlocale, "C") != 0)
+	{
+		if (!encoding)
+			encoding = "UTF-8";
+
+		/*
+		 * LC_CTYPE has little effect unless using the libc provider, but does
+		 * still affect some places, such as translation of error messages from
+		 * the OS. Overriding it here may be an inconvenience, but in the
+		 * absence of specified locale options, it's the best choice.
+		 *
+		 * XXX: minimize the effects of LC_CTYPE when not using libc.
+		 */
+		if (ctype_from_env && !utf8_compatible(lc_ctype))
+		{
+			pg_log_warning("setting LC_CTYPE to \"C\"");
+			pg_log_warning_detail("Encoding of LC_CTYPE locale \"%s\" does not match encoding required by builtin locale \"%s\".",
+								  lc_ctype, datlocale);
+			pg_log_warning_hint("Specify a UTF-8 compatible locale with --lc-ctype, or choose a different locale provider.");
+			lc_ctype = "C";
+		}
+
+		/*
+		 * LC_COLLATE has no effect unless using the libc provider.
+		 */
+		if (collate_from_env && !utf8_compatible(lc_collate))
+		{
+			pg_log_warning("setting LC_COLLATE to \"C\"");
+			pg_log_warning_detail("Encoding of LC_COLLATE locale \"%s\" does not match encoding required by builtin locale \"%s\".",
+								  lc_collate, datlocale);
+			pg_log_warning_hint("Specify a UTF-8 compatible locale with --lc-collate, or choose a different locale provider.");
+			lc_collate = "C";
+		}
+	}
 }
 
 /*
@@ -2770,11 +2837,10 @@ setup_locale_encoding(void)
 		!check_locale_encoding(lc_collate, encodingid))
 		exit(1);				/* check_locale_encoding printed the error */
 
-	if (locale_provider == COLLPROVIDER_BUILTIN)
+	if (locale_provider == COLLPROVIDER_BUILTIN &&
+		strcmp(datlocale, "C") != 0)
 	{
-		if ((strcmp(datlocale, "C.UTF-8") == 0 ||
-			 strcmp(datlocale, "PG_UNICODE_FAST") == 0) &&
-			encodingid != PG_UTF8)
+		if (encodingid != PG_UTF8)
 			pg_fatal("builtin provider locale \"%s\" requires encoding \"%s\"",
 					 datlocale, "UTF-8");
 	}
diff --git a/src/bin/initdb/t/001_initdb.pl b/src/bin/initdb/t/001_initdb.pl
index b7ef7ed8d06..ba3211a4aa6 100644
--- a/src/bin/initdb/t/001_initdb.pl
+++ b/src/bin/initdb/t/001_initdb.pl
@@ -113,14 +113,13 @@ SKIP:
 
 if ($ENV{with_icu} eq 'yes')
 {
-	command_fails_like(
+	command_ok(
 		[
 			'initdb', '--no-sync',
 			'--locale-provider' => 'icu',
 			"$tempdir/data2"
 		],
-		qr/initdb: error: locale must be specified if provider is icu/,
-		'locale provider ICU requires --icu-locale');
+		'locale provider ICU default locale');
 
 	command_ok(
 		[
@@ -200,13 +199,15 @@ else
 		'locale provider ICU fails since no ICU support');
 }
 
-command_fails(
+command_like(
 	[
 		'initdb', '--no-sync',
+		'--auth' => 'trust',
 		'--locale-provider' => 'builtin',
 		"$tempdir/data6"
 	],
-	'locale provider builtin fails without --locale');
+	qr/^\s+default collation:\s+C.UTF-8\n/ms,
+	'locale provider builtin defaults to C.UTF-8');
 
 command_ok(
 	[
diff --git a/src/bin/scripts/t/020_createdb.pl b/src/bin/scripts/t/020_createdb.pl
index a8293390ede..6003d213e89 100644
--- a/src/bin/scripts/t/020_createdb.pl
+++ b/src/bin/scripts/t/020_createdb.pl
@@ -16,6 +16,9 @@ my $node = PostgreSQL::Test::Cluster->new('main');
 $node->init;
 $node->start;
 
+my $datlocprovider = $node->safe_psql('postgres',
+	"SELECT datlocprovider FROM pg_database WHERE datname='template1'");
+
 $node->issues_sql_like(
 	[ 'createdb', 'foobar1' ],
 	qr/statement: CREATE DATABASE foobar1/,
@@ -33,19 +36,6 @@ $node->issues_sql_like(
 
 if ($ENV{with_icu} eq 'yes')
 {
-	# This fails because template0 uses libc provider and has no ICU
-	# locale set.  It would succeed if template0 used the icu
-	# provider.  XXX Maybe split into multiple tests?
-	$node->command_fails(
-		[
-			'createdb',
-			'--template' => 'template0',
-			'--encoding' => 'UTF8',
-			'--locale-provider' => 'icu',
-			'foobar4',
-		],
-		'create database with ICU fails without ICU locale specified');
-
 	$node->issues_sql_like(
 		[
 			'createdb',
@@ -130,14 +120,18 @@ else
 		'create database with ICU fails since no ICU support');
 }
 
-$node->command_fails(
-	[
-		'createdb',
-		'--template' => 'template0',
-		'--locale-provider' => 'builtin',
-		'tbuiltin1',
-	],
-	'create database with provider "builtin" fails without --locale');
+if ($datlocprovider eq 'c')
+{
+	$node->command_fails(
+		[
+			'createdb',
+			'--template' => 'template0',
+			'--encoding' => 'UTF8',
+			'--locale-provider' => 'builtin',
+			'foobar4',
+		],
+		'create database with builtin provider fails without locale specified');
+}
 
 $node->command_ok(
 	[
@@ -219,15 +213,30 @@ $node->command_fails(
 	],
 	'create database with provider "builtin" and ICU_RULES=""');
 
-$node->command_fails(
-	[
-		'createdb',
-		'--template' => 'template1',
-		'--locale-provider' => 'builtin',
-		'--locale' => 'C',
-		'tbuiltin9',
-	],
-	'create database with provider "builtin" not matching template');
+if ($datlocprovider eq 'b')
+{
+	$node->command_fails(
+		[
+			'createdb',
+			'--template' => 'template1',
+			'--locale-provider' => 'libc',
+			'--locale' => 'C',
+			'tbuiltin9',
+		],
+		'create database with provider "libc" not matching template');
+}
+else
+{
+	$node->command_fails(
+		[
+			'createdb',
+			'--template' => 'template1',
+			'--locale-provider' => 'builtin',
+			'--locale' => 'C',
+			'tbuiltin9',
+		],
+		'create database with provider "builtin" not matching template');
+}
 
 $node->command_fails([ 'createdb', 'foobar1' ],
 	'fails if database already exists');
-- 
2.43.0

