simplify regular expression locale global variables

Started by Peter Eisentraut, about 1 year ago, 10 messages

peter@eisentraut.org

about 1 year ago

3 attachment(s)

We currently have

static PG_Locale_Strategy pg_regex_strategy;
static pg_locale_t pg_regex_locale;
static Oid pg_regex_collation;

but after the recent improvements to pg_locale_t handling, we don't need
all three anymore. All the information we have is contained in
pg_locale_t, so we just need to keep that one. This allows us to
structure the locale-using regular expression code more similar to other
locale-using code, mainly by provider, avoiding another layer that is
specific only to the regular expression code. The first patch
implements that.

The second patch removes a call to pg_set_regex_collation() that I think
is unnecessary.

The third patch adds a pg_unset_regex_collation() call that undoes what
pg_set_regex_collation() does. I mainly used this to verify the second
patch, but maybe it's also useful on its own, not sure.

(I don't have any plans to get rid of the remaining global variable.
That would certainly be nice from an intellectual point of view, but
fiddling this into the regular expression code looks quite messy. In
any case, it's probably easier with one variable instead of three, if
someone wants to try.)

Attachments:

0001-Remove-pg_regex_collation-and-pg_regex_strategy.patch (text/plain; charset=UTF-8; name=0001-Remove-pg_regex_collation-and-pg_regex_strategy.patch) [Download]

From 1799abec05ae3d49a7a57333acd1d377e26d0fe9 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <peter@eisentraut.org>
Date: Tue, 15 Oct 2024 08:01:41 +0200
Subject: [PATCH 1/3] Remove pg_regex_collation and pg_regex_strategy

We don't need three global variables to describe the locale strategy
for regular expressions.  We only need to keep pg_regex_locale.  This
works now because pg_locale_t now contains all the required
information (such as a ctype_is_c field).  This allows us to structure
the locale-using regular expression code more similar to other
locale-using code, mainly by provider, avoiding another layer that is
specific only to the regular expression code.
---
 src/backend/regex/regc_pg_locale.c | 430 +++++++++++++----------------
 1 file changed, 185 insertions(+), 245 deletions(-)

diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c
index b75784b6ce5..4691e796385 100644
--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
@@ -63,18 +63,7 @@
  * NB: the coding here assumes pg_wchar is an unsigned type.
  */
 
-typedef enum
-{
-	PG_REGEX_STRATEGY_C,		/* C locale (encoding independent) */
-	PG_REGEX_STRATEGY_BUILTIN,	/* built-in Unicode semantics */
-	PG_REGEX_STRATEGY_LIBC_WIDE,	/* Use locale_t <wctype.h> functions */
-	PG_REGEX_STRATEGY_LIBC_1BYTE,	/* Use locale_t <ctype.h> functions */
-	PG_REGEX_STRATEGY_ICU,		/* Use ICU uchar.h functions */
-} PG_Locale_Strategy;
-
-static PG_Locale_Strategy pg_regex_strategy;
 static pg_locale_t pg_regex_locale;
-static Oid	pg_regex_collation;
 
 /*
  * Hard-wired character properties for C locale
@@ -232,7 +221,6 @@ void
 pg_set_regex_collation(Oid collation)
 {
 	pg_locale_t locale = 0;
-	PG_Locale_Strategy strategy;
 
 	if (!OidIsValid(collation))
 	{
@@ -253,8 +241,9 @@ pg_set_regex_collation(Oid collation)
 		 * catalog access is available, so we can't call
 		 * pg_newlocale_from_collation().
 		 */
-		strategy = PG_REGEX_STRATEGY_C;
-		collation = C_COLLATION_OID;
+		static struct pg_locale_struct dummy_locale = {.ctype_is_c = true};
+
+		locale = &dummy_locale;
 	}
 	else
 	{
@@ -264,121 +253,80 @@ pg_set_regex_collation(Oid collation)
 			ereport(ERROR,
 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 					 errmsg("nondeterministic collations are not supported for regular expressions")));
-
-		if (locale->ctype_is_c)
-		{
-			/*
-			 * C/POSIX collations use this path regardless of database
-			 * encoding
-			 */
-			strategy = PG_REGEX_STRATEGY_C;
-			locale = 0;
-			collation = C_COLLATION_OID;
-		}
-		else if (locale->provider == COLLPROVIDER_BUILTIN)
-		{
-			Assert(GetDatabaseEncoding() == PG_UTF8);
-			strategy = PG_REGEX_STRATEGY_BUILTIN;
-		}
-#ifdef USE_ICU
-		else if (locale->provider == COLLPROVIDER_ICU)
-		{
-			strategy = PG_REGEX_STRATEGY_ICU;
-		}
-#endif
-		else
-		{
-			Assert(locale->provider == COLLPROVIDER_LIBC);
-			if (GetDatabaseEncoding() == PG_UTF8)
-				strategy = PG_REGEX_STRATEGY_LIBC_WIDE;
-			else
-				strategy = PG_REGEX_STRATEGY_LIBC_1BYTE;
-		}
 	}
 
-	pg_regex_strategy = strategy;
 	pg_regex_locale = locale;
-	pg_regex_collation = collation;
 }
 
 static int
 pg_wc_isdigit(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISDIGIT));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_isdigit(c, true);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISDIGIT));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isdigit(c, true);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					isdigit_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_isdigit(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_isdigit(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static int
 pg_wc_isalpha(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALPHA));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_isalpha(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISALPHA));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isalpha(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					isalpha_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_isalpha(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_isalpha(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static int
 pg_wc_isalnum(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALNUM));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_isalnum(c, true);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISALNUM));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isalnum(c, true);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					isalnum_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_isalnum(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_isalnum(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
@@ -394,218 +342,206 @@ pg_wc_isword(pg_wchar c)
 static int
 pg_wc_isupper(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISUPPER));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_isupper(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISUPPER));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isupper(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					isupper_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_isupper(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_isupper(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static int
 pg_wc_islower(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISLOWER));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_islower(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISLOWER));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_islower(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					islower_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_islower(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_islower(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static int
 pg_wc_isgraph(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISGRAPH));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_isgraph(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISGRAPH));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isgraph(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					isgraph_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_isgraph(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_isgraph(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static int
 pg_wc_isprint(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPRINT));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_isprint(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISPRINT));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isprint(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					isprint_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_isprint(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_isprint(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static int
 pg_wc_ispunct(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPUNCT));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_ispunct(c, true);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISPUNCT));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_ispunct(c, true);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					ispunct_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_ispunct(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_ispunct(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static int
 pg_wc_isspace(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISSPACE));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_isspace(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISSPACE));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isspace(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					isspace_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_isspace(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_isspace(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static pg_wchar
 pg_wc_toupper(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
 	{
-		case PG_REGEX_STRATEGY_C:
-			if (c <= (pg_wchar) 127)
-				return pg_ascii_toupper((unsigned char) c);
-			return c;
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return unicode_uppercase_simple(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return towupper_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (c <= (pg_wchar) 127)
+			return pg_ascii_toupper((unsigned char) c);
+		return c;
+	}
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return unicode_uppercase_simple(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return towupper_l((wint_t) c, pg_regex_locale->info.lt);
+		else
+		{
 			if (c <= (pg_wchar) UCHAR_MAX)
 				return toupper_l((unsigned char) c, pg_regex_locale->info.lt);
 			return c;
-		case PG_REGEX_STRATEGY_ICU:
+		}
+	}
 #ifdef USE_ICU
-			return u_toupper(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_toupper(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static pg_wchar
 pg_wc_tolower(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
 	{
-		case PG_REGEX_STRATEGY_C:
-			if (c <= (pg_wchar) 127)
-				return pg_ascii_tolower((unsigned char) c);
-			return c;
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return unicode_lowercase_simple(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return towlower_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (c <= (pg_wchar) 127)
+			return pg_ascii_tolower((unsigned char) c);
+		return c;
+	}
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return unicode_lowercase_simple(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return towlower_l((wint_t) c, pg_regex_locale->info.lt);
+		else
+		{
 			if (c <= (pg_wchar) UCHAR_MAX)
 				return tolower_l((unsigned char) c, pg_regex_locale->info.lt);
 			return c;
-		case PG_REGEX_STRATEGY_ICU:
+		}
+	}
 #ifdef USE_ICU
-			return u_tolower(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_tolower(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
@@ -628,7 +564,7 @@ typedef int (*pg_wc_probefunc) (pg_wchar c);
 typedef struct pg_ctype_cache
 {
 	pg_wc_probefunc probefunc;	/* pg_wc_isalpha or a sibling */
-	Oid			collation;		/* collation this entry is for */
+	pg_locale_t locale;			/* locale this entry is for */
 	struct cvec cv;				/* cache entry contents */
 	struct pg_ctype_cache *next;	/* chain link */
 } pg_ctype_cache;
@@ -697,7 +633,7 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
 	for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
 	{
 		if (pcc->probefunc == probefunc &&
-			pcc->collation == pg_regex_collation)
+			pcc->locale == pg_regex_locale)
 			return &pcc->cv;
 	}
 
@@ -708,7 +644,7 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
 	if (pcc == NULL)
 		return NULL;
 	pcc->probefunc = probefunc;
-	pcc->collation = pg_regex_collation;
+	pcc->locale = pg_regex_locale;
 	pcc->cv.nchrs = 0;
 	pcc->cv.chrspace = 128;
 	pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
@@ -732,37 +668,41 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
 	 * would always be true for production values of MAX_SIMPLE_CHR, but it's
 	 * useful to allow it to be small for testing purposes.)
 	 */
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
 	{
-		case PG_REGEX_STRATEGY_C:
 #if MAX_SIMPLE_CHR >= 127
-			max_chr = (pg_wchar) 127;
-			pcc->cv.cclasscode = -1;
+		max_chr = (pg_wchar) 127;
+		pcc->cv.cclasscode = -1;
 #else
-			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+		max_chr = (pg_wchar) MAX_SIMPLE_CHR;
 #endif
-			break;
-		case PG_REGEX_STRATEGY_BUILTIN:
-			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
-			break;
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
+	}
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+	{
+		max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+	}
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8)
 			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
-			break;
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		else
+		{
 #if MAX_SIMPLE_CHR >= UCHAR_MAX
 			max_chr = (pg_wchar) UCHAR_MAX;
 			pcc->cv.cclasscode = -1;
 #else
 			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
 #endif
-			break;
-		case PG_REGEX_STRATEGY_ICU:
-			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
-			break;
-		default:
-			Assert(false);
-			max_chr = 0;		/* can't get here, but keep compiler quiet */
-			break;
+		}
+	}
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+	{
+		max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+	}
+	else
+	{
+		Assert(false);
+		max_chr = 0;			/* can't get here, but keep compiler quiet */
 	}
 
 	/*

base-commit: 7cdfeee320e72162b62dddddee638e713c2b8680
-- 
2.47.0

0002-Remove-unneeded-pg_set_regex_collation-call.patch (text/plain; charset=UTF-8; name=0002-Remove-unneeded-pg_set_regex_collation-call.patch) [Download]

From dc4b2e9b8b89feb3a687e7a8906c5e496f53706f Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <peter@eisentraut.org>
Date: Tue, 15 Oct 2024 08:01:41 +0200
Subject: [PATCH 2/3] Remove unneeded pg_set_regex_collation() call

The call in pg_regprefix() was apparently never necessary, because
this code doesn't actually execute a regular expression, but it just
looks at it, which doesn't invoke any locale-using functionality.
---
 src/backend/regex/regprefix.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/backend/regex/regprefix.c b/src/backend/regex/regprefix.c
index 47a8cebe075..6ba32ac1de0 100644
--- a/src/backend/regex/regprefix.c
+++ b/src/backend/regex/regprefix.c
@@ -61,9 +61,6 @@ pg_regprefix(regex_t *re,
 	if (re->re_csize != sizeof(chr))
 		return REG_MIXED;
 
-	/* Initialize locale-dependent support */
-	pg_set_regex_collation(re->re_collation);
-
 	/* setup */
 	g = (struct guts *) re->re_guts;
 	if (g->info & REG_UIMPOSSIBLE)
-- 
2.47.0

0003-WIP-Add-pg_unset_regex_collation.patch (text/plain; charset=UTF-8; name=0003-WIP-Add-pg_unset_regex_collation.patch) [Download]

From d1ce2c40a15443891ddf9c2340cca66d5e141c92 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <peter@eisentraut.org>
Date: Tue, 15 Oct 2024 08:01:41 +0200
Subject: [PATCH 3/3] WIP: Add pg_unset_regex_collation()

Add a function pg_unset_regex_collation() that complements
pg_set_regex_collation().  This unsets the global locale variable for
regular expression use.  This also adds assertions into both functions
to make sure they are used correctly in pairs.

This allows us to detect if pg_set_regex_collation() is not called
when it should have been.  Before, this would not be detected but the
locale settings lingering from a previous use would be used.
---
 src/backend/regex/regc_pg_locale.c | 10 ++++++++++
 src/backend/regex/regcomp.c        |  9 ++++++++-
 src/backend/regex/regexec.c        |  8 +++++---
 src/include/regex/regguts.h        |  1 +
 4 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c
index 4691e796385..8f38507b4bf 100644
--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
@@ -222,6 +222,8 @@ pg_set_regex_collation(Oid collation)
 {
 	pg_locale_t locale = 0;
 
+	Assert(!pg_regex_locale);
+
 	if (!OidIsValid(collation))
 	{
 		/*
@@ -258,6 +260,14 @@ pg_set_regex_collation(Oid collation)
 	pg_regex_locale = locale;
 }
 
+void
+pg_unset_regex_collation(void)
+{
+	Assert(pg_regex_locale);
+	pg_regex_locale = 0;
+}
+
+
 static int
 pg_wc_isdigit(pg_wchar c)
 {
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index 8a6cfb2973d..f1442272065 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -385,7 +385,7 @@ pg_regcomp(regex_t *re,
 	FILE	   *debug = (FILE *) NULL;
 #endif
 
-#define  CNOERR()	 { if (ISERR()) return freev(v, v->err); }
+#define  CNOERR()	 { if (ISERR()) { pg_unset_regex_collation(); return freev(v, v->err); } }
 
 	/* sanity checks */
 
@@ -433,7 +433,10 @@ pg_regcomp(regex_t *re,
 	/* more complex setup, malloced things */
 	re->re_guts = VS(MALLOC(sizeof(struct guts)));
 	if (re->re_guts == NULL)
+	{
+		pg_unset_regex_collation();
 		return freev(v, REG_ESPACE);
+	}
 	g = (struct guts *) re->re_guts;
 	g->tree = NULL;
 	initcm(v, &g->cmap);
@@ -446,7 +449,10 @@ pg_regcomp(regex_t *re,
 	/* set up a reasonably-sized transient cvec for getcvec usage */
 	v->cv = newcvec(100, 20);
 	if (v->cv == NULL)
+	{
+		pg_unset_regex_collation();
 		return freev(v, REG_ESPACE);
+	}
 
 	/* parsing */
 	lexstart(v);				/* also handles prefixes */
@@ -542,6 +548,7 @@ pg_regcomp(regex_t *re,
 	}
 #endif
 
+	pg_unset_regex_collation();
 	assert(v->err == 0);
 	return freev(v, 0);
 }
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c
index 2a1d5bebda3..406fd2f7a1b 100644
--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
@@ -212,9 +212,6 @@ pg_regexec(regex_t *re,
 	if (search_start > len)
 		return REG_NOMATCH;
 
-	/* Initialize locale-dependent support */
-	pg_set_regex_collation(re->re_collation);
-
 	/* setup */
 	v->re = re;
 	v->g = (struct guts *) re->re_guts;
@@ -257,6 +254,10 @@ pg_regexec(regex_t *re,
 	v->ladfas = NULL;
 	v->lblastcss = NULL;
 	v->lblastcp = NULL;
+
+	/* Initialize locale-dependent support */
+	pg_set_regex_collation(re->re_collation);
+
 	/* below this point, "goto cleanup" will behave sanely */
 
 	assert(v->g->ntree >= 0);
@@ -326,6 +327,7 @@ pg_regexec(regex_t *re,
 
 	/* clean up */
 cleanup:
+	pg_unset_regex_collation();
 	if (v->pmatch != pmatch && v->pmatch != mat)
 		FREE(v->pmatch);
 	if (v->subdfas != NULL)
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h
index 3ca3647e118..a35b85b463c 100644
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -545,4 +545,5 @@ struct guts
 
 /* prototypes for functions that are exported from regcomp.c to regexec.c */
 extern void pg_set_regex_collation(Oid collation);
+extern void pg_unset_regex_collation(void);
 extern color pg_reg_getcolor(struct colormap *cm, chr c);
-- 
2.47.0

Alvaro Herrera

alvherre@alvh.no-ip.org

about 1 year ago

In reply to: Peter Eisentraut (#1)

Re: simplify regular expression locale global variables

On 2024-Oct-15, Peter Eisentraut wrote:

@@ -253,8 +241,9 @@ pg_set_regex_collation(Oid collation)
* catalog access is available, so we can't call
* pg_newlocale_from_collation().
*/
+		static struct pg_locale_struct dummy_locale = {.ctype_is_c = true};
+
+		locale = &dummy_locale;
}
else
{
@@ -264,121 +253,80 @@ pg_set_regex_collation(Oid collation)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("nondeterministic collations are not supported for regular expressions")));
[...]
}

pg_regex_locale = locale;
}

Hmm, is it valid to make pg_regex_locale point to a function-local
static here? The lifetime of this static is not clear to me, and I
think this pattern works with at least some compilers, but I remember
comments on previous patch review threads that this pattern isn't
kosher.

--
Álvaro Herrera 48°01'N 7°57'E — https://www.EnterpriseDB.com/

Peter Eisentraut

peter@eisentraut.org

about 1 year ago

In reply to: Alvaro Herrera (#2)

Re: simplify regular expression locale global variables

On 15.10.24 12:08, Alvaro Herrera wrote:

On 2024-Oct-15, Peter Eisentraut wrote:
@@ -253,8 +241,9 @@ pg_set_regex_collation(Oid collation)
* catalog access is available, so we can't call
* pg_newlocale_from_collation().
*/
+		static struct pg_locale_struct dummy_locale = {.ctype_is_c = true};
+
+		locale = &dummy_locale;
}
else
{
@@ -264,121 +253,80 @@ pg_set_regex_collation(Oid collation)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("nondeterministic collations are not supported for regular expressions")));
[...]
}
pg_regex_locale = locale;
}
Hmm, is it valid to make pg_regex_locale point to a function-local
static here? The lifetime of this static is not clear to me, and I
think this pattern works with at least some compilers, but I remember
comments on previous patch review threads that this pattern isn't
kosher.

I think this must be okay. Some classic non-thread-safe C library
functions essentially work that way, e.g.,

char *strerror(int errnum)
{
static char buf[...];
strcpy(buf, ....);
return buf;
}

and then you can use the return pointer wherever you want.

Tom Lane

tgl@sss.pgh.pa.us

about 1 year ago

In reply to: Alvaro Herrera (#2)

Re: simplify regular expression locale global variables

Alvaro Herrera <alvherre@alvh.no-ip.org> writes:

Hmm, is it valid to make pg_regex_locale point to a function-local
static here? The lifetime of this static is not clear to me, and I
think this pattern works with at least some compilers, but I remember
comments on previous patch review threads that this pattern isn't
kosher.

We use function-local statics in other places, and I have never
heard that it's not kosher. There would be little point in
declaring such a variable static at all if that didn't cause
it to have persistent storage.

regards, tom lane

Tom Lane

tgl@sss.pgh.pa.us

about 1 year ago

In reply to: Peter Eisentraut (#1)

Re: simplify regular expression locale global variables

Peter Eisentraut <peter@eisentraut.org> writes:

but after the recent improvements to pg_locale_t handling, we don't need
all three anymore. All the information we have is contained in
pg_locale_t, so we just need to keep that one. This allows us to
structure the locale-using regular expression code more similar to other
locale-using code, mainly by provider, avoiding another layer that is
specific only to the regular expression code. The first patch
implements that.

I didn't read that patch in detail; somebody who's more familiar than
I with the recent locale-code changes ought to read it and confirm
that no subtle behavioral changes are sneaking in. But +1 for
concept.

The second patch removes a call to pg_set_regex_collation() that I think
is unnecessary.

I think this is actively wrong. pg_regprefix is engaged in
determining whether there's a fixed prefix of the regex, which
at least involves a sort of symbolic execution. As an example,
whether '^x' has a fixed prefix surely depends on whether the locale
is case-insensitive. (It may be that we get such cases wrong today,
since pg_regprefix was written before we had ICU locales and I don't
know if anyone has revisited it with this in mind. But removing this
pg_set_regex_collation call is surely not going to make that better.
In any case, the gain of removing it must be microscopic.)

(I don't have any plans to get rid of the remaining global variable.
That would certainly be nice from an intellectual point of view, but
fiddling this into the regular expression code looks quite messy. In
any case, it's probably easier with one variable instead of three, if
someone wants to try.)

Yeah. Those global variables are my fault. I did try hard to avoid
having them, but came to the same conclusion that it was not worth
contorting the regex code to pass a locale pointer through it.
Maybe if we ever completely give up on maintaining code similarity
with the Tcl version, we should just bull ahead and do that; but for
now I don't want to.

regards, tom lane

Peter Eisentraut

peter@eisentraut.org

about 1 year ago

In reply to: Tom Lane (#5)

1 attachment(s)

Re: simplify regular expression locale global variables

On 15.10.24 17:04, Tom Lane wrote:

Peter Eisentraut <peter@eisentraut.org> writes:

but after the recent improvements to pg_locale_t handling, we don't need
all three anymore. All the information we have is contained in
pg_locale_t, so we just need to keep that one. This allows us to
structure the locale-using regular expression code more similar to other
locale-using code, mainly by provider, avoiding another layer that is
specific only to the regular expression code. The first patch
implements that.

I didn't read that patch in detail; somebody who's more familiar than
I with the recent locale-code changes ought to read it and confirm
that no subtle behavioral changes are sneaking in. But +1 for
concept.

Ok, I'll wait for someone to give it a detailed review.

The second patch removes a call to pg_set_regex_collation() that I think
is unnecessary.

I think this is actively wrong. pg_regprefix is engaged in
determining whether there's a fixed prefix of the regex, which
at least involves a sort of symbolic execution. As an example,
whether '^x' has a fixed prefix surely depends on whether the locale
is case-insensitive.

Hmm, okay, I'll leave this out for now and maybe come back to it later.
For the time being, here is a new patch with this part omitted.

Attachments:

v2-0001-Remove-pg_regex_collation-and-pg_regex_strategy.patch (text/plain; charset=UTF-8; name=v2-0001-Remove-pg_regex_collation-and-pg_regex_strategy.patch) [Download]

From 817a47ac865a0aa3a99a79d9bc2bec951d6f2a6e Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <peter@eisentraut.org>
Date: Mon, 21 Oct 2024 07:24:50 +0200
Subject: [PATCH v2] Remove pg_regex_collation and pg_regex_strategy

We don't need three global variables to describe the locale strategy
for regular expressions.  We only need to keep pg_regex_locale.  This
works now because pg_locale_t now contains all the required
information (such as a ctype_is_c field).  This allows us to structure
the locale-using regular expression code more similar to other
locale-using code, mainly by provider, avoiding another layer that is
specific only to the regular expression code.

Discussion: https://www.postgresql.org/message-id/flat/b1b92ae1-2e06-4619-a87a-4b4858e547ec%40eisentraut.org
---
 src/backend/regex/regc_pg_locale.c | 430 +++++++++++++----------------
 src/tools/pgindent/typedefs.list   |   1 -
 2 files changed, 185 insertions(+), 246 deletions(-)

diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c
index b75784b6ce5..4691e796385 100644
--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
@@ -63,18 +63,7 @@
  * NB: the coding here assumes pg_wchar is an unsigned type.
  */
 
-typedef enum
-{
-	PG_REGEX_STRATEGY_C,		/* C locale (encoding independent) */
-	PG_REGEX_STRATEGY_BUILTIN,	/* built-in Unicode semantics */
-	PG_REGEX_STRATEGY_LIBC_WIDE,	/* Use locale_t <wctype.h> functions */
-	PG_REGEX_STRATEGY_LIBC_1BYTE,	/* Use locale_t <ctype.h> functions */
-	PG_REGEX_STRATEGY_ICU,		/* Use ICU uchar.h functions */
-} PG_Locale_Strategy;
-
-static PG_Locale_Strategy pg_regex_strategy;
 static pg_locale_t pg_regex_locale;
-static Oid	pg_regex_collation;
 
 /*
  * Hard-wired character properties for C locale
@@ -232,7 +221,6 @@ void
 pg_set_regex_collation(Oid collation)
 {
 	pg_locale_t locale = 0;
-	PG_Locale_Strategy strategy;
 
 	if (!OidIsValid(collation))
 	{
@@ -253,8 +241,9 @@ pg_set_regex_collation(Oid collation)
 		 * catalog access is available, so we can't call
 		 * pg_newlocale_from_collation().
 		 */
-		strategy = PG_REGEX_STRATEGY_C;
-		collation = C_COLLATION_OID;
+		static struct pg_locale_struct dummy_locale = {.ctype_is_c = true};
+
+		locale = &dummy_locale;
 	}
 	else
 	{
@@ -264,121 +253,80 @@ pg_set_regex_collation(Oid collation)
 			ereport(ERROR,
 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 					 errmsg("nondeterministic collations are not supported for regular expressions")));
-
-		if (locale->ctype_is_c)
-		{
-			/*
-			 * C/POSIX collations use this path regardless of database
-			 * encoding
-			 */
-			strategy = PG_REGEX_STRATEGY_C;
-			locale = 0;
-			collation = C_COLLATION_OID;
-		}
-		else if (locale->provider == COLLPROVIDER_BUILTIN)
-		{
-			Assert(GetDatabaseEncoding() == PG_UTF8);
-			strategy = PG_REGEX_STRATEGY_BUILTIN;
-		}
-#ifdef USE_ICU
-		else if (locale->provider == COLLPROVIDER_ICU)
-		{
-			strategy = PG_REGEX_STRATEGY_ICU;
-		}
-#endif
-		else
-		{
-			Assert(locale->provider == COLLPROVIDER_LIBC);
-			if (GetDatabaseEncoding() == PG_UTF8)
-				strategy = PG_REGEX_STRATEGY_LIBC_WIDE;
-			else
-				strategy = PG_REGEX_STRATEGY_LIBC_1BYTE;
-		}
 	}
 
-	pg_regex_strategy = strategy;
 	pg_regex_locale = locale;
-	pg_regex_collation = collation;
 }
 
 static int
 pg_wc_isdigit(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISDIGIT));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_isdigit(c, true);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISDIGIT));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isdigit(c, true);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					isdigit_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_isdigit(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_isdigit(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static int
 pg_wc_isalpha(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALPHA));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_isalpha(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISALPHA));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isalpha(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					isalpha_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_isalpha(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_isalpha(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static int
 pg_wc_isalnum(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALNUM));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_isalnum(c, true);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISALNUM));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isalnum(c, true);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					isalnum_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_isalnum(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_isalnum(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
@@ -394,218 +342,206 @@ pg_wc_isword(pg_wchar c)
 static int
 pg_wc_isupper(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISUPPER));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_isupper(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISUPPER));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isupper(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					isupper_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_isupper(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_isupper(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static int
 pg_wc_islower(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISLOWER));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_islower(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISLOWER));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_islower(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					islower_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_islower(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_islower(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static int
 pg_wc_isgraph(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISGRAPH));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_isgraph(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISGRAPH));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isgraph(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					isgraph_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_isgraph(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_isgraph(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static int
 pg_wc_isprint(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPRINT));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_isprint(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISPRINT));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isprint(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					isprint_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_isprint(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_isprint(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static int
 pg_wc_ispunct(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPUNCT));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_ispunct(c, true);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISPUNCT));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_ispunct(c, true);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					ispunct_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_ispunct(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_ispunct(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static int
 pg_wc_isspace(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISSPACE));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_isspace(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISSPACE));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isspace(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					isspace_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_isspace(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_isspace(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static pg_wchar
 pg_wc_toupper(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
 	{
-		case PG_REGEX_STRATEGY_C:
-			if (c <= (pg_wchar) 127)
-				return pg_ascii_toupper((unsigned char) c);
-			return c;
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return unicode_uppercase_simple(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return towupper_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (c <= (pg_wchar) 127)
+			return pg_ascii_toupper((unsigned char) c);
+		return c;
+	}
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return unicode_uppercase_simple(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return towupper_l((wint_t) c, pg_regex_locale->info.lt);
+		else
+		{
 			if (c <= (pg_wchar) UCHAR_MAX)
 				return toupper_l((unsigned char) c, pg_regex_locale->info.lt);
 			return c;
-		case PG_REGEX_STRATEGY_ICU:
+		}
+	}
 #ifdef USE_ICU
-			return u_toupper(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_toupper(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static pg_wchar
 pg_wc_tolower(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
 	{
-		case PG_REGEX_STRATEGY_C:
-			if (c <= (pg_wchar) 127)
-				return pg_ascii_tolower((unsigned char) c);
-			return c;
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return unicode_lowercase_simple(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return towlower_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (c <= (pg_wchar) 127)
+			return pg_ascii_tolower((unsigned char) c);
+		return c;
+	}
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return unicode_lowercase_simple(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return towlower_l((wint_t) c, pg_regex_locale->info.lt);
+		else
+		{
 			if (c <= (pg_wchar) UCHAR_MAX)
 				return tolower_l((unsigned char) c, pg_regex_locale->info.lt);
 			return c;
-		case PG_REGEX_STRATEGY_ICU:
+		}
+	}
 #ifdef USE_ICU
-			return u_tolower(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_tolower(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
@@ -628,7 +564,7 @@ typedef int (*pg_wc_probefunc) (pg_wchar c);
 typedef struct pg_ctype_cache
 {
 	pg_wc_probefunc probefunc;	/* pg_wc_isalpha or a sibling */
-	Oid			collation;		/* collation this entry is for */
+	pg_locale_t locale;			/* locale this entry is for */
 	struct cvec cv;				/* cache entry contents */
 	struct pg_ctype_cache *next;	/* chain link */
 } pg_ctype_cache;
@@ -697,7 +633,7 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
 	for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
 	{
 		if (pcc->probefunc == probefunc &&
-			pcc->collation == pg_regex_collation)
+			pcc->locale == pg_regex_locale)
 			return &pcc->cv;
 	}
 
@@ -708,7 +644,7 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
 	if (pcc == NULL)
 		return NULL;
 	pcc->probefunc = probefunc;
-	pcc->collation = pg_regex_collation;
+	pcc->locale = pg_regex_locale;
 	pcc->cv.nchrs = 0;
 	pcc->cv.chrspace = 128;
 	pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
@@ -732,37 +668,41 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
 	 * would always be true for production values of MAX_SIMPLE_CHR, but it's
 	 * useful to allow it to be small for testing purposes.)
 	 */
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
 	{
-		case PG_REGEX_STRATEGY_C:
 #if MAX_SIMPLE_CHR >= 127
-			max_chr = (pg_wchar) 127;
-			pcc->cv.cclasscode = -1;
+		max_chr = (pg_wchar) 127;
+		pcc->cv.cclasscode = -1;
 #else
-			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+		max_chr = (pg_wchar) MAX_SIMPLE_CHR;
 #endif
-			break;
-		case PG_REGEX_STRATEGY_BUILTIN:
-			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
-			break;
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
+	}
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+	{
+		max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+	}
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8)
 			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
-			break;
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		else
+		{
 #if MAX_SIMPLE_CHR >= UCHAR_MAX
 			max_chr = (pg_wchar) UCHAR_MAX;
 			pcc->cv.cclasscode = -1;
 #else
 			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
 #endif
-			break;
-		case PG_REGEX_STRATEGY_ICU:
-			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
-			break;
-		default:
-			Assert(false);
-			max_chr = 0;		/* can't get here, but keep compiler quiet */
-			break;
+		}
+	}
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+	{
+		max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+	}
+	else
+	{
+		Assert(false);
+		max_chr = 0;			/* can't get here, but keep compiler quiet */
 	}
 
 	/*
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 57de1acff3a..bbc1ac179e8 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1821,7 +1821,6 @@ PGTargetServerType
 PGTernaryBool
 PGTransactionStatusType
 PGVerbosity
-PG_Locale_Strategy
 PG_Lock_Status
 PG_init_t
 PGcancel

base-commit: 57a36e890d3d60e6408bf6805e91c82f7b370873
-- 
2.47.0

Andreas Karlsson

andreas@proxel.se

about 1 year ago

In reply to: Peter Eisentraut (#1)

Re: simplify regular expression locale global variables

On 10/15/24 8:12 AM, Peter Eisentraut wrote:

We currently have

    static PG_Locale_Strategy pg_regex_strategy;
    static pg_locale_t pg_regex_locale;
    static Oid pg_regex_collation;

but after the recent improvements to pg_locale_t handling, we don't need
all three anymore. All the information we have is contained in
pg_locale_t, so we just need to keep that one. This allows us to
structure the locale-using regular expression code more similar to other
locale-using code, mainly by provider, avoiding another layer that is
specific only to the regular expression code. The first patch
implements that.

Jeff Davis has a patch which also fixes this while refactoring other
stuff too which I prefer over your patch since it also cleans up the
collation code in general.

https://www.postgresql.org/message-id/2830211e1b6e6a2e26d845780b03e125281ea17b.camel%40j-davis.com

The second patch removes a call to pg_set_regex_collation() that I think
is unnecessary.

The third patch adds a pg_unset_regex_collation() call that undoes what
pg_set_regex_collation() does. I mainly used this to verify the second
patch, but maybe it's also useful on its own, not sure.

(I don't have any plans to get rid of the remaining global variable.
That would certainly be nice from an intellectual point of view, but
fiddling this into the regular expression code looks quite messy. In
any case, it's probably easier with one variable instead of three, if
someone wants to try.)

I have not looked at your other two patches yet.

Andreas

Peter Eisentraut

peter@eisentraut.org

about 1 year ago

In reply to: Andreas Karlsson (#7)

2 attachment(s)

Re: simplify regular expression locale global variables

On 25.10.24 10:16, Andreas Karlsson wrote:

On 10/15/24 8:12 AM, Peter Eisentraut wrote:

We currently have

     static PG_Locale_Strategy pg_regex_strategy;
     static pg_locale_t pg_regex_locale;
     static Oid pg_regex_collation;

but after the recent improvements to pg_locale_t handling, we don't
need all three anymore. All the information we have is contained in
pg_locale_t, so we just need to keep that one. This allows us to
structure the locale-using regular expression code more similar to
other locale-using code, mainly by provider, avoiding another layer
that is specific only to the regular expression code. The first patch
implements that.

Jeff Davis has a patch which also fixes this while refactoring other
stuff too which I prefer over your patch since it also cleans up the
collation code in general.

https://www.postgresql.org/message-id/2830211e1b6e6a2e26d845780b03e125281ea17b.camel%40j-davis.com

That patch set looks like a good direction.

But it doesn't remove pg_regex_collation, only pg_regex_strategy. So I
have split my v2 into two patches, the first removes pg_regex_collation
and the second removes pg_regex_strategy. The first patch is useful on
its own, I think; the second one will presumably be replaced by the
other patch series above.

Attachments:

v3-0001-Remove-pg_regex_collation.patchtext/plain; charset=UTF-8; name=v3-0001-Remove-pg_regex_collation.patchDownload

From ab19b4d7ab03ba8c515da3e4b389d41941c5ab27 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <peter@eisentraut.org>
Date: Tue, 3 Dec 2024 16:58:38 +0100
Subject: [PATCH v3 1/2] Remove pg_regex_collation

We can also use the existing pg_regex_locale as the cache key, which
is the only use of this variable.

Discussion: https://www.postgresql.org/message-id/flat/b1b92ae1-2e06-4619-a87a-4b4858e547ec%40eisentraut.org
---
 src/backend/regex/regc_pg_locale.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c
index b75784b6ce5..e07d4a8868c 100644
--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
@@ -74,7 +74,6 @@ typedef enum
 
 static PG_Locale_Strategy pg_regex_strategy;
 static pg_locale_t pg_regex_locale;
-static Oid	pg_regex_collation;
 
 /*
  * Hard-wired character properties for C locale
@@ -254,7 +253,7 @@ pg_set_regex_collation(Oid collation)
 		 * pg_newlocale_from_collation().
 		 */
 		strategy = PG_REGEX_STRATEGY_C;
-		collation = C_COLLATION_OID;
+		locale = 0;
 	}
 	else
 	{
@@ -273,7 +272,6 @@ pg_set_regex_collation(Oid collation)
 			 */
 			strategy = PG_REGEX_STRATEGY_C;
 			locale = 0;
-			collation = C_COLLATION_OID;
 		}
 		else if (locale->provider == COLLPROVIDER_BUILTIN)
 		{
@@ -298,7 +296,6 @@ pg_set_regex_collation(Oid collation)
 
 	pg_regex_strategy = strategy;
 	pg_regex_locale = locale;
-	pg_regex_collation = collation;
 }
 
 static int
@@ -628,7 +625,7 @@ typedef int (*pg_wc_probefunc) (pg_wchar c);
 typedef struct pg_ctype_cache
 {
 	pg_wc_probefunc probefunc;	/* pg_wc_isalpha or a sibling */
-	Oid			collation;		/* collation this entry is for */
+	pg_locale_t locale;			/* locale this entry is for */
 	struct cvec cv;				/* cache entry contents */
 	struct pg_ctype_cache *next;	/* chain link */
 } pg_ctype_cache;
@@ -697,7 +694,7 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
 	for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
 	{
 		if (pcc->probefunc == probefunc &&
-			pcc->collation == pg_regex_collation)
+			pcc->locale == pg_regex_locale)
 			return &pcc->cv;
 	}
 
@@ -708,7 +705,7 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
 	if (pcc == NULL)
 		return NULL;
 	pcc->probefunc = probefunc;
-	pcc->collation = pg_regex_collation;
+	pcc->locale = pg_regex_locale;
 	pcc->cv.nchrs = 0;
 	pcc->cv.chrspace = 128;
 	pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));

base-commit: 1ba0782ce90cb4261098de59b49ae5cb2326566b
-- 
2.47.1

v3-0002-Remove-pg_regex_strategy.patchtext/plain; charset=UTF-8; name=v3-0002-Remove-pg_regex_strategy.patchDownload

From 3406390429bbb27931b1e99f9be410213fe05da4 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <peter@eisentraut.org>
Date: Tue, 3 Dec 2024 16:59:51 +0100
Subject: [PATCH v3 2/2] Remove pg_regex_strategy

We only need to keep pg_regex_locale.  This works now because
pg_locale_t now contains all the required information (such as a
ctype_is_c field).  This allows us to structure the locale-using
regular expression code more similar to other locale-using code,
mainly by provider, avoiding another layer that is specific only to
the regular expression code.

Discussion: https://www.postgresql.org/message-id/flat/b1b92ae1-2e06-4619-a87a-4b4858e547ec%40eisentraut.org
---
 src/backend/regex/regc_pg_locale.c | 421 +++++++++++++----------------
 src/tools/pgindent/typedefs.list   |   1 -
 2 files changed, 182 insertions(+), 240 deletions(-)

diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c
index e07d4a8868c..4691e796385 100644
--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
@@ -63,16 +63,6 @@
  * NB: the coding here assumes pg_wchar is an unsigned type.
  */
 
-typedef enum
-{
-	PG_REGEX_STRATEGY_C,		/* C locale (encoding independent) */
-	PG_REGEX_STRATEGY_BUILTIN,	/* built-in Unicode semantics */
-	PG_REGEX_STRATEGY_LIBC_WIDE,	/* Use locale_t <wctype.h> functions */
-	PG_REGEX_STRATEGY_LIBC_1BYTE,	/* Use locale_t <ctype.h> functions */
-	PG_REGEX_STRATEGY_ICU,		/* Use ICU uchar.h functions */
-} PG_Locale_Strategy;
-
-static PG_Locale_Strategy pg_regex_strategy;
 static pg_locale_t pg_regex_locale;
 
 /*
@@ -231,7 +221,6 @@ void
 pg_set_regex_collation(Oid collation)
 {
 	pg_locale_t locale = 0;
-	PG_Locale_Strategy strategy;
 
 	if (!OidIsValid(collation))
 	{
@@ -252,8 +241,9 @@ pg_set_regex_collation(Oid collation)
 		 * catalog access is available, so we can't call
 		 * pg_newlocale_from_collation().
 		 */
-		strategy = PG_REGEX_STRATEGY_C;
-		locale = 0;
+		static struct pg_locale_struct dummy_locale = {.ctype_is_c = true};
+
+		locale = &dummy_locale;
 	}
 	else
 	{
@@ -263,119 +253,80 @@ pg_set_regex_collation(Oid collation)
 			ereport(ERROR,
 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 					 errmsg("nondeterministic collations are not supported for regular expressions")));
-
-		if (locale->ctype_is_c)
-		{
-			/*
-			 * C/POSIX collations use this path regardless of database
-			 * encoding
-			 */
-			strategy = PG_REGEX_STRATEGY_C;
-			locale = 0;
-		}
-		else if (locale->provider == COLLPROVIDER_BUILTIN)
-		{
-			Assert(GetDatabaseEncoding() == PG_UTF8);
-			strategy = PG_REGEX_STRATEGY_BUILTIN;
-		}
-#ifdef USE_ICU
-		else if (locale->provider == COLLPROVIDER_ICU)
-		{
-			strategy = PG_REGEX_STRATEGY_ICU;
-		}
-#endif
-		else
-		{
-			Assert(locale->provider == COLLPROVIDER_LIBC);
-			if (GetDatabaseEncoding() == PG_UTF8)
-				strategy = PG_REGEX_STRATEGY_LIBC_WIDE;
-			else
-				strategy = PG_REGEX_STRATEGY_LIBC_1BYTE;
-		}
 	}
 
-	pg_regex_strategy = strategy;
 	pg_regex_locale = locale;
 }
 
 static int
 pg_wc_isdigit(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISDIGIT));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_isdigit(c, true);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISDIGIT));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isdigit(c, true);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					isdigit_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_isdigit(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_isdigit(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static int
 pg_wc_isalpha(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALPHA));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_isalpha(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISALPHA));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isalpha(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					isalpha_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_isalpha(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_isalpha(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static int
 pg_wc_isalnum(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISALNUM));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_isalnum(c, true);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISALNUM));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isalnum(c, true);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					isalnum_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_isalnum(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_isalnum(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
@@ -391,218 +342,206 @@ pg_wc_isword(pg_wchar c)
 static int
 pg_wc_isupper(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISUPPER));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_isupper(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISUPPER));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isupper(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					isupper_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_isupper(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_isupper(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static int
 pg_wc_islower(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISLOWER));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_islower(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISLOWER));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_islower(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					islower_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_islower(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_islower(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static int
 pg_wc_isgraph(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISGRAPH));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_isgraph(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISGRAPH));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isgraph(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					isgraph_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_isgraph(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_isgraph(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static int
 pg_wc_isprint(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPRINT));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_isprint(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISPRINT));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isprint(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					isprint_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_isprint(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_isprint(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static int
 pg_wc_ispunct(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISPUNCT));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_ispunct(c, true);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISPUNCT));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_ispunct(c, true);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					ispunct_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_ispunct(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_ispunct(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static int
 pg_wc_isspace(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (pg_char_properties[c] & PG_ISSPACE));
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return pg_u_isspace(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
 	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISSPACE));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isspace(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
+		else
 			return (c <= (pg_wchar) UCHAR_MAX &&
 					isspace_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
+	}
 #ifdef USE_ICU
-			return u_isspace(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_isspace(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static pg_wchar
 pg_wc_toupper(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
 	{
-		case PG_REGEX_STRATEGY_C:
-			if (c <= (pg_wchar) 127)
-				return pg_ascii_toupper((unsigned char) c);
-			return c;
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return unicode_uppercase_simple(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return towupper_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (c <= (pg_wchar) 127)
+			return pg_ascii_toupper((unsigned char) c);
+		return c;
+	}
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return unicode_uppercase_simple(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return towupper_l((wint_t) c, pg_regex_locale->info.lt);
+		else
+		{
 			if (c <= (pg_wchar) UCHAR_MAX)
 				return toupper_l((unsigned char) c, pg_regex_locale->info.lt);
 			return c;
-		case PG_REGEX_STRATEGY_ICU:
+		}
+	}
 #ifdef USE_ICU
-			return u_toupper(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_toupper(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
 static pg_wchar
 pg_wc_tolower(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
 	{
-		case PG_REGEX_STRATEGY_C:
-			if (c <= (pg_wchar) 127)
-				return pg_ascii_tolower((unsigned char) c);
-			return c;
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return unicode_lowercase_simple(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return towlower_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		if (c <= (pg_wchar) 127)
+			return pg_ascii_tolower((unsigned char) c);
+		return c;
+	}
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+		return unicode_lowercase_simple(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF))
+			return towlower_l((wint_t) c, pg_regex_locale->info.lt);
+		else
+		{
 			if (c <= (pg_wchar) UCHAR_MAX)
 				return tolower_l((unsigned char) c, pg_regex_locale->info.lt);
 			return c;
-		case PG_REGEX_STRATEGY_ICU:
+		}
+	}
 #ifdef USE_ICU
-			return u_tolower(c);
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+		return u_tolower(c);
 #endif
-			break;
-	}
+
 	return 0;					/* can't get here, but keep compiler quiet */
 }
 
@@ -729,37 +668,41 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
 	 * would always be true for production values of MAX_SIMPLE_CHR, but it's
 	 * useful to allow it to be small for testing purposes.)
 	 */
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
 	{
-		case PG_REGEX_STRATEGY_C:
 #if MAX_SIMPLE_CHR >= 127
-			max_chr = (pg_wchar) 127;
-			pcc->cv.cclasscode = -1;
+		max_chr = (pg_wchar) 127;
+		pcc->cv.cclasscode = -1;
 #else
-			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+		max_chr = (pg_wchar) MAX_SIMPLE_CHR;
 #endif
-			break;
-		case PG_REGEX_STRATEGY_BUILTIN:
-			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
-			break;
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
+	}
+	else if (pg_regex_locale->provider == COLLPROVIDER_BUILTIN)
+	{
+		max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+	}
+	else if (pg_regex_locale->provider == COLLPROVIDER_LIBC)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8)
 			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
-			break;
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+		else
+		{
 #if MAX_SIMPLE_CHR >= UCHAR_MAX
 			max_chr = (pg_wchar) UCHAR_MAX;
 			pcc->cv.cclasscode = -1;
 #else
 			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
 #endif
-			break;
-		case PG_REGEX_STRATEGY_ICU:
-			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
-			break;
-		default:
-			Assert(false);
-			max_chr = 0;		/* can't get here, but keep compiler quiet */
-			break;
+		}
+	}
+	else if (pg_regex_locale->provider == COLLPROVIDER_ICU)
+	{
+		max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+	}
+	else
+	{
+		Assert(false);
+		max_chr = 0;			/* can't get here, but keep compiler quiet */
 	}
 
 	/*
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 2d4c870423a..94b041ec9e9 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1823,7 +1823,6 @@ PGTargetServerType
 PGTernaryBool
 PGTransactionStatusType
 PGVerbosity
-PG_Locale_Strategy
 PG_Lock_Status
 PG_init_t
 PGcancel
-- 
2.47.1

Jeff Davis

pgsql@j-davis.com

about 1 year ago

In reply to: Peter Eisentraut (#8)

Re: simplify regular expression locale global variables

On Tue, 2024-12-03 at 17:06 +0100, Peter Eisentraut wrote:

But it doesn't remove pg_regex_collation, only pg_regex_strategy. So I
have split my v2 into two patches, the first removes pg_regex_collation
and the second removes pg_regex_strategy. The first patch is useful on
its own, I think;

+1, looks committable now.

the second one will presumably be replaced by the
other patch series above.

Sounds good.

Regards,
Jeff Davis

#10

Peter Eisentraut

peter@eisentraut.org

about 1 year ago

In reply to: Jeff Davis (#9)

Re: simplify regular expression locale global variables

On 03.12.24 20:19, Jeff Davis wrote:

On Tue, 2024-12-03 at 17:06 +0100, Peter Eisentraut wrote:

But it doesn't remove pg_regex_collation, only pg_regex_strategy. So I
have split my v2 into two patches, the first removes pg_regex_collation
and the second removes pg_regex_strategy. The first patch is useful on
its own, I think;

+1, looks committable now.

done