From dfdc402eac48c248f2a70edea91d57989a1af6f1 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Thu, 26 Sep 2024 14:30:07 -0700
Subject: [PATCH v5 7/8] Control ctype behavior with a method table.

Previously, ctype behavior (pattern matching) branched based on the
provider.

A method table is less error-prone and easier to hook.
---
 src/backend/regex/regc_pg_locale.c     | 378 +++++--------------------
 src/backend/utils/adt/pg_locale.c      |  62 ++++
 src/backend/utils/adt/pg_locale_icu.c  |  45 +++
 src/backend/utils/adt/pg_locale_libc.c | 169 +++++++++++
 src/include/utils/pg_locale.h          |  23 ++
 5 files changed, 373 insertions(+), 304 deletions(-)

diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c
index b75784b6ce5..d256e7be660 100644
--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
@@ -63,33 +63,18 @@
  * NB: the coding here assumes pg_wchar is an unsigned type.
  */
 
-typedef enum
-{
-	PG_REGEX_STRATEGY_C,		/* C locale (encoding independent) */
-	PG_REGEX_STRATEGY_BUILTIN,	/* built-in Unicode semantics */
-	PG_REGEX_STRATEGY_LIBC_WIDE,	/* Use locale_t <wctype.h> functions */
-	PG_REGEX_STRATEGY_LIBC_1BYTE,	/* Use locale_t <ctype.h> functions */
-	PG_REGEX_STRATEGY_ICU,		/* Use ICU uchar.h functions */
-} PG_Locale_Strategy;
-
-static PG_Locale_Strategy pg_regex_strategy;
 static pg_locale_t pg_regex_locale;
 static Oid	pg_regex_collation;
 
+static struct pg_locale_struct dummy_c_locale = {
+	.collate_is_c = true,
+	.ctype_is_c = true,
+};
+
 /*
  * Hard-wired character properties for C locale
  */
-#define PG_ISDIGIT	0x01
-#define PG_ISALPHA	0x02
-#define PG_ISALNUM	(PG_ISDIGIT | PG_ISALPHA)
-#define PG_ISUPPER	0x04
-#define PG_ISLOWER	0x08
-#define PG_ISGRAPH	0x10
-#define PG_ISPRINT	0x20
-#define PG_ISPUNCT	0x40
-#define PG_ISSPACE	0x80
-
-static const unsigned char pg_char_properties[128] = {
+static const unsigned char char_properties_tbl[128] = {
 	 /* NUL */ 0,
 	 /* ^A */ 0,
 	 /* ^B */ 0,
@@ -232,7 +217,6 @@ void
 pg_set_regex_collation(Oid collation)
 {
 	pg_locale_t locale = 0;
-	PG_Locale_Strategy strategy;
 
 	if (!OidIsValid(collation))
 	{
@@ -253,8 +237,8 @@ pg_set_regex_collation(Oid collation)
 		 * catalog access is available, so we can't call
 		 * pg_newlocale_from_collation().
 		 */
-		strategy = PG_REGEX_STRATEGY_C;
 		collation = C_COLLATION_OID;
+		locale = &dummy_c_locale;
 	}
 	else
 	{
@@ -271,32 +255,11 @@ pg_set_regex_collation(Oid collation)
 			 * C/POSIX collations use this path regardless of database
 			 * encoding
 			 */
-			strategy = PG_REGEX_STRATEGY_C;
-			locale = 0;
+			locale = &dummy_c_locale;
 			collation = C_COLLATION_OID;
 		}
-		else if (locale->provider == COLLPROVIDER_BUILTIN)
-		{
-			Assert(GetDatabaseEncoding() == PG_UTF8);
-			strategy = PG_REGEX_STRATEGY_BUILTIN;
-		}
-#ifdef USE_ICU
-		else if (locale->provider == COLLPROVIDER_ICU)
-		{
-			strategy = PG_REGEX_STRATEGY_ICU;
-		}
-#endif
-		else
-		{
-			Assert(locale->provider == COLLPROVIDER_LIBC);
-			if (GetDatabaseEncoding() == PG_UTF8)
-				strategy = PG_REGEX_STRATEGY_LIBC_WIDE;
-			else
-				strategy = PG_REGEX_STRATEGY_LIBC_1BYTE;
-		}
 	}
 
-	pg_regex_strategy = strategy;
 	pg_regex_locale = locale;
 	pg_regex_collation = collation;
 }
@@ -304,82 +267,31 @@ pg_set_regex_collation(Oid collation)
 static int
 pg_wc_isdigit(pg_wchar c)
 {
-	switch (pg_regex_strategy)
-	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISDIGIT));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isdigit(c, true);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			return (c <= (pg_wchar) UCHAR_MAX &&
-					isdigit_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_isdigit(c);
-#endif
-			break;
-	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 &&
+				(char_properties_tbl[c] & PG_ISDIGIT));
+	else
+		return char_properties(c, PG_ISDIGIT, pg_regex_locale) != 0;
 }
 
 static int
 pg_wc_isalpha(pg_wchar c)
 {
-	switch (pg_regex_strategy)
-	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISALPHA));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isalpha(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			return (c <= (pg_wchar) UCHAR_MAX &&
-					isalpha_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_isalpha(c);
-#endif
-			break;
-	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 &&
+				(char_properties_tbl[c] & PG_ISALPHA));
+	else
+		return char_properties(c, PG_ISALPHA, pg_regex_locale) != 0;
 }
 
 static int
 pg_wc_isalnum(pg_wchar c)
 {
-	switch (pg_regex_strategy)
-	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISALNUM));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isalnum(c, true);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			return (c <= (pg_wchar) UCHAR_MAX &&
-					isalnum_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_isalnum(c);
-#endif
-			break;
-	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	/* PG_ISALNUM is PG_ISDIGIT | PG_ISALPHA, so one mask serves both paths */
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 && (char_properties_tbl[c] & PG_ISALNUM));
+	else
+		return char_properties(c, PG_ISALNUM, pg_regex_locale) != 0;
 }
 
 static int
@@ -394,219 +306,87 @@ pg_wc_isword(pg_wchar c)
 static int
 pg_wc_isupper(pg_wchar c)
 {
-	switch (pg_regex_strategy)
-	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISUPPER));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isupper(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			return (c <= (pg_wchar) UCHAR_MAX &&
-					isupper_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_isupper(c);
-#endif
-			break;
-	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 &&
+				(char_properties_tbl[c] & PG_ISUPPER));
+	else
+		return char_properties(c, PG_ISUPPER, pg_regex_locale) != 0;
 }
 
 static int
 pg_wc_islower(pg_wchar c)
 {
-	switch (pg_regex_strategy)
-	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISLOWER));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_islower(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			return (c <= (pg_wchar) UCHAR_MAX &&
-					islower_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_islower(c);
-#endif
-			break;
-	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 &&
+				(char_properties_tbl[c] & PG_ISLOWER));
+	else
+		return char_properties(c, PG_ISLOWER, pg_regex_locale) != 0;
 }
 
 static int
 pg_wc_isgraph(pg_wchar c)
 {
-	switch (pg_regex_strategy)
-	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISGRAPH));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isgraph(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			return (c <= (pg_wchar) UCHAR_MAX &&
-					isgraph_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_isgraph(c);
-#endif
-			break;
-	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 &&
+				(char_properties_tbl[c] & PG_ISGRAPH));
+	else
+		return char_properties(c, PG_ISGRAPH, pg_regex_locale) != 0;
 }
 
 static int
 pg_wc_isprint(pg_wchar c)
 {
-	switch (pg_regex_strategy)
-	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISPRINT));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isprint(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			return (c <= (pg_wchar) UCHAR_MAX &&
-					isprint_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_isprint(c);
-#endif
-			break;
-	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 &&
+				(char_properties_tbl[c] & PG_ISPRINT));
+	else
+		return char_properties(c, PG_ISPRINT, pg_regex_locale) != 0;
 }
 
 static int
 pg_wc_ispunct(pg_wchar c)
 {
-	switch (pg_regex_strategy)
-	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISPUNCT));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_ispunct(c, true);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			return (c <= (pg_wchar) UCHAR_MAX &&
-					ispunct_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_ispunct(c);
-#endif
-			break;
-	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 &&
+				(char_properties_tbl[c] & PG_ISPUNCT));
+	else
+		return char_properties(c, PG_ISPUNCT, pg_regex_locale) != 0;
 }
 
 static int
 pg_wc_isspace(pg_wchar c)
 {
-	switch (pg_regex_strategy)
-	{
-		case PG_REGEX_STRATEGY_C:
-			return (c <= (pg_wchar) 127 &&
-					(pg_char_properties[c] & PG_ISSPACE));
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return pg_u_isspace(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			return (c <= (pg_wchar) UCHAR_MAX &&
-					isspace_l((unsigned char) c, pg_regex_locale->info.lt));
-			break;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_isspace(c);
-#endif
-			break;
-	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	if (pg_regex_locale->ctype_is_c)
+		return (c <= (pg_wchar) 127 &&
+				(char_properties_tbl[c] & PG_ISSPACE));
+	else
+		return char_properties(c, PG_ISSPACE, pg_regex_locale) != 0;
 }
 
 static pg_wchar
 pg_wc_toupper(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
 	{
-		case PG_REGEX_STRATEGY_C:
-			if (c <= (pg_wchar) 127)
-				return pg_ascii_toupper((unsigned char) c);
-			return c;
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return unicode_uppercase_simple(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return towupper_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			if (c <= (pg_wchar) UCHAR_MAX)
-				return toupper_l((unsigned char) c, pg_regex_locale->info.lt);
-			return c;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_toupper(c);
-#endif
-			break;
+		if (c <= (pg_wchar) 127)
+			return pg_ascii_toupper((unsigned char) c);
+		return c;
 	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	else
+		return pg_regex_locale->ctype->wc_toupper(c, pg_regex_locale);
 }
 
 static pg_wchar
 pg_wc_tolower(pg_wchar c)
 {
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
 	{
-		case PG_REGEX_STRATEGY_C:
-			if (c <= (pg_wchar) 127)
-				return pg_ascii_tolower((unsigned char) c);
-			return c;
-		case PG_REGEX_STRATEGY_BUILTIN:
-			return unicode_lowercase_simple(c);
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
-				return towlower_l((wint_t) c, pg_regex_locale->info.lt);
-			/* FALL THRU */
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
-			if (c <= (pg_wchar) UCHAR_MAX)
-				return tolower_l((unsigned char) c, pg_regex_locale->info.lt);
-			return c;
-		case PG_REGEX_STRATEGY_ICU:
-#ifdef USE_ICU
-			return u_tolower(c);
-#endif
-			break;
+		if (c <= (pg_wchar) 127)
+			return pg_ascii_tolower((unsigned char) c);
+		return c;
 	}
-	return 0;					/* can't get here, but keep compiler quiet */
+	else
+		return pg_regex_locale->ctype->wc_tolower(c, pg_regex_locale);
 }
 
 
@@ -732,37 +512,27 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
 	 * would always be true for production values of MAX_SIMPLE_CHR, but it's
 	 * useful to allow it to be small for testing purposes.)
 	 */
-	switch (pg_regex_strategy)
+	if (pg_regex_locale->ctype_is_c)
 	{
-		case PG_REGEX_STRATEGY_C:
 #if MAX_SIMPLE_CHR >= 127
 			max_chr = (pg_wchar) 127;
 			pcc->cv.cclasscode = -1;
 #else
 			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
 #endif
-			break;
-		case PG_REGEX_STRATEGY_BUILTIN:
-			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
-			break;
-		case PG_REGEX_STRATEGY_LIBC_WIDE:
-			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
-			break;
-		case PG_REGEX_STRATEGY_LIBC_1BYTE:
+	}
+	else
+	{
 #if MAX_SIMPLE_CHR >= UCHAR_MAX
+		if (pg_regex_locale->provider == COLLPROVIDER_LIBC &&
+			GetDatabaseEncoding() != PG_UTF8)
+		{
 			max_chr = (pg_wchar) UCHAR_MAX;
 			pcc->cv.cclasscode = -1;
-#else
-			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+		}
+		else
 #endif
-			break;
-		case PG_REGEX_STRATEGY_ICU:
 			max_chr = (pg_wchar) MAX_SIMPLE_CHR;
-			break;
-		default:
-			Assert(false);
-			max_chr = 0;		/* can't get here, but keep compiler quiet */
-			break;
 	}
 
 	/*
diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c
index a106478b119..12f8987065c 100644
--- a/src/backend/utils/adt/pg_locale.c
+++ b/src/backend/utils/adt/pg_locale.c
@@ -249,6 +249,50 @@ static struct casemap_methods casemap_methods_builtin = {
 	.strupper = strupper_builtin,
 };
 
+static int
+char_properties_builtin(pg_wchar wc, int mask, pg_locale_t locale)
+{
+	int result = 0;
+
+	if ((mask & PG_ISDIGIT) && pg_u_isdigit(wc, true))
+		result |= PG_ISDIGIT;
+	if ((mask & PG_ISALPHA) && pg_u_isalpha(wc))
+		result |= PG_ISALPHA;
+	if ((mask & PG_ISUPPER) && pg_u_isupper(wc))
+		result |= PG_ISUPPER;
+	if ((mask & PG_ISLOWER) && pg_u_islower(wc))
+		result |= PG_ISLOWER;
+	if ((mask & PG_ISGRAPH) && pg_u_isgraph(wc))
+		result |= PG_ISGRAPH;
+	if ((mask & PG_ISPRINT) && pg_u_isprint(wc))
+		result |= PG_ISPRINT;
+	if ((mask & PG_ISPUNCT) && pg_u_ispunct(wc, true))
+		result |= PG_ISPUNCT;
+	if ((mask & PG_ISSPACE) && pg_u_isspace(wc))
+		result |= PG_ISSPACE;
+
+	return result;
+}
+
+static pg_wchar
+wc_toupper_builtin(pg_wchar wc, pg_locale_t locale)
+{
+	return unicode_uppercase_simple(wc);
+}
+
+static pg_wchar
+wc_tolower_builtin(pg_wchar wc, pg_locale_t locale)
+{
+	return unicode_lowercase_simple(wc);
+}
+
+static struct ctype_methods ctype_methods_builtin = {
+	.char_properties = char_properties_builtin,
+	.wc_tolower = wc_tolower_builtin,
+	.wc_toupper = wc_toupper_builtin,
+};
+
+
 /*
  * POSIX doesn't define _l-variants of these functions, but several systems
  * have them.  We provide our own replacements here.
@@ -1319,6 +1363,8 @@ dat_create_locale_builtin(HeapTuple dattuple)
 	result->collate_is_c = true;
 	result->ctype_is_c = (strcmp(locstr, "C") == 0);
 	result->casemap = &casemap_methods_builtin;
+	if (!result->ctype_is_c)
+		result->ctype = &ctype_methods_builtin;
 
 	return result;
 }
@@ -1346,6 +1392,8 @@ coll_create_locale_builtin(HeapTuple colltuple, MemoryContext context)
 	result->collate_is_c = true;
 	result->ctype_is_c = (strcmp(locstr, "C") == 0);
 	result->casemap = &casemap_methods_builtin;
+	if (!result->ctype_is_c)
+		result->ctype = &ctype_methods_builtin;
 
 	return result;
 }
@@ -1773,6 +1821,20 @@ pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src,
 	return locale->collate->strnxfrm_prefix(dest, destsize, src, srclen, locale);
 }
 
+/*
+ * char_properties()
+ *
+ * Return the subset of the property bits in 'mask' that hold for 'wc'.  The
+ * locale must have a ctype method table, i.e. it must not be ctype_is_c.
+ *
+ * XXX: add caching?
+ */
+int
+char_properties(pg_wchar wc, int mask, pg_locale_t locale)
+{
+	return locale->ctype->char_properties(wc, mask, locale);
+}
+
 /*
  * Return required encoding ID for the given locale, or -1 if any encoding is
  * valid for the locale.
diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c
index 97e96d5b9fb..3951262486e 100644
--- a/src/backend/utils/adt/pg_locale_icu.c
+++ b/src/backend/utils/adt/pg_locale_icu.c
@@ -102,6 +102,43 @@ static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity,
 									   const char *locale,
 									   UErrorCode *pErrorCode);
 
+static int
+char_properties_icu(pg_wchar wc, int mask, pg_locale_t locale)
+{
+	int result = 0;
+
+	if ((mask & PG_ISDIGIT) && u_isdigit(wc))
+		result |= PG_ISDIGIT;
+	if ((mask & PG_ISALPHA) && u_isalpha(wc))
+		result |= PG_ISALPHA;
+	if ((mask & PG_ISUPPER) && u_isupper(wc))
+		result |= PG_ISUPPER;
+	if ((mask & PG_ISLOWER) && u_islower(wc))
+		result |= PG_ISLOWER;
+	if ((mask & PG_ISGRAPH) && u_isgraph(wc))
+		result |= PG_ISGRAPH;
+	if ((mask & PG_ISPRINT) && u_isprint(wc))
+		result |= PG_ISPRINT;
+	if ((mask & PG_ISPUNCT) && u_ispunct(wc))
+		result |= PG_ISPUNCT;
+	if ((mask & PG_ISSPACE) && u_isspace(wc))
+		result |= PG_ISSPACE;
+
+	return result;
+}
+
+static pg_wchar
+toupper_icu(pg_wchar wc, pg_locale_t locale)
+{
+	return u_toupper(wc);
+}
+
+static pg_wchar
+tolower_icu(pg_wchar wc, pg_locale_t locale)
+{
+	return u_tolower(wc);
+}
+
 static struct collate_methods collate_methods_icu = {
 	.strncoll = strncoll_icu,
 	.strnxfrm = strnxfrm_icu,
@@ -114,6 +151,12 @@ static struct casemap_methods casemap_methods_icu = {
 	.strtitle = strtitle_icu,
 	.strupper = strupper_icu,
 };
+
+static struct ctype_methods ctype_methods_icu = {
+	.char_properties = char_properties_icu,
+	.wc_toupper = toupper_icu,
+	.wc_tolower = tolower_icu,
+};
 #endif
 
 pg_locale_t
@@ -151,6 +194,7 @@ dat_create_locale_icu(HeapTuple dattuple)
 	result->ctype_is_c = false;
 	result->collate = &collate_methods_icu;
 	result->casemap = &casemap_methods_icu;
+	result->ctype = &ctype_methods_icu;
 
 	return result;
 #else
@@ -197,6 +241,7 @@ coll_create_locale_icu(HeapTuple colltuple, MemoryContext context)
 	result->ctype_is_c = false;
 	result->collate = &collate_methods_icu;
 	result->casemap = &casemap_methods_icu;
+	result->ctype = &ctype_methods_icu;
 
 	return result;
 #else
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 79828ab3524..6de87d6b948 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -61,6 +61,10 @@ static size_t strupper_libc(char *dest, size_t destsize,
 							const char *src, ssize_t srclen,
 							pg_locale_t locale);
 
+static int char_properties_libc(pg_wchar wc, int mask, pg_locale_t locale);
+static pg_wchar toupper_libc(pg_wchar wc, pg_locale_t locale);
+static pg_wchar tolower_libc(pg_wchar wc, pg_locale_t locale);
+
 static struct collate_methods collate_methods_libc = {
 	.strncoll = strncoll_libc,
 	.strnxfrm = strnxfrm_libc,
@@ -88,6 +92,12 @@ static struct casemap_methods casemap_methods_libc = {
 	.strupper = strupper_libc,
 };
 
+static struct ctype_methods ctype_methods_libc = {
+	.char_properties = char_properties_libc,
+	.wc_toupper = toupper_libc,
+	.wc_tolower = tolower_libc,
+};
+
 static size_t
 strlower_libc(char *dest, size_t destsize, const char *src, ssize_t srclen,
 			  pg_locale_t locale)
@@ -319,6 +329,8 @@ dat_create_locale_libc(HeapTuple dattuple)
 		result->collate = &collate_methods_libc;
 	if (!result->ctype_is_c)
 		result->casemap = &casemap_methods_libc;
+	if (!result->ctype_is_c)
+		result->ctype = &ctype_methods_libc;
 
 	return result;
 }
@@ -356,6 +368,8 @@ coll_create_locale_libc(HeapTuple colltuple, MemoryContext context)
 		result->collate = &collate_methods_libc;
 	if (!result->ctype_is_c)
 		result->casemap = &casemap_methods_libc;
+	if (!result->ctype_is_c)
+		result->ctype = &ctype_methods_libc;
 
 	return result;
 }
@@ -644,3 +658,158 @@ report_newlocale_failure(const char *localename)
 			  errdetail("The operating system could not find any locale data for the locale name \"%s\".",
 						localename) : 0)));
 }
+
+static int
+char_properties_libc(pg_wchar wc, int mask, pg_locale_t locale)
+{
+	int result = 0;
+
+	Assert(!locale->ctype_is_c);
+
+	if (mask & PG_ISDIGIT)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF))
+		{
+			if (iswdigit_l((wint_t) wc, locale->info.lt))
+				result |= PG_ISDIGIT;
+		}
+		else
+		{
+			if (wc <= (pg_wchar) UCHAR_MAX &&
+				isdigit_l((unsigned char) wc, locale->info.lt))
+				result |= PG_ISDIGIT;
+		}
+	}
+	if (mask & PG_ISALPHA)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF))
+		{
+			if (iswalpha_l((wint_t) wc, locale->info.lt))
+				result |= PG_ISALPHA;
+		}
+		else
+		{
+			if (wc <= (pg_wchar) UCHAR_MAX &&
+				isalpha_l((unsigned char) wc, locale->info.lt))
+				result |= PG_ISALPHA;
+		}
+	}
+	if (mask & PG_ISUPPER)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF))
+		{
+			if (iswupper_l((wint_t) wc, locale->info.lt))
+				result |= PG_ISUPPER;
+		}
+		else
+		{
+			if (wc <= (pg_wchar) UCHAR_MAX &&
+				isupper_l((unsigned char) wc, locale->info.lt))
+				result |= PG_ISUPPER;
+		}
+	}
+	if (mask & PG_ISLOWER)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF))
+		{
+			if (iswlower_l((wint_t) wc, locale->info.lt))
+				result |= PG_ISLOWER;
+		}
+		else
+		{
+			if (wc <= (pg_wchar) UCHAR_MAX &&
+				islower_l((unsigned char) wc, locale->info.lt))
+				result |= PG_ISLOWER;
+		}
+	}
+	if (mask & PG_ISGRAPH)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF))
+		{
+			if (iswgraph_l((wint_t) wc, locale->info.lt))
+				result |= PG_ISGRAPH;
+		}
+		else
+		{
+			if (wc <= (pg_wchar) UCHAR_MAX &&
+				isgraph_l((unsigned char) wc, locale->info.lt))
+				result |= PG_ISGRAPH;
+		}
+	}
+	if (mask & PG_ISPRINT)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF))
+		{
+			if (iswprint_l((wint_t) wc, locale->info.lt))
+				result |= PG_ISPRINT;
+		}
+		else
+		{
+			if (wc <= (pg_wchar) UCHAR_MAX &&
+				isprint_l((unsigned char) wc, locale->info.lt))
+				result |= PG_ISPRINT;
+		}
+	}
+	if (mask & PG_ISPUNCT)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF))
+		{
+			if (iswpunct_l((wint_t) wc, locale->info.lt))
+				result |= PG_ISPUNCT;
+		}
+		else
+		{
+			if (wc <= (pg_wchar) UCHAR_MAX &&
+				ispunct_l((unsigned char) wc, locale->info.lt))
+				result |= PG_ISPUNCT;
+		}
+	}
+	if (mask & PG_ISSPACE)
+	{
+		if (GetDatabaseEncoding() == PG_UTF8 &&
+			(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF))
+		{
+			if (iswspace_l((wint_t) wc, locale->info.lt))
+				result |= PG_ISSPACE;
+		}
+		else
+		{
+			if (wc <= (pg_wchar) UCHAR_MAX &&
+				isspace_l((unsigned char) wc, locale->info.lt))
+				result |= PG_ISSPACE;
+		}
+	}
+
+	return result;
+}
+
+static pg_wchar
+toupper_libc(pg_wchar wc, pg_locale_t locale)
+{
+	if (GetDatabaseEncoding() == PG_UTF8 &&
+		(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF))
+		return towupper_l((wint_t) wc, locale->info.lt);
+	else if (wc <= (pg_wchar) UCHAR_MAX)
+		return toupper_l((unsigned char) wc, locale->info.lt);
+	else
+		return wc;
+}
+
+static pg_wchar
+tolower_libc(pg_wchar wc, pg_locale_t locale)
+{
+	if (GetDatabaseEncoding() == PG_UTF8 &&
+		(sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF))
+		return towlower_l((wint_t) wc, locale->info.lt);
+	else if (wc <= (pg_wchar) UCHAR_MAX)
+		return tolower_l((unsigned char) wc, locale->info.lt);
+	else
+		return wc;
+}
diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h
index 4bd9e6de7a3..3e5f625f661 100644
--- a/src/include/utils/pg_locale.h
+++ b/src/include/utils/pg_locale.h
@@ -12,6 +12,8 @@
 #ifndef _PG_LOCALE_
 #define _PG_LOCALE_
 
+#include "mb/pg_wchar.h"
+
 #if defined(LOCALE_T_IN_XLOCALE) || defined(WCSTOMBS_L_IN_XLOCALE)
 #include <xlocale.h>
 #endif
@@ -19,6 +21,19 @@
 #include <unicode/ucol.h>
 #endif
 
+/*
+ * Character property bits for regular expressions (combinable into a mask).
+ */
+#define PG_ISDIGIT     0x01
+#define PG_ISALPHA     0x02
+#define PG_ISALNUM     (PG_ISDIGIT | PG_ISALPHA)
+#define PG_ISUPPER     0x04
+#define PG_ISLOWER     0x08
+#define PG_ISGRAPH     0x10
+#define PG_ISPRINT     0x20
+#define PG_ISPUNCT     0x40
+#define PG_ISSPACE     0x80
+
 #ifdef USE_ICU
 /*
  * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53.
@@ -107,6 +122,12 @@ struct casemap_methods
 							 pg_locale_t locale);
 };
 
+struct ctype_methods {
+	int (*char_properties) (pg_wchar wc, int mask, pg_locale_t locale);
+	pg_wchar (*wc_toupper) (pg_wchar wc, pg_locale_t locale);
+	pg_wchar (*wc_tolower) (pg_wchar wc, pg_locale_t locale);
+};
+
 /*
  * We use a discriminated union to hold either a locale_t or an ICU collator.
  * pg_locale_t is occasionally checked for truth, so make it a pointer.
@@ -132,6 +153,7 @@ struct pg_locale_struct
 
 	struct collate_methods *collate;	/* NULL if collate_is_c */
 	struct casemap_methods *casemap;	/* NULL if ctype_is_c */
+	struct ctype_methods *ctype;		/* NULL if ctype_is_c */
 
 	union
 	{
@@ -156,6 +178,7 @@ extern void init_database_collation(void);
 extern pg_locale_t pg_newlocale_from_collation(Oid collid);
 
 extern char *get_collation_actual_version(char collprovider, const char *collcollate);
+extern int char_properties(pg_wchar wc, int mask, pg_locale_t locale);
 extern size_t pg_strlower(char *dest, size_t destsize,
 						  const char *src, ssize_t srclen,
 						  pg_locale_t locale);
-- 
2.34.1

