C11: should we use char32_t for unicode code points?
Now that we're using C11, should we use char32_t for unicode code
points?
Right now, we use pg_wchar for two purposes:
1. to abstract away some problems with wchar_t on platforms where
it's 16 bits; and
2. to hold Unicode code point values
In UTF8, they are equivalent and can be freely cast back and forth,
but not necessarily in other encodings. That can be confusing in some
contexts. Attached is a patch to use char32_t for the second purpose.
Both are equivalent to uint32, so there's no functional change and no
actual typechecking, it's just for readability.
Is this helpful, or needless code churn?
Regards,
Jeff Davis
Attachments:
v1-0001-Use-C11-char32_t-for-Unicode-code-points.patch (text/x-patch; charset=UTF-8)
From b5b65eb496ff0365f8cde297c5486755e65fc4b1 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Tue, 21 Oct 2025 13:16:47 -0700
Subject: [PATCH v1] Use C11 char32_t for Unicode code points.
---
src/backend/parser/parser.c | 2 +-
src/backend/utils/adt/pg_locale_builtin.c | 44 ++++++++++-----
src/backend/utils/adt/varlena.c | 40 ++++++-------
src/backend/utils/mb/mbutils.c | 4 +-
src/common/saslprep.c | 48 ++++++++--------
src/common/unicode/case_test.c | 23 ++++----
src/common/unicode/category_test.c | 3 +-
.../unicode/generate-norm_test_table.pl | 4 +-
.../unicode/generate-unicode_case_table.pl | 7 +--
.../generate-unicode_category_table.pl | 8 +--
src/common/unicode/norm_test.c | 6 +-
src/common/unicode_case.c | 56 +++++++++----------
src/common/unicode_category.c | 50 ++++++++---------
src/common/unicode_norm.c | 56 +++++++++----------
src/fe_utils/mbprint.c | 2 +-
src/include/c.h | 5 ++
src/include/common/unicode_case.h | 10 ++--
src/include/common/unicode_case_table.h | 13 ++---
src/include/common/unicode_category.h | 46 ++++++++-------
src/include/common/unicode_category_table.h | 8 +--
src/include/common/unicode_norm.h | 6 +-
src/include/mb/pg_wchar.h | 16 +++---
src/tools/pgindent/typedefs.list | 1 +
23 files changed, 237 insertions(+), 221 deletions(-)
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
index 33a040506b4..88126626fb1 100644
--- a/src/backend/parser/parser.c
+++ b/src/backend/parser/parser.c
@@ -339,7 +339,7 @@ hexval(unsigned char c)
/* is Unicode code point acceptable? */
static void
-check_unicode_value(pg_wchar c)
+check_unicode_value(char32_t c)
{
if (!is_valid_unicode_codepoint(c))
ereport(ERROR,
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 3dc611b50e1..1021e0d129b 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -15,7 +15,6 @@
#include "catalog/pg_collation.h"
#include "common/unicode_case.h"
#include "common/unicode_category.h"
-#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "utils/builtins.h"
#include "utils/pg_locale.h"
@@ -35,6 +34,23 @@ struct WordBoundaryState
bool prev_alnum;
};
+/*
+ * In UTF-8, pg_wchar is guaranteed to be the code point value.
+ */
+static inline char32_t
+to_char32(pg_wchar wc)
+{
+ Assert(GetDatabaseEncoding() == PG_UTF8);
+ return (char32_t) wc;
+}
+
+static inline pg_wchar
+to_pg_wchar(char32_t c32)
+{
+ Assert(GetDatabaseEncoding() == PG_UTF8);
+ return (pg_wchar) c32;
+}
+
/*
* Simple word boundary iterator that draws boundaries each time the result of
* pg_u_isalnum() changes.
@@ -47,7 +63,7 @@ initcap_wbnext(void *state)
while (wbstate->offset < wbstate->len &&
wbstate->str[wbstate->offset] != '\0')
{
- pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
+ char32_t u = utf8_to_unicode((unsigned char *) wbstate->str +
wbstate->offset);
bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
@@ -112,61 +128,61 @@ strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
static bool
wc_isdigit_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isdigit(wc, !locale->builtin.casemap_full);
+ return pg_u_isdigit(to_char32(wc), !locale->builtin.casemap_full);
}
static bool
wc_isalpha_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isalpha(wc);
+ return pg_u_isalpha(to_char32(wc));
}
static bool
wc_isalnum_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isalnum(wc, !locale->builtin.casemap_full);
+ return pg_u_isalnum(to_char32(wc), !locale->builtin.casemap_full);
}
static bool
wc_isupper_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isupper(wc);
+ return pg_u_isupper(to_char32(wc));
}
static bool
wc_islower_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_islower(wc);
+ return pg_u_islower(to_char32(wc));
}
static bool
wc_isgraph_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isgraph(wc);
+ return pg_u_isgraph(to_char32(wc));
}
static bool
wc_isprint_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isprint(wc);
+ return pg_u_isprint(to_char32(wc));
}
static bool
wc_ispunct_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_ispunct(wc, !locale->builtin.casemap_full);
+ return pg_u_ispunct(to_char32(wc), !locale->builtin.casemap_full);
}
static bool
wc_isspace_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isspace(wc);
+ return pg_u_isspace(to_char32(wc));
}
static bool
wc_isxdigit_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isxdigit(wc, !locale->builtin.casemap_full);
+ return pg_u_isxdigit(to_char32(wc), !locale->builtin.casemap_full);
}
static bool
@@ -179,13 +195,13 @@ char_is_cased_builtin(char ch, pg_locale_t locale)
static pg_wchar
wc_toupper_builtin(pg_wchar wc, pg_locale_t locale)
{
- return unicode_uppercase_simple(wc);
+ return to_pg_wchar(unicode_uppercase_simple(to_char32(wc)));
}
static pg_wchar
wc_tolower_builtin(pg_wchar wc, pg_locale_t locale)
{
- return unicode_lowercase_simple(wc);
+ return to_pg_wchar(unicode_lowercase_simple(to_char32(wc)));
}
static const struct ctype_methods ctype_methods_builtin = {
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 2c398cd9e5c..fa1a975cab9 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -5419,12 +5419,12 @@ unicode_assigned(PG_FUNCTION_ARGS)
ereport(ERROR,
(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
- /* convert to pg_wchar */
+ /* convert to char32_t */
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
p = (unsigned char *) VARDATA_ANY(input);
for (int i = 0; i < size; i++)
{
- pg_wchar uchar = utf8_to_unicode(p);
+ char32_t uchar = utf8_to_unicode(p);
int category = unicode_category(uchar);
if (category == PG_U_UNASSIGNED)
@@ -5443,24 +5443,24 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
UnicodeNormalizationForm form;
int size;
- pg_wchar *input_chars;
- pg_wchar *output_chars;
+ char32_t *input_chars;
+ char32_t *output_chars;
unsigned char *p;
text *result;
int i;
form = unicode_norm_form_from_string(formstr);
- /* convert to pg_wchar */
+ /* convert to char32_t */
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
- input_chars = palloc((size + 1) * sizeof(pg_wchar));
+ input_chars = palloc((size + 1) * sizeof(char32_t));
p = (unsigned char *) VARDATA_ANY(input);
for (i = 0; i < size; i++)
{
input_chars[i] = utf8_to_unicode(p);
p += pg_utf_mblen(p);
}
- input_chars[i] = (pg_wchar) '\0';
+ input_chars[i] = (char32_t) '\0';
Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
/* action */
@@ -5468,7 +5468,7 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
/* convert back to UTF-8 string */
size = 0;
- for (pg_wchar *wp = output_chars; *wp; wp++)
+ for (char32_t *wp = output_chars; *wp; wp++)
{
unsigned char buf[4];
@@ -5480,7 +5480,7 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
SET_VARSIZE(result, size + VARHDRSZ);
p = (unsigned char *) VARDATA_ANY(result);
- for (pg_wchar *wp = output_chars; *wp; wp++)
+ for (char32_t *wp = output_chars; *wp; wp++)
{
unicode_to_utf8(*wp, p);
p += pg_utf_mblen(p);
@@ -5509,8 +5509,8 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
UnicodeNormalizationForm form;
int size;
- pg_wchar *input_chars;
- pg_wchar *output_chars;
+ char32_t *input_chars;
+ char32_t *output_chars;
unsigned char *p;
int i;
UnicodeNormalizationQC quickcheck;
@@ -5519,16 +5519,16 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
form = unicode_norm_form_from_string(formstr);
- /* convert to pg_wchar */
+ /* convert to char32_t */
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
- input_chars = palloc((size + 1) * sizeof(pg_wchar));
+ input_chars = palloc((size + 1) * sizeof(char32_t));
p = (unsigned char *) VARDATA_ANY(input);
for (i = 0; i < size; i++)
{
input_chars[i] = utf8_to_unicode(p);
p += pg_utf_mblen(p);
}
- input_chars[i] = (pg_wchar) '\0';
+ input_chars[i] = (char32_t) '\0';
Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
/* quick check (see UAX #15) */
@@ -5542,11 +5542,11 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
output_chars = unicode_normalize(form, input_chars);
output_size = 0;
- for (pg_wchar *wp = output_chars; *wp; wp++)
+ for (char32_t *wp = output_chars; *wp; wp++)
output_size++;
result = (size == output_size) &&
- (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
+ (memcmp(input_chars, output_chars, size * sizeof(char32_t)) == 0);
PG_RETURN_BOOL(result);
}
@@ -5602,7 +5602,7 @@ unistr(PG_FUNCTION_ARGS)
int len;
StringInfoData str;
text *result;
- pg_wchar pair_first = 0;
+ char32_t pair_first = 0;
char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
instr = VARDATA_ANY(input_text);
@@ -5626,7 +5626,7 @@ unistr(PG_FUNCTION_ARGS)
else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
(len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
{
- pg_wchar unicode;
+ char32_t unicode;
int offset = instr[1] == 'u' ? 2 : 1;
unicode = hexval_n(instr + offset, 4);
@@ -5662,7 +5662,7 @@ unistr(PG_FUNCTION_ARGS)
}
else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
{
- pg_wchar unicode;
+ char32_t unicode;
unicode = hexval_n(instr + 2, 6);
@@ -5697,7 +5697,7 @@ unistr(PG_FUNCTION_ARGS)
}
else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
{
- pg_wchar unicode;
+ char32_t unicode;
unicode = hexval_n(instr + 2, 8);
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index 886ecbad871..fb629ed5c8f 100644
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -862,7 +862,7 @@ perform_default_encoding_conversion(const char *src, int len,
* may call this outside any transaction, or in an aborted transaction.
*/
void
-pg_unicode_to_server(pg_wchar c, unsigned char *s)
+pg_unicode_to_server(char32_t c, unsigned char *s)
{
unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
int c_as_utf8_len;
@@ -924,7 +924,7 @@ pg_unicode_to_server(pg_wchar c, unsigned char *s)
* but simply return false on conversion failure.
*/
bool
-pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s)
+pg_unicode_to_server_noerror(char32_t c, unsigned char *s)
{
unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
int c_as_utf8_len;
diff --git a/src/common/saslprep.c b/src/common/saslprep.c
index 97beb47940b..101e8d65a4d 100644
--- a/src/common/saslprep.c
+++ b/src/common/saslprep.c
@@ -47,7 +47,7 @@
/* Prototypes for local functions */
static int codepoint_range_cmp(const void *a, const void *b);
-static bool is_code_in_table(pg_wchar code, const pg_wchar *map, int mapsize);
+static bool is_code_in_table(char32_t code, const char32_t *map, int mapsize);
static int pg_utf8_string_len(const char *source);
/*
@@ -64,7 +64,7 @@ static int pg_utf8_string_len(const char *source);
*
* These are all mapped to the ASCII space character (U+00A0).
*/
-static const pg_wchar non_ascii_space_ranges[] =
+static const char32_t non_ascii_space_ranges[] =
{
0x00A0, 0x00A0,
0x1680, 0x1680,
@@ -79,7 +79,7 @@ static const pg_wchar non_ascii_space_ranges[] =
*
* If any of these appear in the input, they are removed.
*/
-static const pg_wchar commonly_mapped_to_nothing_ranges[] =
+static const char32_t commonly_mapped_to_nothing_ranges[] =
{
0x00AD, 0x00AD,
0x034F, 0x034F,
@@ -114,7 +114,7 @@ static const pg_wchar commonly_mapped_to_nothing_ranges[] =
* tables, so one code might originate from multiple source tables.
* Adjacent ranges have also been merged together, to save space.
*/
-static const pg_wchar prohibited_output_ranges[] =
+static const char32_t prohibited_output_ranges[] =
{
0x0000, 0x001F, /* C.2.1 */
0x007F, 0x00A0, /* C.1.2, C.2.1, C.2.2 */
@@ -155,7 +155,7 @@ static const pg_wchar prohibited_output_ranges[] =
};
/* A.1 Unassigned code points in Unicode 3.2 */
-static const pg_wchar unassigned_codepoint_ranges[] =
+static const char32_t unassigned_codepoint_ranges[] =
{
0x0221, 0x0221,
0x0234, 0x024F,
@@ -556,7 +556,7 @@ static const pg_wchar unassigned_codepoint_ranges[] =
};
/* D.1 Characters with bidirectional property "R" or "AL" */
-static const pg_wchar RandALCat_codepoint_ranges[] =
+static const char32_t RandALCat_codepoint_ranges[] =
{
0x05BE, 0x05BE,
0x05C0, 0x05C0,
@@ -595,7 +595,7 @@ static const pg_wchar RandALCat_codepoint_ranges[] =
};
/* D.2 Characters with bidirectional property "L" */
-static const pg_wchar LCat_codepoint_ranges[] =
+static const char32_t LCat_codepoint_ranges[] =
{
0x0041, 0x005A,
0x0061, 0x007A,
@@ -968,8 +968,8 @@ static const pg_wchar LCat_codepoint_ranges[] =
static int
codepoint_range_cmp(const void *a, const void *b)
{
- const pg_wchar *key = (const pg_wchar *) a;
- const pg_wchar *range = (const pg_wchar *) b;
+ const char32_t *key = (const char32_t *) a;
+ const char32_t *range = (const char32_t *) b;
if (*key < range[0])
return -1; /* less than lower bound */
@@ -980,14 +980,14 @@ codepoint_range_cmp(const void *a, const void *b)
}
static bool
-is_code_in_table(pg_wchar code, const pg_wchar *map, int mapsize)
+is_code_in_table(char32_t code, const char32_t *map, int mapsize)
{
Assert(mapsize % 2 == 0);
if (code < map[0] || code > map[mapsize - 1])
return false;
- if (bsearch(&code, map, mapsize / 2, sizeof(pg_wchar) * 2,
+ if (bsearch(&code, map, mapsize / 2, sizeof(char32_t) * 2,
codepoint_range_cmp))
return true;
else
@@ -1046,8 +1046,8 @@ pg_utf8_string_len(const char *source)
pg_saslprep_rc
pg_saslprep(const char *input, char **output)
{
- pg_wchar *input_chars = NULL;
- pg_wchar *output_chars = NULL;
+ char32_t *input_chars = NULL;
+ char32_t *output_chars = NULL;
int input_size;
char *result;
int result_size;
@@ -1055,7 +1055,7 @@ pg_saslprep(const char *input, char **output)
int i;
bool contains_RandALCat;
unsigned char *p;
- pg_wchar *wp;
+ char32_t *wp;
/* Ensure we return *output as NULL on failure */
*output = NULL;
@@ -1080,10 +1080,10 @@ pg_saslprep(const char *input, char **output)
input_size = pg_utf8_string_len(input);
if (input_size < 0)
return SASLPREP_INVALID_UTF8;
- if (input_size >= MaxAllocSize / sizeof(pg_wchar))
+ if (input_size >= MaxAllocSize / sizeof(char32_t))
goto oom;
- input_chars = ALLOC((input_size + 1) * sizeof(pg_wchar));
+ input_chars = ALLOC((input_size + 1) * sizeof(char32_t));
if (!input_chars)
goto oom;
@@ -1093,7 +1093,7 @@ pg_saslprep(const char *input, char **output)
input_chars[i] = utf8_to_unicode(p);
p += pg_utf_mblen(p);
}
- input_chars[i] = (pg_wchar) '\0';
+ input_chars[i] = (char32_t) '\0';
/*
* The steps below correspond to the steps listed in [RFC3454], Section
@@ -1107,7 +1107,7 @@ pg_saslprep(const char *input, char **output)
count = 0;
for (i = 0; i < input_size; i++)
{
- pg_wchar code = input_chars[i];
+ char32_t code = input_chars[i];
if (IS_CODE_IN_TABLE(code, non_ascii_space_ranges))
input_chars[count++] = 0x0020;
@@ -1118,7 +1118,7 @@ pg_saslprep(const char *input, char **output)
else
input_chars[count++] = code;
}
- input_chars[count] = (pg_wchar) '\0';
+ input_chars[count] = (char32_t) '\0';
input_size = count;
if (input_size == 0)
@@ -1138,7 +1138,7 @@ pg_saslprep(const char *input, char **output)
*/
for (i = 0; i < input_size; i++)
{
- pg_wchar code = input_chars[i];
+ char32_t code = input_chars[i];
if (IS_CODE_IN_TABLE(code, prohibited_output_ranges))
goto prohibited;
@@ -1170,7 +1170,7 @@ pg_saslprep(const char *input, char **output)
contains_RandALCat = false;
for (i = 0; i < input_size; i++)
{
- pg_wchar code = input_chars[i];
+ char32_t code = input_chars[i];
if (IS_CODE_IN_TABLE(code, RandALCat_codepoint_ranges))
{
@@ -1181,12 +1181,12 @@ pg_saslprep(const char *input, char **output)
if (contains_RandALCat)
{
- pg_wchar first = input_chars[0];
- pg_wchar last = input_chars[input_size - 1];
+ char32_t first = input_chars[0];
+ char32_t last = input_chars[input_size - 1];
for (i = 0; i < input_size; i++)
{
- pg_wchar code = input_chars[i];
+ char32_t code = input_chars[i];
if (IS_CODE_IN_TABLE(code, LCat_codepoint_ranges))
goto prohibited;
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index fdfb62e8552..00d4f85e5a5 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -24,6 +24,7 @@
#include "common/unicode_case.h"
#include "common/unicode_category.h"
#include "common/unicode_version.h"
+#include "mb/pg_wchar.h"
/* enough to hold largest source or result string, including NUL */
#define BUFSZ 256
@@ -54,7 +55,7 @@ initcap_wbnext(void *state)
while (wbstate->offset < wbstate->len &&
wbstate->str[wbstate->offset] != '\0')
{
- pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
+ char32_t u = utf8_to_unicode((unsigned char *) wbstate->str +
wbstate->offset);
bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
@@ -77,16 +78,16 @@ initcap_wbnext(void *state)
#ifdef USE_ICU
static void
-icu_test_simple(pg_wchar code)
+icu_test_simple(char32_t code)
{
- pg_wchar lower = unicode_lowercase_simple(code);
- pg_wchar title = unicode_titlecase_simple(code);
- pg_wchar upper = unicode_uppercase_simple(code);
- pg_wchar fold = unicode_casefold_simple(code);
- pg_wchar iculower = u_tolower(code);
- pg_wchar icutitle = u_totitle(code);
- pg_wchar icuupper = u_toupper(code);
- pg_wchar icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
+ char32_t lower = unicode_lowercase_simple(code);
+ char32_t title = unicode_titlecase_simple(code);
+ char32_t upper = unicode_uppercase_simple(code);
+ char32_t fold = unicode_casefold_simple(code);
+ char32_t iculower = u_tolower(code);
+ char32_t icutitle = u_totitle(code);
+ char32_t icuupper = u_toupper(code);
+ char32_t icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
if (lower != iculower || title != icutitle || upper != icuupper ||
fold != icufold)
@@ -172,7 +173,7 @@ test_icu(void)
int successful = 0;
int skipped_mismatch = 0;
- for (pg_wchar code = 0; code <= 0x10ffff; code++)
+ for (char32_t code = 0; code <= 0x10ffff; code++)
{
pg_unicode_category category = unicode_category(code);
diff --git a/src/common/unicode/category_test.c b/src/common/unicode/category_test.c
index 5d37ba39196..1e8c1f7905f 100644
--- a/src/common/unicode/category_test.c
+++ b/src/common/unicode/category_test.c
@@ -22,6 +22,7 @@
#include "common/unicode_category.h"
#include "common/unicode_version.h"
+#include "mb/pg_wchar.h"
static int pg_unicode_version = 0;
#ifdef USE_ICU
@@ -59,7 +60,7 @@ icu_test()
int pg_skipped_codepoints = 0;
int icu_skipped_codepoints = 0;
- for (pg_wchar code = 0; code <= 0x10ffff; code++)
+ for (char32_t code = 0; code <= 0x10ffff; code++)
{
uint8_t pg_category = unicode_category(code);
uint8_t icu_category = u_charType(code);
diff --git a/src/common/unicode/generate-norm_test_table.pl b/src/common/unicode/generate-norm_test_table.pl
index 1b401be9409..1a8b908ff33 100644
--- a/src/common/unicode/generate-norm_test_table.pl
+++ b/src/common/unicode/generate-norm_test_table.pl
@@ -47,8 +47,8 @@ print $OUTPUT <<HEADER;
typedef struct
{
int linenum;
- pg_wchar input[50];
- pg_wchar output[4][50];
+ char32_t input[50];
+ char32_t output[4][50];
} pg_unicode_test;
/* test table */
diff --git a/src/common/unicode/generate-unicode_case_table.pl b/src/common/unicode/generate-unicode_case_table.pl
index 5d9ddd62803..f71eb25c94e 100644
--- a/src/common/unicode/generate-unicode_case_table.pl
+++ b/src/common/unicode/generate-unicode_case_table.pl
@@ -270,7 +270,6 @@ print $OT <<"EOS";
*/
#include "common/unicode_case.h"
-#include "mb/pg_wchar.h"
/*
* The maximum number of codepoints that can result from case mapping
@@ -297,7 +296,7 @@ typedef enum
typedef struct
{
int16 conditions;
- pg_wchar map[NCaseKind][MAX_CASE_EXPANSION];
+ char32_t map[NCaseKind][MAX_CASE_EXPANSION];
} pg_special_case;
/*
@@ -430,7 +429,7 @@ foreach my $kind ('lower', 'title', 'upper', 'fold')
* The entry case_map_${kind}[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_$kind\[$index\] =
+static const char32_t case_map_$kind\[$index\] =
{
EOS
@@ -502,7 +501,7 @@ print $OT <<"EOS";
* the offset into the mapping tables.
*/
static inline uint16
-case_index(pg_wchar cp)
+case_index(char32_t cp)
{
/* Fast path for codepoints < $fastpath_limit */
if (cp < $fastpath_limit)
diff --git a/src/common/unicode/generate-unicode_category_table.pl b/src/common/unicode/generate-unicode_category_table.pl
index abab5cd9696..7e094b13720 100644
--- a/src/common/unicode/generate-unicode_category_table.pl
+++ b/src/common/unicode/generate-unicode_category_table.pl
@@ -366,15 +366,15 @@ print $OT <<"EOS";
*/
typedef struct
{
- uint32 first; /* Unicode codepoint */
- uint32 last; /* Unicode codepoint */
+ char32_t first; /* Unicode codepoint */
+ char32_t last; /* Unicode codepoint */
uint8 category; /* General Category */
} pg_category_range;
typedef struct
{
- uint32 first; /* Unicode codepoint */
- uint32 last; /* Unicode codepoint */
+ char32_t first; /* Unicode codepoint */
+ char32_t last; /* Unicode codepoint */
} pg_unicode_range;
typedef struct
diff --git a/src/common/unicode/norm_test.c b/src/common/unicode/norm_test.c
index 25bc59463f2..058817f1719 100644
--- a/src/common/unicode/norm_test.c
+++ b/src/common/unicode/norm_test.c
@@ -20,7 +20,7 @@
#include "norm_test_table.h"
static char *
-print_wchar_str(const pg_wchar *s)
+print_wchar_str(const char32_t *s)
{
#define BUF_DIGITS 50
static char buf[BUF_DIGITS * 11 + 1];
@@ -41,7 +41,7 @@ print_wchar_str(const pg_wchar *s)
}
static int
-pg_wcscmp(const pg_wchar *s1, const pg_wchar *s2)
+pg_wcscmp(const char32_t *s1, const char32_t *s2)
{
for (;;)
{
@@ -65,7 +65,7 @@ main(int argc, char **argv)
{
for (int form = 0; form < 4; form++)
{
- pg_wchar *result;
+ char32_t *result;
result = unicode_normalize(form, test->input);
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index 073faf6a0d5..e5e494db43c 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -30,7 +30,7 @@ enum CaseMapResult
/*
* Map for each case kind.
*/
-static const pg_wchar *const casekind_map[NCaseKind] =
+static const char32_t *const casekind_map[NCaseKind] =
{
[CaseLower] = case_map_lower,
[CaseTitle] = case_map_title,
@@ -38,42 +38,42 @@ static const pg_wchar *const casekind_map[NCaseKind] =
[CaseFold] = case_map_fold,
};
-static pg_wchar find_case_map(pg_wchar ucs, const pg_wchar *map);
+static char32_t find_case_map(char32_t ucs, const char32_t *map);
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
void *wbstate);
-static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
+static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full,
const char *src, size_t srclen, size_t srcoff,
- pg_wchar *simple, const pg_wchar **special);
+ char32_t *simple, const char32_t **special);
-pg_wchar
-unicode_lowercase_simple(pg_wchar code)
+char32_t
+unicode_lowercase_simple(char32_t code)
{
- pg_wchar cp = find_case_map(code, case_map_lower);
+ char32_t cp = find_case_map(code, case_map_lower);
return cp != 0 ? cp : code;
}
-pg_wchar
-unicode_titlecase_simple(pg_wchar code)
+char32_t
+unicode_titlecase_simple(char32_t code)
{
- pg_wchar cp = find_case_map(code, case_map_title);
+ char32_t cp = find_case_map(code, case_map_title);
return cp != 0 ? cp : code;
}
-pg_wchar
-unicode_uppercase_simple(pg_wchar code)
+char32_t
+unicode_uppercase_simple(char32_t code)
{
- pg_wchar cp = find_case_map(code, case_map_upper);
+ char32_t cp = find_case_map(code, case_map_upper);
return cp != 0 ? cp : code;
}
-pg_wchar
-unicode_casefold_simple(pg_wchar code)
+char32_t
+unicode_casefold_simple(char32_t code)
{
- pg_wchar cp = find_case_map(code, case_map_fold);
+ char32_t cp = find_case_map(code, case_map_fold);
return cp != 0 ? cp : code;
}
@@ -231,10 +231,10 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
{
- pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
+ char32_t u1 = utf8_to_unicode((unsigned char *) src + srcoff);
int u1len = unicode_utf8len(u1);
- pg_wchar simple = 0;
- const pg_wchar *special = NULL;
+ char32_t simple = 0;
+ const char32_t *special = NULL;
enum CaseMapResult casemap_result;
if (str_casekind == CaseTitle)
@@ -265,8 +265,8 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
case CASEMAP_SIMPLE:
{
/* replace with single character */
- pg_wchar u2 = simple;
- pg_wchar u2len = unicode_utf8len(u2);
+ char32_t u2 = simple;
+ char32_t u2len = unicode_utf8len(u2);
Assert(special == NULL);
if (result_len + u2len <= dstsize)
@@ -280,7 +280,7 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
Assert(simple == 0);
for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
{
- pg_wchar u2 = special[i];
+ char32_t u2 = special[i];
size_t u2len = unicode_utf8len(u2);
if (result_len + u2len <= dstsize)
@@ -320,7 +320,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
{
- pg_wchar curr = utf8_to_unicode(str + i);
+ char32_t curr = utf8_to_unicode(str + i);
if (pg_u_prop_case_ignorable(curr))
continue;
@@ -344,7 +344,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
{
- pg_wchar curr = utf8_to_unicode(str + i);
+ char32_t curr = utf8_to_unicode(str + i);
if (pg_u_prop_case_ignorable(curr))
continue;
@@ -394,9 +394,9 @@ check_special_conditions(int conditions, const char *str, size_t len,
* character without modification.
*/
static enum CaseMapResult
-casemap(pg_wchar u1, CaseKind casekind, bool full,
+casemap(char32_t u1, CaseKind casekind, bool full,
const char *src, size_t srclen, size_t srcoff,
- pg_wchar *simple, const pg_wchar **special)
+ char32_t *simple, const char32_t **special)
{
uint16 idx;
@@ -434,8 +434,8 @@ casemap(pg_wchar u1, CaseKind casekind, bool full,
* Find entry in simple case map.
* If the entry does not exist, 0 will be returned.
*/
-static pg_wchar
-find_case_map(pg_wchar ucs, const pg_wchar *map)
+static char32_t
+find_case_map(char32_t ucs, const char32_t *map)
{
/* Fast path for codepoints < 0x80 */
if (ucs < 0x80)
diff --git a/src/common/unicode_category.c b/src/common/unicode_category.c
index 4136c4d4f92..aab667a7bb4 100644
--- a/src/common/unicode_category.c
+++ b/src/common/unicode_category.c
@@ -1,7 +1,7 @@
/*-------------------------------------------------------------------------
* unicode_category.c
* Determine general category and character properties of Unicode
- * characters. Encoding must be UTF8, where we assume that the pg_wchar
+ * characters. Encoding must be UTF8, where we assume that the char32_t
* representation is a code point.
*
* Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
@@ -76,13 +76,13 @@
#define PG_U_CHARACTER_TAB 0x09
static bool range_search(const pg_unicode_range *tbl, size_t size,
- pg_wchar code);
+ char32_t code);
/*
* Unicode general category for the given codepoint.
*/
pg_unicode_category
-unicode_category(pg_wchar code)
+unicode_category(char32_t code)
{
int min = 0;
int mid;
@@ -108,7 +108,7 @@ unicode_category(pg_wchar code)
}
bool
-pg_u_prop_alphabetic(pg_wchar code)
+pg_u_prop_alphabetic(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_ALPHABETIC;
@@ -119,7 +119,7 @@ pg_u_prop_alphabetic(pg_wchar code)
}
bool
-pg_u_prop_lowercase(pg_wchar code)
+pg_u_prop_lowercase(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_LOWERCASE;
@@ -130,7 +130,7 @@ pg_u_prop_lowercase(pg_wchar code)
}
bool
-pg_u_prop_uppercase(pg_wchar code)
+pg_u_prop_uppercase(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_UPPERCASE;
@@ -141,7 +141,7 @@ pg_u_prop_uppercase(pg_wchar code)
}
bool
-pg_u_prop_cased(pg_wchar code)
+pg_u_prop_cased(char32_t code)
{
uint32 category_mask;
@@ -156,7 +156,7 @@ pg_u_prop_cased(pg_wchar code)
}
bool
-pg_u_prop_case_ignorable(pg_wchar code)
+pg_u_prop_case_ignorable(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_CASE_IGNORABLE;
@@ -167,7 +167,7 @@ pg_u_prop_case_ignorable(pg_wchar code)
}
bool
-pg_u_prop_white_space(pg_wchar code)
+pg_u_prop_white_space(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_WHITE_SPACE;
@@ -178,7 +178,7 @@ pg_u_prop_white_space(pg_wchar code)
}
bool
-pg_u_prop_hex_digit(pg_wchar code)
+pg_u_prop_hex_digit(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_HEX_DIGIT;
@@ -189,7 +189,7 @@ pg_u_prop_hex_digit(pg_wchar code)
}
bool
-pg_u_prop_join_control(pg_wchar code)
+pg_u_prop_join_control(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_JOIN_CONTROL;
@@ -208,7 +208,7 @@ pg_u_prop_join_control(pg_wchar code)
*/
bool
-pg_u_isdigit(pg_wchar code, bool posix)
+pg_u_isdigit(char32_t code, bool posix)
{
if (posix)
return ('0' <= code && code <= '9');
@@ -217,19 +217,19 @@ pg_u_isdigit(pg_wchar code, bool posix)
}
bool
-pg_u_isalpha(pg_wchar code)
+pg_u_isalpha(char32_t code)
{
return pg_u_prop_alphabetic(code);
}
bool
-pg_u_isalnum(pg_wchar code, bool posix)
+pg_u_isalnum(char32_t code, bool posix)
{
return pg_u_isalpha(code) || pg_u_isdigit(code, posix);
}
bool
-pg_u_isword(pg_wchar code)
+pg_u_isword(char32_t code)
{
uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
@@ -240,32 +240,32 @@ pg_u_isword(pg_wchar code)
}
bool
-pg_u_isupper(pg_wchar code)
+pg_u_isupper(char32_t code)
{
return pg_u_prop_uppercase(code);
}
bool
-pg_u_islower(pg_wchar code)
+pg_u_islower(char32_t code)
{
return pg_u_prop_lowercase(code);
}
bool
-pg_u_isblank(pg_wchar code)
+pg_u_isblank(char32_t code)
{
return code == PG_U_CHARACTER_TAB ||
unicode_category(code) == PG_U_SPACE_SEPARATOR;
}
bool
-pg_u_iscntrl(pg_wchar code)
+pg_u_iscntrl(char32_t code)
{
return unicode_category(code) == PG_U_CONTROL;
}
bool
-pg_u_isgraph(pg_wchar code)
+pg_u_isgraph(char32_t code)
{
uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
@@ -276,7 +276,7 @@ pg_u_isgraph(pg_wchar code)
}
bool
-pg_u_isprint(pg_wchar code)
+pg_u_isprint(char32_t code)
{
pg_unicode_category category = unicode_category(code);
@@ -287,7 +287,7 @@ pg_u_isprint(pg_wchar code)
}
bool
-pg_u_ispunct(pg_wchar code, bool posix)
+pg_u_ispunct(char32_t code, bool posix)
{
uint32 category_mask;
@@ -308,13 +308,13 @@ pg_u_ispunct(pg_wchar code, bool posix)
}
bool
-pg_u_isspace(pg_wchar code)
+pg_u_isspace(char32_t code)
{
return pg_u_prop_white_space(code);
}
bool
-pg_u_isxdigit(pg_wchar code, bool posix)
+pg_u_isxdigit(char32_t code, bool posix)
{
if (posix)
return (('0' <= code && code <= '9') ||
@@ -478,7 +478,7 @@ unicode_category_abbrev(pg_unicode_category category)
* given table.
*/
static bool
-range_search(const pg_unicode_range *tbl, size_t size, pg_wchar code)
+range_search(const pg_unicode_range *tbl, size_t size, char32_t code)
{
int min = 0;
int mid;
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c
index 6654b4cbc49..489d99cd5ab 100644
--- a/src/common/unicode_norm.c
+++ b/src/common/unicode_norm.c
@@ -69,7 +69,7 @@ conv_compare(const void *p1, const void *p2)
* lookup, while the frontend version uses a binary search.
*/
static const pg_unicode_decomposition *
-get_code_entry(pg_wchar code)
+get_code_entry(char32_t code)
{
#ifndef FRONTEND
int h;
@@ -109,7 +109,7 @@ get_code_entry(pg_wchar code)
* Get the combining class of the given codepoint.
*/
static uint8
-get_canonical_class(pg_wchar code)
+get_canonical_class(char32_t code)
{
const pg_unicode_decomposition *entry = get_code_entry(code);
@@ -130,15 +130,15 @@ get_canonical_class(pg_wchar code)
* Note: the returned pointer can point to statically allocated buffer, and
* is only valid until next call to this function!
*/
-static const pg_wchar *
+static const char32_t *
get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
{
- static pg_wchar x;
+ static char32_t x;
if (DECOMPOSITION_IS_INLINE(entry))
{
Assert(DECOMPOSITION_SIZE(entry) == 1);
- x = (pg_wchar) entry->dec_index;
+ x = (char32_t) entry->dec_index;
*dec_size = 1;
return &x;
}
@@ -156,7 +156,7 @@ get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
* are, in turn, decomposable.
*/
static int
-get_decomposed_size(pg_wchar code, bool compat)
+get_decomposed_size(char32_t code, bool compat)
{
const pg_unicode_decomposition *entry;
int size = 0;
@@ -318,7 +318,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
* in the array result.
*/
static void
-decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
+decompose_code(char32_t code, bool compat, char32_t **result, int *current)
{
const pg_unicode_decomposition *entry;
int i;
@@ -337,7 +337,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
v,
tindex,
sindex;
- pg_wchar *res = *result;
+ char32_t *res = *result;
sindex = code - SBASE;
l = LBASE + sindex / (VCOUNT * TCOUNT);
@@ -369,7 +369,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
(!compat && DECOMPOSITION_IS_COMPAT(entry)))
{
- pg_wchar *res = *result;
+ char32_t *res = *result;
res[*current] = code;
(*current)++;
@@ -382,7 +382,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
decomp = get_code_decomposition(entry, &dec_size);
for (i = 0; i < dec_size; i++)
{
- pg_wchar lcode = (pg_wchar) decomp[i];
+ char32_t lcode = (char32_t) decomp[i];
/* Leave if no more decompositions */
decompose_code(lcode, compat, result, current);
@@ -398,17 +398,17 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
* malloc. Or NULL if we run out of memory. In backend, the returned
* string is palloc'd instead, and OOM is reported with ereport().
*/
-pg_wchar *
-unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
+char32_t *
+unicode_normalize(UnicodeNormalizationForm form, const char32_t *input)
{
bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
- pg_wchar *decomp_chars;
- pg_wchar *recomp_chars;
+ char32_t *decomp_chars;
+ char32_t *recomp_chars;
int decomp_size,
current_size;
int count;
- const pg_wchar *p;
+ const char32_t *p;
/* variables for recomposition */
int last_class;
@@ -425,7 +425,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
for (p = input; *p; p++)
decomp_size += get_decomposed_size(*p, compat);
- decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
+ decomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
if (decomp_chars == NULL)
return NULL;
@@ -448,9 +448,9 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
*/
for (count = 1; count < decomp_size; count++)
{
- pg_wchar prev = decomp_chars[count - 1];
- pg_wchar next = decomp_chars[count];
- pg_wchar tmp;
+ char32_t prev = decomp_chars[count - 1];
+ char32_t next = decomp_chars[count];
+ char32_t tmp;
const uint8 prevClass = get_canonical_class(prev);
const uint8 nextClass = get_canonical_class(next);
@@ -487,7 +487,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
* longer than the decomposed one, so make the allocation of the output
* string based on that assumption.
*/
- recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
+ recomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
if (!recomp_chars)
{
FREE(decomp_chars);
@@ -501,9 +501,9 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
for (count = 1; count < decomp_size; count++)
{
- pg_wchar ch = decomp_chars[count];
+ char32_t ch = decomp_chars[count];
int ch_class = get_canonical_class(ch);
- pg_wchar composite;
+ char32_t composite;
if (last_class < ch_class &&
recompose_code(starter_ch, ch, &composite))
@@ -524,7 +524,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
recomp_chars[target_pos++] = ch;
}
}
- recomp_chars[target_pos] = (pg_wchar) '\0';
+ recomp_chars[target_pos] = (char32_t) '\0';
FREE(decomp_chars);
@@ -540,7 +540,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
#ifndef FRONTEND
static const pg_unicode_normprops *
-qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
+qc_hash_lookup(char32_t ch, const pg_unicode_norminfo *norminfo)
{
int h;
uint32 hashkey;
@@ -571,7 +571,7 @@ qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
* Look up the normalization quick check character property
*/
static UnicodeNormalizationQC
-qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
+qc_is_allowed(UnicodeNormalizationForm form, char32_t ch)
{
const pg_unicode_normprops *found = NULL;
@@ -595,7 +595,7 @@ qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
}
UnicodeNormalizationQC
-unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
+unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const char32_t *input)
{
uint8 lastCanonicalClass = 0;
UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
@@ -610,9 +610,9 @@ unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *
if (form == UNICODE_NFD || form == UNICODE_NFKD)
return UNICODE_NORM_QC_MAYBE;
- for (const pg_wchar *p = input; *p; p++)
+ for (const char32_t *p = input; *p; p++)
{
- pg_wchar ch = *p;
+ char32_t ch = *p;
uint8 canonicalClass;
UnicodeNormalizationQC check;
diff --git a/src/fe_utils/mbprint.c b/src/fe_utils/mbprint.c
index eb3eeee9925..8bfce1d4e07 100644
--- a/src/fe_utils/mbprint.c
+++ b/src/fe_utils/mbprint.c
@@ -49,7 +49,7 @@ pg_get_utf8_id(void)
*
* No error checks here, c must point to a long-enough string.
*/
-static pg_wchar
+static char32_t
utf8_to_unicode(const unsigned char *c)
{
if ((*c & 0x80) == 0)
diff --git a/src/include/c.h b/src/include/c.h
index 9ab5e617995..a2ee108fd16 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -513,6 +513,11 @@ typedef void (*pg_funcptr_t) (void);
#include <stdbool.h>
+/*
+ * char32_t
+ * Unicode code point.
+ */
+#include <uchar.h>
/* ----------------------------------------------------------------
* Section 3: standard system types
diff --git a/src/include/common/unicode_case.h b/src/include/common/unicode_case.h
index 41e2c1f4b33..6bcffd349c2 100644
--- a/src/include/common/unicode_case.h
+++ b/src/include/common/unicode_case.h
@@ -14,14 +14,12 @@
#ifndef UNICODE_CASE_H
#define UNICODE_CASE_H
-#include "mb/pg_wchar.h"
-
typedef size_t (*WordBoundaryNext) (void *wbstate);
-pg_wchar unicode_lowercase_simple(pg_wchar code);
-pg_wchar unicode_titlecase_simple(pg_wchar code);
-pg_wchar unicode_uppercase_simple(pg_wchar code);
-pg_wchar unicode_casefold_simple(pg_wchar code);
+char32_t unicode_lowercase_simple(char32_t code);
+char32_t unicode_titlecase_simple(char32_t code);
+char32_t unicode_uppercase_simple(char32_t code);
+char32_t unicode_casefold_simple(char32_t code);
size_t unicode_strlower(char *dst, size_t dstsize, const char *src,
ssize_t srclen, bool full);
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src,
diff --git a/src/include/common/unicode_case_table.h b/src/include/common/unicode_case_table.h
index d5311786582..0a14fb2d97b 100644
--- a/src/include/common/unicode_case_table.h
+++ b/src/include/common/unicode_case_table.h
@@ -18,7 +18,6 @@
*/
#include "common/unicode_case.h"
-#include "mb/pg_wchar.h"
/*
* The maximum number of codepoints that can result from case mapping
@@ -45,7 +44,7 @@ typedef enum
typedef struct
{
int16 conditions;
- pg_wchar map[NCaseKind][MAX_CASE_EXPANSION];
+ char32_t map[NCaseKind][MAX_CASE_EXPANSION];
} pg_special_case;
/*
@@ -166,7 +165,7 @@ static const pg_special_case special_case[106] =
* The entry case_map_lower[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_lower[1704] =
+static const char32_t case_map_lower[1704] =
{
0x000000, /* reserved */
0x000000, /* U+000000 */
@@ -1879,7 +1878,7 @@ static const pg_wchar case_map_lower[1704] =
* The entry case_map_title[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_title[1704] =
+static const char32_t case_map_title[1704] =
{
0x000000, /* reserved */
0x000000, /* U+000000 */
@@ -3592,7 +3591,7 @@ static const pg_wchar case_map_title[1704] =
* The entry case_map_upper[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_upper[1704] =
+static const char32_t case_map_upper[1704] =
{
0x000000, /* reserved */
0x000000, /* U+000000 */
@@ -5305,7 +5304,7 @@ static const pg_wchar case_map_upper[1704] =
* The entry case_map_fold[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_fold[1704] =
+static const char32_t case_map_fold[1704] =
{
0x000000, /* reserved */
0x000000, /* U+000000 */
@@ -13522,7 +13521,7 @@ static const uint16 case_map[4778] =
* the offset into the mapping tables.
*/
static inline uint16
-case_index(pg_wchar cp)
+case_index(char32_t cp)
{
/* Fast path for codepoints < 0x0588 */
if (cp < 0x0588)
diff --git a/src/include/common/unicode_category.h b/src/include/common/unicode_category.h
index 8fd8b67a416..684143d3c8a 100644
--- a/src/include/common/unicode_category.h
+++ b/src/include/common/unicode_category.h
@@ -14,8 +14,6 @@
#ifndef UNICODE_CATEGORY_H
#define UNICODE_CATEGORY_H
-#include "mb/pg_wchar.h"
-
/*
* Unicode General Category Values
*
@@ -61,31 +59,31 @@ typedef enum pg_unicode_category
PG_U_FINAL_PUNCTUATION = 29 /* Pf */
} pg_unicode_category;
-extern pg_unicode_category unicode_category(pg_wchar code);
+extern pg_unicode_category unicode_category(char32_t code);
extern const char *unicode_category_string(pg_unicode_category category);
extern const char *unicode_category_abbrev(pg_unicode_category category);
-extern bool pg_u_prop_alphabetic(pg_wchar code);
-extern bool pg_u_prop_lowercase(pg_wchar code);
-extern bool pg_u_prop_uppercase(pg_wchar code);
-extern bool pg_u_prop_cased(pg_wchar code);
-extern bool pg_u_prop_case_ignorable(pg_wchar code);
-extern bool pg_u_prop_white_space(pg_wchar code);
-extern bool pg_u_prop_hex_digit(pg_wchar code);
-extern bool pg_u_prop_join_control(pg_wchar code);
+extern bool pg_u_prop_alphabetic(char32_t code);
+extern bool pg_u_prop_lowercase(char32_t code);
+extern bool pg_u_prop_uppercase(char32_t code);
+extern bool pg_u_prop_cased(char32_t code);
+extern bool pg_u_prop_case_ignorable(char32_t code);
+extern bool pg_u_prop_white_space(char32_t code);
+extern bool pg_u_prop_hex_digit(char32_t code);
+extern bool pg_u_prop_join_control(char32_t code);
-extern bool pg_u_isdigit(pg_wchar code, bool posix);
-extern bool pg_u_isalpha(pg_wchar code);
-extern bool pg_u_isalnum(pg_wchar code, bool posix);
-extern bool pg_u_isword(pg_wchar code);
-extern bool pg_u_isupper(pg_wchar code);
-extern bool pg_u_islower(pg_wchar code);
-extern bool pg_u_isblank(pg_wchar code);
-extern bool pg_u_iscntrl(pg_wchar code);
-extern bool pg_u_isgraph(pg_wchar code);
-extern bool pg_u_isprint(pg_wchar code);
-extern bool pg_u_ispunct(pg_wchar code, bool posix);
-extern bool pg_u_isspace(pg_wchar code);
-extern bool pg_u_isxdigit(pg_wchar code, bool posix);
+extern bool pg_u_isdigit(char32_t code, bool posix);
+extern bool pg_u_isalpha(char32_t code);
+extern bool pg_u_isalnum(char32_t code, bool posix);
+extern bool pg_u_isword(char32_t code);
+extern bool pg_u_isupper(char32_t code);
+extern bool pg_u_islower(char32_t code);
+extern bool pg_u_isblank(char32_t code);
+extern bool pg_u_iscntrl(char32_t code);
+extern bool pg_u_isgraph(char32_t code);
+extern bool pg_u_isprint(char32_t code);
+extern bool pg_u_ispunct(char32_t code, bool posix);
+extern bool pg_u_isspace(char32_t code);
+extern bool pg_u_isxdigit(char32_t code, bool posix);
#endif /* UNICODE_CATEGORY_H */
diff --git a/src/include/common/unicode_category_table.h b/src/include/common/unicode_category_table.h
index 95a1c65da7e..466a41b72b0 100644
--- a/src/include/common/unicode_category_table.h
+++ b/src/include/common/unicode_category_table.h
@@ -20,15 +20,15 @@
*/
typedef struct
{
- uint32 first; /* Unicode codepoint */
- uint32 last; /* Unicode codepoint */
+ char32_t first; /* Unicode codepoint */
+ char32_t last; /* Unicode codepoint */
uint8 category; /* General Category */
} pg_category_range;
typedef struct
{
- uint32 first; /* Unicode codepoint */
- uint32 last; /* Unicode codepoint */
+ char32_t first; /* Unicode codepoint */
+ char32_t last; /* Unicode codepoint */
} pg_unicode_range;
typedef struct
diff --git a/src/include/common/unicode_norm.h b/src/include/common/unicode_norm.h
index 5bc3b79e78e..516c192cc4c 100644
--- a/src/include/common/unicode_norm.h
+++ b/src/include/common/unicode_norm.h
@@ -14,8 +14,6 @@
#ifndef UNICODE_NORM_H
#define UNICODE_NORM_H
-#include "mb/pg_wchar.h"
-
typedef enum
{
UNICODE_NFC = 0,
@@ -32,8 +30,8 @@ typedef enum
UNICODE_NORM_QC_MAYBE = -1,
} UnicodeNormalizationQC;
-extern pg_wchar *unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input);
+extern char32_t *unicode_normalize(UnicodeNormalizationForm form, const char32_t *input);
-extern UnicodeNormalizationQC unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input);
+extern UnicodeNormalizationQC unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const char32_t *input);
#endif /* UNICODE_NORM_H */
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 4b4a9974b75..a41bf47649e 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -532,7 +532,7 @@ typedef uint32 (*utf_local_conversion_func) (uint32 code);
* Some handy functions for Unicode-specific tests.
*/
static inline bool
-is_valid_unicode_codepoint(pg_wchar c)
+is_valid_unicode_codepoint(char32_t c)
{
return (c > 0 && c <= 0x10FFFF);
}
@@ -549,7 +549,7 @@ is_utf16_surrogate_second(pg_wchar c)
return (c >= 0xDC00 && c <= 0xDFFF);
}
-static inline pg_wchar
+static inline char32_t
surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
{
return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
@@ -588,7 +588,7 @@ utf8_to_unicode(const unsigned char *c)
* unicode_utf8len(c) bytes available.
*/
static inline unsigned char *
-unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
+unicode_to_utf8(char32_t c, unsigned char *utf8string)
{
if (c <= 0x7F)
{
@@ -620,7 +620,7 @@ unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
* Number of bytes needed to represent the given char in UTF8.
*/
static inline int
-unicode_utf8len(pg_wchar c)
+unicode_utf8len(char32_t c)
{
if (c <= 0x7F)
return 1;
@@ -676,8 +676,8 @@ extern int pg_valid_server_encoding(const char *name);
extern bool is_encoding_supported_by_icu(int encoding);
extern const char *get_encoding_name_for_icu(int encoding);
-extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string);
-extern pg_wchar utf8_to_unicode(const unsigned char *c);
+extern unsigned char *unicode_to_utf8(char32_t c, unsigned char *utf8string);
+extern char32_t utf8_to_unicode(const unsigned char *c);
extern bool pg_utf8_islegal(const unsigned char *source, int length);
extern int pg_utf_mblen(const unsigned char *s);
extern int pg_mule_mblen(const unsigned char *s);
@@ -739,8 +739,8 @@ extern char *pg_server_to_client(const char *s, int len);
extern char *pg_any_to_server(const char *s, int len, int encoding);
extern char *pg_server_to_any(const char *s, int len, int encoding);
-extern void pg_unicode_to_server(pg_wchar c, unsigned char *s);
-extern bool pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s);
+extern void pg_unicode_to_server(char32_t c, unsigned char *s);
+extern bool pg_unicode_to_server_noerror(char32_t c, unsigned char *s);
extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc);
extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc);
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 377a7946585..f2bb9b4bc7b 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -3505,6 +3505,7 @@ cb_cleanup_dir
cb_options
cb_tablespace
cb_tablespace_mapping
+char32_t
check_agg_arguments_context
check_function_callback
check_network_data
--
2.43.0
Now that we're using C11, should we use char32_t for unicode code
points?
Right now, we use pg_wchar for two purposes:
1. to abstract away some problems with wchar_t on platforms where
it's 16 bits; and
2. hold unicode code point values
In UTF8, they are equivalent and can be freely cast back and forth,
but not necessarily in other encodings. That can be confusing in some
contexts. Attached is a patch to use char32_t for the second purpose.
Both are equivalent to uint32, so there's no functional change and no
actual typechecking, it's just for readability.
Is this helpful, or needless code churn?
Unless char32_t is solely used for the Unicode code point data, I
think it would be better to define something like "pg_unicode" and use
it instead of directly using char32_t because it would be cleaner for
code readers.
Best regards,
--
Tatsuo Ishii
SRA OSS K.K.
English: http://www.sraoss.co.jp/index_en/
Japanese:http://www.sraoss.co.jp
On Fri, 2025-10-24 at 18:43 +0900, Tatsuo Ishii wrote:
Unless char32_t is solely used for the Unicode code point data, I
think it would be better to define something like "pg_unicode" and
use
it instead of directly using char32_t because it would be cleaner for
code readers.
That was my original idea, but then I saw that apparently char32_t is
intended for Unicode code points:
https://www.gnu.org/software/gnulib/manual/html_node/The-char32_005ft-type.html
But I am also OK with a new type if others find it more readable.
Regards,
Jeff Davis
On Sat, Oct 25, 2025 at 4:25 AM Jeff Davis <pgsql@j-davis.com> wrote:
On Fri, 2025-10-24 at 18:43 +0900, Tatsuo Ishii wrote:
Unless char32_t is solely used for the Unicode code point data, I
think it would be better to define something like "pg_unicode" and
use
it instead of directly using char32_t because it would be cleaner for
code readers.
That was my original idea, but then I saw that apparently char32_t is
intended for Unicode code points:
https://www.gnu.org/software/gnulib/manual/html_node/The-char32_005ft-type.html
It's definitely a codepoint but C11 only promised UTF-32 encoding if
__STDC_UTF_32__ is defined to 1, and otherwise the encoding is
unknown. The C23 standard resolved that insanity and required UTF-32,
and there are no known systems[1] that didn't already conform, but I
guess you could static_assert(__STDC_UTF_32__, "char32_t must use
UTF-32 encoding"). It's also defined as at least, not exactly, 32
bits but we already require the machine to have uint32_t so it must be
exactly 32 bits for us, and we could static_assert(sizeof(char32_t) ==
4) for good measure. So all up, the standard type matches our
existing assumptions about pg_wchar *if* the database encoding is
UTF8.
IIUC you're proposing that all the stuff that only works when database
encoding is UTF8 should be flipped over to the new type, and that
seems like a really good idea to me: remaining uses of pg_wchar would
be warnings that the encoding is only conditionally known. It'd be
documentation without new type safety though: for example I think you
missed a spot, the return type of the definition of utf8_to_unicode()
(I didn't search exhaustively). Only in C++ is it a distinct type
that would catch that and a few other mistakes.
Do you consider explicit casts between eg pg_wchar and char32_t to be
useful documentation for humans, when coercion should just work? I
kinda thought we were trying to cut down on useless casts, they might
signal something but can also hide bugs. Should the few places that
deal in surrogates be using char16_t instead?
I wonder if the XXX_libc_mb() functions that contain our hard-coded
assumptions that libc's wchar_t values are in UTF-16 or UTF-32 should
use your to_char32_t() too (probably with a longer name
pg_wchar_to_char32_t() if it's in a header for wider use). That'd
highlight the exact points at which we make that assumption and
centralise the assertion about database encoding, and then the code
that compares with various known cut-off values would be clearly in
the char32_t world.
But I am also OK with a new type if others find it more readable.
Adding yet another name to this soup doesn't immediately sound like it
would make anything more readable to me. ISO has standardised this
for the industry, so I'd vote for adopting it without indirection that
makes the reader work harder to understand what it is. The churn
doesn't seem excessive either, it's fairly well contained stuff
already moving around a lot in recent releases with all your recent
and ongoing revamping work.
There is one small practical problem though: Apple hasn't got around
to supplying <uchar.h> in its C SDK yet. It's there for C++ only, and
isn't needed for the type in C++ anyway. I don't think that alone
warrants a new name wart, as the standard tells us it must match
uint_least32_t so we can just define it ourselves if
!defined(__cplusplus) && !defined(HAVE_UCHAR_H), until Apple gets
around to that.
Since it confused me briefly: Apple does provide <unicode/uchar.h> but
that's a coincidentally named ICU header, and on that subject I see
that ICU hasn't adopted these types yet but there are some hints that
they're thinking about it; meanwhile their C++ interfaces have begun
to document that they are acceptable in a few template functions.
All other target systems have it AFAICS. Windows: tested by CI,
MinGW: found discussion, *BSD, Solaris, Illumos: found man pages.
As for the conversion functions in <uchar.h>, they're of course
missing on macOS but they also depend on the current locale, so it's
almost like C, POSIX and NetBSD have conspired to make them as useless
to us as possible. They solve the "size and encoding of wchar_t is
undefined" problem, but there are no _l() variants and we can't depend
on uselocale() being available. Probably wouldn't be much use to us
anyway considering our more complex and general transcoding
requirements, I just thought about this while contemplating
hypothetical pre-C23 systems that don't use UTF-32, specifically what
would break if such a system existed: probably nothing as long as you
don't use these. I guess another way you could tell would be if you
used the fancy new U-prefixed character/string literal syntax, but I
can't see much need for that.
In passing, we seem to have a couple of mentions of "pg_wchar_t"
(bogus _t) in existing comments.
[1]: https://thephd.dev/c-the-improvements-june-september-virtual-c-meeting
On Sat, 2025-10-25 at 16:21 +1300, Thomas Munro wrote:
I
guess you could static_assert(__STDC_UTF_32__, "char32_t must use
UTF-32 encoding").
Done.
It's also defined as at least, not exactly, 32
bits but we already require the machine to have uint32_t so it must
be
exactly 32 bits for us, and we could static_assert(sizeof(char32_t)
==
4) for good measure.
What would be the problem if it were larger than 32 bits?
I don't mind adding the asserts, but it's slightly awkward because
StaticAssertDecl() isn't defined yet at the point we are including
uchar.h.
IIUC you're proposing that all the stuff that only works when
database
encoding is UTF8 should be flipped over to the new type, and that
seems like a really good idea to me: remaining uses of pg_wchar would
be warnings that the encoding is only conditionally known.
Exactly. The idea is to make pg_wchar stand out more as a platform-
dependent (or encoding-dependent) representation, and remove the doubt
when someone sees char32_t.
It'd be
documentation without new type safety though: for example I think you
missed a spot, the return type of the definition of utf8_to_unicode()
(I didn't search exhaustively).
Right, it's not offering type safety. Fixed the omission.
Do you consider explicit casts between eg pg_wchar and char32_t to be
useful documentation for humans, when coercion should just work? I
kinda thought we were trying to cut down on useless casts, they might
signal something but can also hide bugs.
The patch doesn't add any explicit casts, except in to_char32() and
to_pg_wchar(), so I assume that the callsites of those functions are
what you meant by "explicit casts"?
We can get rid of those functions if you want. The main reason they
exist is for a place to comment on the safety of converting pg_wchar to
char32_t. I can put that somewhere else, though.
Should the few places that
deal in surrogates be using char16_t instead?
Yes, done.
I wonder if the XXX_libc_mb() functions that contain our hard-coded
assumptions that libc's wchar_t values are in UTF-16 or UTF-32 should
use your to_char32_t() too (probably with a longer name
pg_wchar_to_char32_t() if it's in a header for wider use).
I don't think those functions do depend on UTF-32. iswalpha(), etc.,
take a wint_t, which is just a wchar_t that can also be WEOF.
And if we don't use to_char32/to_pg_wchar in there, I don't see much
need for it outside of pg_locale_builtin.c, but if the need arises we
can move it to a header file and give it a longer name.
That'd
highlight the exact points at which we make that assumption and
centralise the assertion about database encoding, and then the code
that compares with various known cut-off values would be clearly in
the char32_t world.
The asserts about UTF-8 in pg_locale_libc.c are there because the
previous code only took those code paths for UTF-8, and I preserved
that. Also there is some code that depends on UTF-8 for decoding, but I
don't think anything in there depends on UTF-32 specifically.
There is one small practical problem though: Apple hasn't got around
to supplying <uchar.h> in its C SDK yet. It's there for C++ only,
and
isn't needed for the type in C++ anyway. I don't think that alone
warrants a new name wart, as the standard tells us it must match
uint_least32_t so we can just define it ourselves if
!defined(__cplusplus) && !defined(HAVE_UCHAR_H), until Apple gets
around to that.
Thank you, I added a configure test for uchar.h and some more
preprocessor logic in c.h.
Since it confused me briefly: Apple does provide <unicode/uchar.h>
but
that's a coincidentally named ICU header, and on that subject I see
that ICU hasn't adopted these types yet but there are some hints that
they're thinking about it; meanwhile their C++ interfaces have begun
to document that they are acceptable in a few template functions.
Even when they fully move to char32_t, we will still have to support
the older ICU versions for a long time.
All other target systems have it AFAICS. Windows: tested by CI,
MinGW: found discussion, *BSD, Solaris, Illumos: found man pages.
Great, thank you!
They solve the "size and encoding of wchar_t is
undefined" problem
One thing I never understood about this is that it's our code that
converts from the server encoding to pg_wchar (e.g.
pg_latin12wchar_with_len()), so we must understand the representation
of pg_wchar. And we cast directly from pg_wchar to wchar_t, so we
understand the encoding of wchar_t, too, right?
In passing, we seem to have a couple of mentions of "pg_wchar_t"
(bogus _t) in existing comments.
Thank you. I'll fix that separately.
Regards,
Jeff Davis
Attachments:
v2-0001-Use-C11-char16_t-and-char32_t-for-Unicode-code-po.patchtext/x-patch; charset=UTF-8; name=v2-0001-Use-C11-char16_t-and-char32_t-for-Unicode-code-po.patchDownload
From a289df81514b1f784f9242a1c214bcc0749b6f9c Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Tue, 21 Oct 2025 13:16:47 -0700
Subject: [PATCH v2] Use C11 char16_t and char32_t for Unicode code points.
Reviewed-by: Tatsuo Ishii <ishii@postgresql.org>
Reviewed-by: Thomas Munro <thomas.munro@gmail.com>
Discussion: https://postgr.es/m/bedcc93d06203dfd89815b10f815ca2de8626e85.camel%40j-davis.com
---
configure.ac | 1 +
meson.build | 1 +
src/backend/parser/parser.c | 8 +--
src/backend/parser/scan.l | 8 +--
src/backend/utils/adt/jsonpath_scan.l | 6 +-
src/backend/utils/adt/pg_locale_builtin.c | 44 ++++++++++-----
src/backend/utils/adt/varlena.c | 40 ++++++-------
src/backend/utils/mb/mbutils.c | 4 +-
src/common/saslprep.c | 48 ++++++++--------
src/common/unicode/case_test.c | 23 ++++----
src/common/unicode/category_test.c | 3 +-
.../unicode/generate-norm_test_table.pl | 4 +-
.../unicode/generate-unicode_case_table.pl | 7 +--
.../generate-unicode_category_table.pl | 8 +--
src/common/unicode/norm_test.c | 6 +-
src/common/unicode_case.c | 56 +++++++++----------
src/common/unicode_category.c | 50 ++++++++---------
src/common/unicode_norm.c | 56 +++++++++----------
src/fe_utils/mbprint.c | 10 ++--
src/include/c.h | 19 +++++++
src/include/common/unicode_case.h | 10 ++--
src/include/common/unicode_case_table.h | 13 ++---
src/include/common/unicode_category.h | 46 ++++++++-------
src/include/common/unicode_category_table.h | 8 +--
src/include/common/unicode_norm.h | 6 +-
src/include/mb/pg_wchar.h | 32 +++++------
src/tools/pgindent/typedefs.list | 2 +
27 files changed, 276 insertions(+), 243 deletions(-)
diff --git a/configure.ac b/configure.ac
index e44943aa6fe..6ab2e157531 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1513,6 +1513,7 @@ AC_CHECK_HEADERS(m4_normalize([
sys/signalfd.h
sys/ucred.h
termios.h
+ uchar.h
ucred.h
xlocale.h
]))
diff --git a/meson.build b/meson.build
index 395416a6060..c3128c7554f 100644
--- a/meson.build
+++ b/meson.build
@@ -2613,6 +2613,7 @@ header_checks = [
'sys/signalfd.h',
'sys/ucred.h',
'termios.h',
+ 'uchar.h',
'ucred.h',
'xlocale.h',
]
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
index 33a040506b4..a3679f8e86c 100644
--- a/src/backend/parser/parser.c
+++ b/src/backend/parser/parser.c
@@ -339,7 +339,7 @@ hexval(unsigned char c)
/* is Unicode code point acceptable? */
static void
-check_unicode_value(pg_wchar c)
+check_unicode_value(char32_t c)
{
if (!is_valid_unicode_codepoint(c))
ereport(ERROR,
@@ -376,7 +376,7 @@ str_udeescape(const char *str, char escape,
char *new,
*out;
size_t new_len;
- pg_wchar pair_first = 0;
+ char16_t pair_first = 0;
ScannerCallbackState scbstate;
/*
@@ -420,7 +420,7 @@ str_udeescape(const char *str, char escape,
isxdigit((unsigned char) in[3]) &&
isxdigit((unsigned char) in[4]))
{
- pg_wchar unicode;
+ char32_t unicode;
unicode = (hexval(in[1]) << 12) +
(hexval(in[2]) << 8) +
@@ -457,7 +457,7 @@ str_udeescape(const char *str, char escape,
isxdigit((unsigned char) in[6]) &&
isxdigit((unsigned char) in[7]))
{
- pg_wchar unicode;
+ char32_t unicode;
unicode = (hexval(in[2]) << 20) +
(hexval(in[3]) << 16) +
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index 08990831fe8..a67815339b7 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -121,7 +121,7 @@ static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
static char *litbufdup(core_yyscan_t yyscanner);
static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner);
static int process_integer_literal(const char *token, YYSTYPE *lval, int base);
-static void addunicode(pg_wchar c, yyscan_t yyscanner);
+static void addunicode(char32_t c, yyscan_t yyscanner);
#define yyerror(msg) scanner_yyerror(msg, yyscanner)
@@ -640,7 +640,7 @@ other .
addlit(yytext, yyleng, yyscanner);
}
<xe>{xeunicode} {
- pg_wchar c = strtoul(yytext + 2, NULL, 16);
+ char32_t c = strtoul(yytext + 2, NULL, 16);
/*
* For consistency with other productions, issue any
@@ -668,7 +668,7 @@ other .
POP_YYLLOC();
}
<xeu>{xeunicode} {
- pg_wchar c = strtoul(yytext + 2, NULL, 16);
+ char32_t c = strtoul(yytext + 2, NULL, 16);
/* Remember start of overall string token ... */
PUSH_YYLLOC();
@@ -1376,7 +1376,7 @@ process_integer_literal(const char *token, YYSTYPE *lval, int base)
}
static void
-addunicode(pg_wchar c, core_yyscan_t yyscanner)
+addunicode(char32_t c, core_yyscan_t yyscanner)
{
ScannerCallbackState scbstate;
char buf[MAX_UNICODE_EQUIVALENT_STRING + 1];
diff --git a/src/backend/utils/adt/jsonpath_scan.l b/src/backend/utils/adt/jsonpath_scan.l
index c7aab83eeb4..8c3a0a9c642 100644
--- a/src/backend/utils/adt/jsonpath_scan.l
+++ b/src/backend/utils/adt/jsonpath_scan.l
@@ -574,7 +574,7 @@ hexval(char c, int *result, struct Node *escontext, yyscan_t yyscanner)
/* Add given unicode character to scanstring */
static bool
-addUnicodeChar(int ch, struct Node *escontext, yyscan_t yyscanner)
+addUnicodeChar(char32_t ch, struct Node *escontext, yyscan_t yyscanner)
{
if (ch == 0)
{
@@ -607,7 +607,7 @@ addUnicodeChar(int ch, struct Node *escontext, yyscan_t yyscanner)
/* Add unicode character, processing any surrogate pairs */
static bool
-addUnicode(int ch, int *hi_surrogate, struct Node *escontext, yyscan_t yyscanner)
+addUnicode(char32_t ch, int *hi_surrogate, struct Node *escontext, yyscan_t yyscanner)
{
if (is_utf16_surrogate_first(ch))
{
@@ -655,7 +655,7 @@ parseUnicode(char *s, int l, struct Node *escontext, yyscan_t yyscanner)
for (i = 2; i < l; i += 2) /* skip '\u' */
{
- int ch = 0;
+ char32_t ch = 0;
int j,
si;
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 3dc611b50e1..1021e0d129b 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -15,7 +15,6 @@
#include "catalog/pg_collation.h"
#include "common/unicode_case.h"
#include "common/unicode_category.h"
-#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "utils/builtins.h"
#include "utils/pg_locale.h"
@@ -35,6 +34,23 @@ struct WordBoundaryState
bool prev_alnum;
};
+/*
+ * In UTF-8, pg_wchar is guaranteed to be the code point value.
+ */
+static inline char32_t
+to_char32(pg_wchar wc)
+{
+ Assert(GetDatabaseEncoding() == PG_UTF8);
+ return (char32_t) wc;
+}
+
+static inline pg_wchar
+to_pg_wchar(char32_t c32)
+{
+ Assert(GetDatabaseEncoding() == PG_UTF8);
+ return (pg_wchar) c32;
+}
+
/*
* Simple word boundary iterator that draws boundaries each time the result of
* pg_u_isalnum() changes.
@@ -47,7 +63,7 @@ initcap_wbnext(void *state)
while (wbstate->offset < wbstate->len &&
wbstate->str[wbstate->offset] != '\0')
{
- pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
+ char32_t u = utf8_to_unicode((unsigned char *) wbstate->str +
wbstate->offset);
bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
@@ -112,61 +128,61 @@ strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
static bool
wc_isdigit_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isdigit(wc, !locale->builtin.casemap_full);
+ return pg_u_isdigit(to_char32(wc), !locale->builtin.casemap_full);
}
static bool
wc_isalpha_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isalpha(wc);
+ return pg_u_isalpha(to_char32(wc));
}
static bool
wc_isalnum_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isalnum(wc, !locale->builtin.casemap_full);
+ return pg_u_isalnum(to_char32(wc), !locale->builtin.casemap_full);
}
static bool
wc_isupper_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isupper(wc);
+ return pg_u_isupper(to_char32(wc));
}
static bool
wc_islower_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_islower(wc);
+ return pg_u_islower(to_char32(wc));
}
static bool
wc_isgraph_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isgraph(wc);
+ return pg_u_isgraph(to_char32(wc));
}
static bool
wc_isprint_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isprint(wc);
+ return pg_u_isprint(to_char32(wc));
}
static bool
wc_ispunct_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_ispunct(wc, !locale->builtin.casemap_full);
+ return pg_u_ispunct(to_char32(wc), !locale->builtin.casemap_full);
}
static bool
wc_isspace_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isspace(wc);
+ return pg_u_isspace(to_char32(wc));
}
static bool
wc_isxdigit_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isxdigit(wc, !locale->builtin.casemap_full);
+ return pg_u_isxdigit(to_char32(wc), !locale->builtin.casemap_full);
}
static bool
@@ -179,13 +195,13 @@ char_is_cased_builtin(char ch, pg_locale_t locale)
static pg_wchar
wc_toupper_builtin(pg_wchar wc, pg_locale_t locale)
{
- return unicode_uppercase_simple(wc);
+ return to_pg_wchar(unicode_uppercase_simple(to_char32(wc)));
}
static pg_wchar
wc_tolower_builtin(pg_wchar wc, pg_locale_t locale)
{
- return unicode_lowercase_simple(wc);
+ return to_pg_wchar(unicode_lowercase_simple(to_char32(wc)));
}
static const struct ctype_methods ctype_methods_builtin = {
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 2c398cd9e5c..8d735786e51 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -5419,12 +5419,12 @@ unicode_assigned(PG_FUNCTION_ARGS)
ereport(ERROR,
(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
- /* convert to pg_wchar */
+ /* convert to char32_t */
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
p = (unsigned char *) VARDATA_ANY(input);
for (int i = 0; i < size; i++)
{
- pg_wchar uchar = utf8_to_unicode(p);
+ char32_t uchar = utf8_to_unicode(p);
int category = unicode_category(uchar);
if (category == PG_U_UNASSIGNED)
@@ -5443,24 +5443,24 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
UnicodeNormalizationForm form;
int size;
- pg_wchar *input_chars;
- pg_wchar *output_chars;
+ char32_t *input_chars;
+ char32_t *output_chars;
unsigned char *p;
text *result;
int i;
form = unicode_norm_form_from_string(formstr);
- /* convert to pg_wchar */
+ /* convert to char32_t */
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
- input_chars = palloc((size + 1) * sizeof(pg_wchar));
+ input_chars = palloc((size + 1) * sizeof(char32_t));
p = (unsigned char *) VARDATA_ANY(input);
for (i = 0; i < size; i++)
{
input_chars[i] = utf8_to_unicode(p);
p += pg_utf_mblen(p);
}
- input_chars[i] = (pg_wchar) '\0';
+ input_chars[i] = (char32_t) '\0';
Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
/* action */
@@ -5468,7 +5468,7 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
/* convert back to UTF-8 string */
size = 0;
- for (pg_wchar *wp = output_chars; *wp; wp++)
+ for (char32_t *wp = output_chars; *wp; wp++)
{
unsigned char buf[4];
@@ -5480,7 +5480,7 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
SET_VARSIZE(result, size + VARHDRSZ);
p = (unsigned char *) VARDATA_ANY(result);
- for (pg_wchar *wp = output_chars; *wp; wp++)
+ for (char32_t *wp = output_chars; *wp; wp++)
{
unicode_to_utf8(*wp, p);
p += pg_utf_mblen(p);
@@ -5509,8 +5509,8 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
UnicodeNormalizationForm form;
int size;
- pg_wchar *input_chars;
- pg_wchar *output_chars;
+ char32_t *input_chars;
+ char32_t *output_chars;
unsigned char *p;
int i;
UnicodeNormalizationQC quickcheck;
@@ -5519,16 +5519,16 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
form = unicode_norm_form_from_string(formstr);
- /* convert to pg_wchar */
+ /* convert to char32_t */
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
- input_chars = palloc((size + 1) * sizeof(pg_wchar));
+ input_chars = palloc((size + 1) * sizeof(char32_t));
p = (unsigned char *) VARDATA_ANY(input);
for (i = 0; i < size; i++)
{
input_chars[i] = utf8_to_unicode(p);
p += pg_utf_mblen(p);
}
- input_chars[i] = (pg_wchar) '\0';
+ input_chars[i] = (char32_t) '\0';
Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
/* quick check (see UAX #15) */
@@ -5542,11 +5542,11 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
output_chars = unicode_normalize(form, input_chars);
output_size = 0;
- for (pg_wchar *wp = output_chars; *wp; wp++)
+ for (char32_t *wp = output_chars; *wp; wp++)
output_size++;
result = (size == output_size) &&
- (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
+ (memcmp(input_chars, output_chars, size * sizeof(char32_t)) == 0);
PG_RETURN_BOOL(result);
}
@@ -5602,7 +5602,7 @@ unistr(PG_FUNCTION_ARGS)
int len;
StringInfoData str;
text *result;
- pg_wchar pair_first = 0;
+ char16_t pair_first = 0;
char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
instr = VARDATA_ANY(input_text);
@@ -5626,7 +5626,7 @@ unistr(PG_FUNCTION_ARGS)
else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
(len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
{
- pg_wchar unicode;
+ char32_t unicode;
int offset = instr[1] == 'u' ? 2 : 1;
unicode = hexval_n(instr + offset, 4);
@@ -5662,7 +5662,7 @@ unistr(PG_FUNCTION_ARGS)
}
else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
{
- pg_wchar unicode;
+ char32_t unicode;
unicode = hexval_n(instr + 2, 6);
@@ -5697,7 +5697,7 @@ unistr(PG_FUNCTION_ARGS)
}
else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
{
- pg_wchar unicode;
+ char32_t unicode;
unicode = hexval_n(instr + 2, 8);
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index 886ecbad871..fb629ed5c8f 100644
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -862,7 +862,7 @@ perform_default_encoding_conversion(const char *src, int len,
* may call this outside any transaction, or in an aborted transaction.
*/
void
-pg_unicode_to_server(pg_wchar c, unsigned char *s)
+pg_unicode_to_server(char32_t c, unsigned char *s)
{
unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
int c_as_utf8_len;
@@ -924,7 +924,7 @@ pg_unicode_to_server(pg_wchar c, unsigned char *s)
* but simply return false on conversion failure.
*/
bool
-pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s)
+pg_unicode_to_server_noerror(char32_t c, unsigned char *s)
{
unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
int c_as_utf8_len;
diff --git a/src/common/saslprep.c b/src/common/saslprep.c
index 97beb47940b..101e8d65a4d 100644
--- a/src/common/saslprep.c
+++ b/src/common/saslprep.c
@@ -47,7 +47,7 @@
/* Prototypes for local functions */
static int codepoint_range_cmp(const void *a, const void *b);
-static bool is_code_in_table(pg_wchar code, const pg_wchar *map, int mapsize);
+static bool is_code_in_table(char32_t code, const char32_t *map, int mapsize);
static int pg_utf8_string_len(const char *source);
/*
@@ -64,7 +64,7 @@ static int pg_utf8_string_len(const char *source);
*
* These are all mapped to the ASCII space character (U+00A0).
*/
-static const pg_wchar non_ascii_space_ranges[] =
+static const char32_t non_ascii_space_ranges[] =
{
0x00A0, 0x00A0,
0x1680, 0x1680,
@@ -79,7 +79,7 @@ static const pg_wchar non_ascii_space_ranges[] =
*
* If any of these appear in the input, they are removed.
*/
-static const pg_wchar commonly_mapped_to_nothing_ranges[] =
+static const char32_t commonly_mapped_to_nothing_ranges[] =
{
0x00AD, 0x00AD,
0x034F, 0x034F,
@@ -114,7 +114,7 @@ static const pg_wchar commonly_mapped_to_nothing_ranges[] =
* tables, so one code might originate from multiple source tables.
* Adjacent ranges have also been merged together, to save space.
*/
-static const pg_wchar prohibited_output_ranges[] =
+static const char32_t prohibited_output_ranges[] =
{
0x0000, 0x001F, /* C.2.1 */
0x007F, 0x00A0, /* C.1.2, C.2.1, C.2.2 */
@@ -155,7 +155,7 @@ static const pg_wchar prohibited_output_ranges[] =
};
/* A.1 Unassigned code points in Unicode 3.2 */
-static const pg_wchar unassigned_codepoint_ranges[] =
+static const char32_t unassigned_codepoint_ranges[] =
{
0x0221, 0x0221,
0x0234, 0x024F,
@@ -556,7 +556,7 @@ static const pg_wchar unassigned_codepoint_ranges[] =
};
/* D.1 Characters with bidirectional property "R" or "AL" */
-static const pg_wchar RandALCat_codepoint_ranges[] =
+static const char32_t RandALCat_codepoint_ranges[] =
{
0x05BE, 0x05BE,
0x05C0, 0x05C0,
@@ -595,7 +595,7 @@ static const pg_wchar RandALCat_codepoint_ranges[] =
};
/* D.2 Characters with bidirectional property "L" */
-static const pg_wchar LCat_codepoint_ranges[] =
+static const char32_t LCat_codepoint_ranges[] =
{
0x0041, 0x005A,
0x0061, 0x007A,
@@ -968,8 +968,8 @@ static const pg_wchar LCat_codepoint_ranges[] =
static int
codepoint_range_cmp(const void *a, const void *b)
{
- const pg_wchar *key = (const pg_wchar *) a;
- const pg_wchar *range = (const pg_wchar *) b;
+ const char32_t *key = (const char32_t *) a;
+ const char32_t *range = (const char32_t *) b;
if (*key < range[0])
return -1; /* less than lower bound */
@@ -980,14 +980,14 @@ codepoint_range_cmp(const void *a, const void *b)
}
static bool
-is_code_in_table(pg_wchar code, const pg_wchar *map, int mapsize)
+is_code_in_table(char32_t code, const char32_t *map, int mapsize)
{
Assert(mapsize % 2 == 0);
if (code < map[0] || code > map[mapsize - 1])
return false;
- if (bsearch(&code, map, mapsize / 2, sizeof(pg_wchar) * 2,
+ if (bsearch(&code, map, mapsize / 2, sizeof(char32_t) * 2,
codepoint_range_cmp))
return true;
else
@@ -1046,8 +1046,8 @@ pg_utf8_string_len(const char *source)
pg_saslprep_rc
pg_saslprep(const char *input, char **output)
{
- pg_wchar *input_chars = NULL;
- pg_wchar *output_chars = NULL;
+ char32_t *input_chars = NULL;
+ char32_t *output_chars = NULL;
int input_size;
char *result;
int result_size;
@@ -1055,7 +1055,7 @@ pg_saslprep(const char *input, char **output)
int i;
bool contains_RandALCat;
unsigned char *p;
- pg_wchar *wp;
+ char32_t *wp;
/* Ensure we return *output as NULL on failure */
*output = NULL;
@@ -1080,10 +1080,10 @@ pg_saslprep(const char *input, char **output)
input_size = pg_utf8_string_len(input);
if (input_size < 0)
return SASLPREP_INVALID_UTF8;
- if (input_size >= MaxAllocSize / sizeof(pg_wchar))
+ if (input_size >= MaxAllocSize / sizeof(char32_t))
goto oom;
- input_chars = ALLOC((input_size + 1) * sizeof(pg_wchar));
+ input_chars = ALLOC((input_size + 1) * sizeof(char32_t));
if (!input_chars)
goto oom;
@@ -1093,7 +1093,7 @@ pg_saslprep(const char *input, char **output)
input_chars[i] = utf8_to_unicode(p);
p += pg_utf_mblen(p);
}
- input_chars[i] = (pg_wchar) '\0';
+ input_chars[i] = (char32_t) '\0';
/*
* The steps below correspond to the steps listed in [RFC3454], Section
@@ -1107,7 +1107,7 @@ pg_saslprep(const char *input, char **output)
count = 0;
for (i = 0; i < input_size; i++)
{
- pg_wchar code = input_chars[i];
+ char32_t code = input_chars[i];
if (IS_CODE_IN_TABLE(code, non_ascii_space_ranges))
input_chars[count++] = 0x0020;
@@ -1118,7 +1118,7 @@ pg_saslprep(const char *input, char **output)
else
input_chars[count++] = code;
}
- input_chars[count] = (pg_wchar) '\0';
+ input_chars[count] = (char32_t) '\0';
input_size = count;
if (input_size == 0)
@@ -1138,7 +1138,7 @@ pg_saslprep(const char *input, char **output)
*/
for (i = 0; i < input_size; i++)
{
- pg_wchar code = input_chars[i];
+ char32_t code = input_chars[i];
if (IS_CODE_IN_TABLE(code, prohibited_output_ranges))
goto prohibited;
@@ -1170,7 +1170,7 @@ pg_saslprep(const char *input, char **output)
contains_RandALCat = false;
for (i = 0; i < input_size; i++)
{
- pg_wchar code = input_chars[i];
+ char32_t code = input_chars[i];
if (IS_CODE_IN_TABLE(code, RandALCat_codepoint_ranges))
{
@@ -1181,12 +1181,12 @@ pg_saslprep(const char *input, char **output)
if (contains_RandALCat)
{
- pg_wchar first = input_chars[0];
- pg_wchar last = input_chars[input_size - 1];
+ char32_t first = input_chars[0];
+ char32_t last = input_chars[input_size - 1];
for (i = 0; i < input_size; i++)
{
- pg_wchar code = input_chars[i];
+ char32_t code = input_chars[i];
if (IS_CODE_IN_TABLE(code, LCat_codepoint_ranges))
goto prohibited;
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index fdfb62e8552..00d4f85e5a5 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -24,6 +24,7 @@
#include "common/unicode_case.h"
#include "common/unicode_category.h"
#include "common/unicode_version.h"
+#include "mb/pg_wchar.h"
/* enough to hold largest source or result string, including NUL */
#define BUFSZ 256
@@ -54,7 +55,7 @@ initcap_wbnext(void *state)
while (wbstate->offset < wbstate->len &&
wbstate->str[wbstate->offset] != '\0')
{
- pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
+ char32_t u = utf8_to_unicode((unsigned char *) wbstate->str +
wbstate->offset);
bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
@@ -77,16 +78,16 @@ initcap_wbnext(void *state)
#ifdef USE_ICU
static void
-icu_test_simple(pg_wchar code)
+icu_test_simple(char32_t code)
{
- pg_wchar lower = unicode_lowercase_simple(code);
- pg_wchar title = unicode_titlecase_simple(code);
- pg_wchar upper = unicode_uppercase_simple(code);
- pg_wchar fold = unicode_casefold_simple(code);
- pg_wchar iculower = u_tolower(code);
- pg_wchar icutitle = u_totitle(code);
- pg_wchar icuupper = u_toupper(code);
- pg_wchar icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
+ char32_t lower = unicode_lowercase_simple(code);
+ char32_t title = unicode_titlecase_simple(code);
+ char32_t upper = unicode_uppercase_simple(code);
+ char32_t fold = unicode_casefold_simple(code);
+ char32_t iculower = u_tolower(code);
+ char32_t icutitle = u_totitle(code);
+ char32_t icuupper = u_toupper(code);
+ char32_t icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
if (lower != iculower || title != icutitle || upper != icuupper ||
fold != icufold)
@@ -172,7 +173,7 @@ test_icu(void)
int successful = 0;
int skipped_mismatch = 0;
- for (pg_wchar code = 0; code <= 0x10ffff; code++)
+ for (char32_t code = 0; code <= 0x10ffff; code++)
{
pg_unicode_category category = unicode_category(code);
diff --git a/src/common/unicode/category_test.c b/src/common/unicode/category_test.c
index 5d37ba39196..1e8c1f7905f 100644
--- a/src/common/unicode/category_test.c
+++ b/src/common/unicode/category_test.c
@@ -22,6 +22,7 @@
#include "common/unicode_category.h"
#include "common/unicode_version.h"
+#include "mb/pg_wchar.h"
static int pg_unicode_version = 0;
#ifdef USE_ICU
@@ -59,7 +60,7 @@ icu_test()
int pg_skipped_codepoints = 0;
int icu_skipped_codepoints = 0;
- for (pg_wchar code = 0; code <= 0x10ffff; code++)
+ for (char32_t code = 0; code <= 0x10ffff; code++)
{
uint8_t pg_category = unicode_category(code);
uint8_t icu_category = u_charType(code);
diff --git a/src/common/unicode/generate-norm_test_table.pl b/src/common/unicode/generate-norm_test_table.pl
index 1b401be9409..1a8b908ff33 100644
--- a/src/common/unicode/generate-norm_test_table.pl
+++ b/src/common/unicode/generate-norm_test_table.pl
@@ -47,8 +47,8 @@ print $OUTPUT <<HEADER;
typedef struct
{
int linenum;
- pg_wchar input[50];
- pg_wchar output[4][50];
+ char32_t input[50];
+ char32_t output[4][50];
} pg_unicode_test;
/* test table */
diff --git a/src/common/unicode/generate-unicode_case_table.pl b/src/common/unicode/generate-unicode_case_table.pl
index 5d9ddd62803..f71eb25c94e 100644
--- a/src/common/unicode/generate-unicode_case_table.pl
+++ b/src/common/unicode/generate-unicode_case_table.pl
@@ -270,7 +270,6 @@ print $OT <<"EOS";
*/
#include "common/unicode_case.h"
-#include "mb/pg_wchar.h"
/*
* The maximum number of codepoints that can result from case mapping
@@ -297,7 +296,7 @@ typedef enum
typedef struct
{
int16 conditions;
- pg_wchar map[NCaseKind][MAX_CASE_EXPANSION];
+ char32_t map[NCaseKind][MAX_CASE_EXPANSION];
} pg_special_case;
/*
@@ -430,7 +429,7 @@ foreach my $kind ('lower', 'title', 'upper', 'fold')
* The entry case_map_${kind}[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_$kind\[$index\] =
+static const char32_t case_map_$kind\[$index\] =
{
EOS
@@ -502,7 +501,7 @@ print $OT <<"EOS";
* the offset into the mapping tables.
*/
static inline uint16
-case_index(pg_wchar cp)
+case_index(char32_t cp)
{
/* Fast path for codepoints < $fastpath_limit */
if (cp < $fastpath_limit)
diff --git a/src/common/unicode/generate-unicode_category_table.pl b/src/common/unicode/generate-unicode_category_table.pl
index abab5cd9696..7e094b13720 100644
--- a/src/common/unicode/generate-unicode_category_table.pl
+++ b/src/common/unicode/generate-unicode_category_table.pl
@@ -366,15 +366,15 @@ print $OT <<"EOS";
*/
typedef struct
{
- uint32 first; /* Unicode codepoint */
- uint32 last; /* Unicode codepoint */
+ char32_t first; /* Unicode codepoint */
+ char32_t last; /* Unicode codepoint */
uint8 category; /* General Category */
} pg_category_range;
typedef struct
{
- uint32 first; /* Unicode codepoint */
- uint32 last; /* Unicode codepoint */
+ char32_t first; /* Unicode codepoint */
+ char32_t last; /* Unicode codepoint */
} pg_unicode_range;
typedef struct
diff --git a/src/common/unicode/norm_test.c b/src/common/unicode/norm_test.c
index 25bc59463f2..058817f1719 100644
--- a/src/common/unicode/norm_test.c
+++ b/src/common/unicode/norm_test.c
@@ -20,7 +20,7 @@
#include "norm_test_table.h"
static char *
-print_wchar_str(const pg_wchar *s)
+print_wchar_str(const char32_t *s)
{
#define BUF_DIGITS 50
static char buf[BUF_DIGITS * 11 + 1];
@@ -41,7 +41,7 @@ print_wchar_str(const pg_wchar *s)
}
static int
-pg_wcscmp(const pg_wchar *s1, const pg_wchar *s2)
+pg_wcscmp(const char32_t *s1, const char32_t *s2)
{
for (;;)
{
@@ -65,7 +65,7 @@ main(int argc, char **argv)
{
for (int form = 0; form < 4; form++)
{
- pg_wchar *result;
+ char32_t *result;
result = unicode_normalize(form, test->input);
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index 073faf6a0d5..e5e494db43c 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -30,7 +30,7 @@ enum CaseMapResult
/*
* Map for each case kind.
*/
-static const pg_wchar *const casekind_map[NCaseKind] =
+static const char32_t *const casekind_map[NCaseKind] =
{
[CaseLower] = case_map_lower,
[CaseTitle] = case_map_title,
@@ -38,42 +38,42 @@ static const pg_wchar *const casekind_map[NCaseKind] =
[CaseFold] = case_map_fold,
};
-static pg_wchar find_case_map(pg_wchar ucs, const pg_wchar *map);
+static char32_t find_case_map(char32_t ucs, const char32_t *map);
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
void *wbstate);
-static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
+static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full,
const char *src, size_t srclen, size_t srcoff,
- pg_wchar *simple, const pg_wchar **special);
+ char32_t *simple, const char32_t **special);
-pg_wchar
-unicode_lowercase_simple(pg_wchar code)
+char32_t
+unicode_lowercase_simple(char32_t code)
{
- pg_wchar cp = find_case_map(code, case_map_lower);
+ char32_t cp = find_case_map(code, case_map_lower);
return cp != 0 ? cp : code;
}
-pg_wchar
-unicode_titlecase_simple(pg_wchar code)
+char32_t
+unicode_titlecase_simple(char32_t code)
{
- pg_wchar cp = find_case_map(code, case_map_title);
+ char32_t cp = find_case_map(code, case_map_title);
return cp != 0 ? cp : code;
}
-pg_wchar
-unicode_uppercase_simple(pg_wchar code)
+char32_t
+unicode_uppercase_simple(char32_t code)
{
- pg_wchar cp = find_case_map(code, case_map_upper);
+ char32_t cp = find_case_map(code, case_map_upper);
return cp != 0 ? cp : code;
}
-pg_wchar
-unicode_casefold_simple(pg_wchar code)
+char32_t
+unicode_casefold_simple(char32_t code)
{
- pg_wchar cp = find_case_map(code, case_map_fold);
+ char32_t cp = find_case_map(code, case_map_fold);
return cp != 0 ? cp : code;
}
@@ -231,10 +231,10 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
{
- pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
+ char32_t u1 = utf8_to_unicode((unsigned char *) src + srcoff);
int u1len = unicode_utf8len(u1);
- pg_wchar simple = 0;
- const pg_wchar *special = NULL;
+ char32_t simple = 0;
+ const char32_t *special = NULL;
enum CaseMapResult casemap_result;
if (str_casekind == CaseTitle)
@@ -265,8 +265,8 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
case CASEMAP_SIMPLE:
{
/* replace with single character */
- pg_wchar u2 = simple;
- pg_wchar u2len = unicode_utf8len(u2);
+ char32_t u2 = simple;
+ size_t u2len = unicode_utf8len(u2);
Assert(special == NULL);
if (result_len + u2len <= dstsize)
@@ -280,7 +280,7 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
Assert(simple == 0);
for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
{
- pg_wchar u2 = special[i];
+ char32_t u2 = special[i];
size_t u2len = unicode_utf8len(u2);
if (result_len + u2len <= dstsize)
@@ -320,7 +320,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
{
- pg_wchar curr = utf8_to_unicode(str + i);
+ char32_t curr = utf8_to_unicode(str + i);
if (pg_u_prop_case_ignorable(curr))
continue;
@@ -344,7 +344,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
{
- pg_wchar curr = utf8_to_unicode(str + i);
+ char32_t curr = utf8_to_unicode(str + i);
if (pg_u_prop_case_ignorable(curr))
continue;
@@ -394,9 +394,9 @@ check_special_conditions(int conditions, const char *str, size_t len,
* character without modification.
*/
static enum CaseMapResult
-casemap(pg_wchar u1, CaseKind casekind, bool full,
+casemap(char32_t u1, CaseKind casekind, bool full,
const char *src, size_t srclen, size_t srcoff,
- pg_wchar *simple, const pg_wchar **special)
+ char32_t *simple, const char32_t **special)
{
uint16 idx;
@@ -434,8 +434,8 @@ casemap(pg_wchar u1, CaseKind casekind, bool full,
* Find entry in simple case map.
* If the entry does not exist, 0 will be returned.
*/
-static pg_wchar
-find_case_map(pg_wchar ucs, const pg_wchar *map)
+static char32_t
+find_case_map(char32_t ucs, const char32_t *map)
{
/* Fast path for codepoints < 0x80 */
if (ucs < 0x80)
diff --git a/src/common/unicode_category.c b/src/common/unicode_category.c
index 4136c4d4f92..aab667a7bb4 100644
--- a/src/common/unicode_category.c
+++ b/src/common/unicode_category.c
@@ -1,7 +1,7 @@
/*-------------------------------------------------------------------------
* unicode_category.c
* Determine general category and character properties of Unicode
- * characters. Encoding must be UTF8, where we assume that the pg_wchar
+ * characters. Encoding must be UTF8, where we assume that the char32_t
* representation is a code point.
*
* Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
@@ -76,13 +76,13 @@
#define PG_U_CHARACTER_TAB 0x09
static bool range_search(const pg_unicode_range *tbl, size_t size,
- pg_wchar code);
+ char32_t code);
/*
* Unicode general category for the given codepoint.
*/
pg_unicode_category
-unicode_category(pg_wchar code)
+unicode_category(char32_t code)
{
int min = 0;
int mid;
@@ -108,7 +108,7 @@ unicode_category(pg_wchar code)
}
bool
-pg_u_prop_alphabetic(pg_wchar code)
+pg_u_prop_alphabetic(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_ALPHABETIC;
@@ -119,7 +119,7 @@ pg_u_prop_alphabetic(pg_wchar code)
}
bool
-pg_u_prop_lowercase(pg_wchar code)
+pg_u_prop_lowercase(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_LOWERCASE;
@@ -130,7 +130,7 @@ pg_u_prop_lowercase(pg_wchar code)
}
bool
-pg_u_prop_uppercase(pg_wchar code)
+pg_u_prop_uppercase(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_UPPERCASE;
@@ -141,7 +141,7 @@ pg_u_prop_uppercase(pg_wchar code)
}
bool
-pg_u_prop_cased(pg_wchar code)
+pg_u_prop_cased(char32_t code)
{
uint32 category_mask;
@@ -156,7 +156,7 @@ pg_u_prop_cased(pg_wchar code)
}
bool
-pg_u_prop_case_ignorable(pg_wchar code)
+pg_u_prop_case_ignorable(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_CASE_IGNORABLE;
@@ -167,7 +167,7 @@ pg_u_prop_case_ignorable(pg_wchar code)
}
bool
-pg_u_prop_white_space(pg_wchar code)
+pg_u_prop_white_space(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_WHITE_SPACE;
@@ -178,7 +178,7 @@ pg_u_prop_white_space(pg_wchar code)
}
bool
-pg_u_prop_hex_digit(pg_wchar code)
+pg_u_prop_hex_digit(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_HEX_DIGIT;
@@ -189,7 +189,7 @@ pg_u_prop_hex_digit(pg_wchar code)
}
bool
-pg_u_prop_join_control(pg_wchar code)
+pg_u_prop_join_control(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_JOIN_CONTROL;
@@ -208,7 +208,7 @@ pg_u_prop_join_control(pg_wchar code)
*/
bool
-pg_u_isdigit(pg_wchar code, bool posix)
+pg_u_isdigit(char32_t code, bool posix)
{
if (posix)
return ('0' <= code && code <= '9');
@@ -217,19 +217,19 @@ pg_u_isdigit(pg_wchar code, bool posix)
}
bool
-pg_u_isalpha(pg_wchar code)
+pg_u_isalpha(char32_t code)
{
return pg_u_prop_alphabetic(code);
}
bool
-pg_u_isalnum(pg_wchar code, bool posix)
+pg_u_isalnum(char32_t code, bool posix)
{
return pg_u_isalpha(code) || pg_u_isdigit(code, posix);
}
bool
-pg_u_isword(pg_wchar code)
+pg_u_isword(char32_t code)
{
uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
@@ -240,32 +240,32 @@ pg_u_isword(pg_wchar code)
}
bool
-pg_u_isupper(pg_wchar code)
+pg_u_isupper(char32_t code)
{
return pg_u_prop_uppercase(code);
}
bool
-pg_u_islower(pg_wchar code)
+pg_u_islower(char32_t code)
{
return pg_u_prop_lowercase(code);
}
bool
-pg_u_isblank(pg_wchar code)
+pg_u_isblank(char32_t code)
{
return code == PG_U_CHARACTER_TAB ||
unicode_category(code) == PG_U_SPACE_SEPARATOR;
}
bool
-pg_u_iscntrl(pg_wchar code)
+pg_u_iscntrl(char32_t code)
{
return unicode_category(code) == PG_U_CONTROL;
}
bool
-pg_u_isgraph(pg_wchar code)
+pg_u_isgraph(char32_t code)
{
uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
@@ -276,7 +276,7 @@ pg_u_isgraph(pg_wchar code)
}
bool
-pg_u_isprint(pg_wchar code)
+pg_u_isprint(char32_t code)
{
pg_unicode_category category = unicode_category(code);
@@ -287,7 +287,7 @@ pg_u_isprint(pg_wchar code)
}
bool
-pg_u_ispunct(pg_wchar code, bool posix)
+pg_u_ispunct(char32_t code, bool posix)
{
uint32 category_mask;
@@ -308,13 +308,13 @@ pg_u_ispunct(pg_wchar code, bool posix)
}
bool
-pg_u_isspace(pg_wchar code)
+pg_u_isspace(char32_t code)
{
return pg_u_prop_white_space(code);
}
bool
-pg_u_isxdigit(pg_wchar code, bool posix)
+pg_u_isxdigit(char32_t code, bool posix)
{
if (posix)
return (('0' <= code && code <= '9') ||
@@ -478,7 +478,7 @@ unicode_category_abbrev(pg_unicode_category category)
* given table.
*/
static bool
-range_search(const pg_unicode_range *tbl, size_t size, pg_wchar code)
+range_search(const pg_unicode_range *tbl, size_t size, char32_t code)
{
int min = 0;
int mid;
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c
index 6654b4cbc49..489d99cd5ab 100644
--- a/src/common/unicode_norm.c
+++ b/src/common/unicode_norm.c
@@ -69,7 +69,7 @@ conv_compare(const void *p1, const void *p2)
* lookup, while the frontend version uses a binary search.
*/
static const pg_unicode_decomposition *
-get_code_entry(pg_wchar code)
+get_code_entry(char32_t code)
{
#ifndef FRONTEND
int h;
@@ -109,7 +109,7 @@ get_code_entry(pg_wchar code)
* Get the combining class of the given codepoint.
*/
static uint8
-get_canonical_class(pg_wchar code)
+get_canonical_class(char32_t code)
{
const pg_unicode_decomposition *entry = get_code_entry(code);
@@ -130,15 +130,15 @@ get_canonical_class(pg_wchar code)
* Note: the returned pointer can point to statically allocated buffer, and
* is only valid until next call to this function!
*/
-static const pg_wchar *
+static const char32_t *
get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
{
- static pg_wchar x;
+ static char32_t x;
if (DECOMPOSITION_IS_INLINE(entry))
{
Assert(DECOMPOSITION_SIZE(entry) == 1);
- x = (pg_wchar) entry->dec_index;
+ x = (char32_t) entry->dec_index;
*dec_size = 1;
return &x;
}
@@ -156,7 +156,7 @@ get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
* are, in turn, decomposable.
*/
static int
-get_decomposed_size(pg_wchar code, bool compat)
+get_decomposed_size(char32_t code, bool compat)
{
const pg_unicode_decomposition *entry;
int size = 0;
@@ -318,7 +318,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
* in the array result.
*/
static void
-decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
+decompose_code(char32_t code, bool compat, char32_t **result, int *current)
{
const pg_unicode_decomposition *entry;
int i;
@@ -337,7 +337,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
v,
tindex,
sindex;
- pg_wchar *res = *result;
+ char32_t *res = *result;
sindex = code - SBASE;
l = LBASE + sindex / (VCOUNT * TCOUNT);
@@ -369,7 +369,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
(!compat && DECOMPOSITION_IS_COMPAT(entry)))
{
- pg_wchar *res = *result;
+ char32_t *res = *result;
res[*current] = code;
(*current)++;
@@ -382,7 +382,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
decomp = get_code_decomposition(entry, &dec_size);
for (i = 0; i < dec_size; i++)
{
- pg_wchar lcode = (pg_wchar) decomp[i];
+ char32_t lcode = (char32_t) decomp[i];
/* Leave if no more decompositions */
decompose_code(lcode, compat, result, current);
@@ -398,17 +398,17 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
* malloc. Or NULL if we run out of memory. In backend, the returned
* string is palloc'd instead, and OOM is reported with ereport().
*/
-pg_wchar *
-unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
+char32_t *
+unicode_normalize(UnicodeNormalizationForm form, const char32_t *input)
{
bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
- pg_wchar *decomp_chars;
- pg_wchar *recomp_chars;
+ char32_t *decomp_chars;
+ char32_t *recomp_chars;
int decomp_size,
current_size;
int count;
- const pg_wchar *p;
+ const char32_t *p;
/* variables for recomposition */
int last_class;
@@ -425,7 +425,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
for (p = input; *p; p++)
decomp_size += get_decomposed_size(*p, compat);
- decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
+ decomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
if (decomp_chars == NULL)
return NULL;
@@ -448,9 +448,9 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
*/
for (count = 1; count < decomp_size; count++)
{
- pg_wchar prev = decomp_chars[count - 1];
- pg_wchar next = decomp_chars[count];
- pg_wchar tmp;
+ char32_t prev = decomp_chars[count - 1];
+ char32_t next = decomp_chars[count];
+ char32_t tmp;
const uint8 prevClass = get_canonical_class(prev);
const uint8 nextClass = get_canonical_class(next);
@@ -487,7 +487,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
* longer than the decomposed one, so make the allocation of the output
* string based on that assumption.
*/
- recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
+ recomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
if (!recomp_chars)
{
FREE(decomp_chars);
@@ -501,9 +501,9 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
for (count = 1; count < decomp_size; count++)
{
- pg_wchar ch = decomp_chars[count];
+ char32_t ch = decomp_chars[count];
int ch_class = get_canonical_class(ch);
- pg_wchar composite;
+ char32_t composite;
if (last_class < ch_class &&
recompose_code(starter_ch, ch, &composite))
@@ -524,7 +524,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
recomp_chars[target_pos++] = ch;
}
}
- recomp_chars[target_pos] = (pg_wchar) '\0';
+ recomp_chars[target_pos] = (char32_t) '\0';
FREE(decomp_chars);
@@ -540,7 +540,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
#ifndef FRONTEND
static const pg_unicode_normprops *
-qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
+qc_hash_lookup(char32_t ch, const pg_unicode_norminfo *norminfo)
{
int h;
uint32 hashkey;
@@ -571,7 +571,7 @@ qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
* Look up the normalization quick check character property
*/
static UnicodeNormalizationQC
-qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
+qc_is_allowed(UnicodeNormalizationForm form, char32_t ch)
{
const pg_unicode_normprops *found = NULL;
@@ -595,7 +595,7 @@ qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
}
UnicodeNormalizationQC
-unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
+unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const char32_t *input)
{
uint8 lastCanonicalClass = 0;
UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
@@ -610,9 +610,9 @@ unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *
if (form == UNICODE_NFD || form == UNICODE_NFKD)
return UNICODE_NORM_QC_MAYBE;
- for (const pg_wchar *p = input; *p; p++)
+ for (const char32_t *p = input; *p; p++)
{
- pg_wchar ch = *p;
+ char32_t ch = *p;
uint8 canonicalClass;
UnicodeNormalizationQC check;
diff --git a/src/fe_utils/mbprint.c b/src/fe_utils/mbprint.c
index eb3eeee9925..abffdbe18a2 100644
--- a/src/fe_utils/mbprint.c
+++ b/src/fe_utils/mbprint.c
@@ -49,20 +49,20 @@ pg_get_utf8_id(void)
*
* No error checks here, c must point to a long-enough string.
*/
-static pg_wchar
+static char32_t
utf8_to_unicode(const unsigned char *c)
{
if ((*c & 0x80) == 0)
- return (pg_wchar) c[0];
+ return (char32_t) c[0];
else if ((*c & 0xe0) == 0xc0)
- return (pg_wchar) (((c[0] & 0x1f) << 6) |
+ return (char32_t) (((c[0] & 0x1f) << 6) |
(c[1] & 0x3f));
else if ((*c & 0xf0) == 0xe0)
- return (pg_wchar) (((c[0] & 0x0f) << 12) |
+ return (char32_t) (((c[0] & 0x0f) << 12) |
((c[1] & 0x3f) << 6) |
(c[2] & 0x3f));
else if ((*c & 0xf8) == 0xf0)
- return (pg_wchar) (((c[0] & 0x07) << 18) |
+ return (char32_t) (((c[0] & 0x07) << 18) |
((c[1] & 0x3f) << 12) |
((c[2] & 0x3f) << 6) |
(c[3] & 0x3f));
diff --git a/src/include/c.h b/src/include/c.h
index 9ab5e617995..54fa20b0e83 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -513,6 +513,25 @@ typedef void (*pg_funcptr_t) (void);
#include <stdbool.h>
+/*
+ * char16_t and char32_t
+ * Unicode code points.
+ */
+#ifndef __cplusplus
+#ifdef HAVE_UCHAR_H
+#include <uchar.h>
+#ifndef __STDC_UTF_16__
+#error "char16_t must use UTF-16 encoding"
+#endif
+#ifndef __STDC_UTF_32__
+#error "char32_t must use UTF-32 encoding"
+#endif
+#else
+typedef uint16_t char16_t;
+typedef uint32_t char32_t;
+#endif
+#endif
+
/* ----------------------------------------------------------------
* Section 3: standard system types
diff --git a/src/include/common/unicode_case.h b/src/include/common/unicode_case.h
index 41e2c1f4b33..6bcffd349c2 100644
--- a/src/include/common/unicode_case.h
+++ b/src/include/common/unicode_case.h
@@ -14,14 +14,12 @@
#ifndef UNICODE_CASE_H
#define UNICODE_CASE_H
-#include "mb/pg_wchar.h"
-
typedef size_t (*WordBoundaryNext) (void *wbstate);
-pg_wchar unicode_lowercase_simple(pg_wchar code);
-pg_wchar unicode_titlecase_simple(pg_wchar code);
-pg_wchar unicode_uppercase_simple(pg_wchar code);
-pg_wchar unicode_casefold_simple(pg_wchar code);
+char32_t unicode_lowercase_simple(char32_t code);
+char32_t unicode_titlecase_simple(char32_t code);
+char32_t unicode_uppercase_simple(char32_t code);
+char32_t unicode_casefold_simple(char32_t code);
size_t unicode_strlower(char *dst, size_t dstsize, const char *src,
ssize_t srclen, bool full);
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src,
diff --git a/src/include/common/unicode_case_table.h b/src/include/common/unicode_case_table.h
index d5311786582..0a14fb2d97b 100644
--- a/src/include/common/unicode_case_table.h
+++ b/src/include/common/unicode_case_table.h
@@ -18,7 +18,6 @@
*/
#include "common/unicode_case.h"
-#include "mb/pg_wchar.h"
/*
* The maximum number of codepoints that can result from case mapping
@@ -45,7 +44,7 @@ typedef enum
typedef struct
{
int16 conditions;
- pg_wchar map[NCaseKind][MAX_CASE_EXPANSION];
+ char32_t map[NCaseKind][MAX_CASE_EXPANSION];
} pg_special_case;
/*
@@ -166,7 +165,7 @@ static const pg_special_case special_case[106] =
* The entry case_map_lower[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_lower[1704] =
+static const char32_t case_map_lower[1704] =
{
0x000000, /* reserved */
0x000000, /* U+000000 */
@@ -1879,7 +1878,7 @@ static const pg_wchar case_map_lower[1704] =
* The entry case_map_title[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_title[1704] =
+static const char32_t case_map_title[1704] =
{
0x000000, /* reserved */
0x000000, /* U+000000 */
@@ -3592,7 +3591,7 @@ static const pg_wchar case_map_title[1704] =
* The entry case_map_upper[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_upper[1704] =
+static const char32_t case_map_upper[1704] =
{
0x000000, /* reserved */
0x000000, /* U+000000 */
@@ -5305,7 +5304,7 @@ static const pg_wchar case_map_upper[1704] =
* The entry case_map_fold[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_fold[1704] =
+static const char32_t case_map_fold[1704] =
{
0x000000, /* reserved */
0x000000, /* U+000000 */
@@ -13522,7 +13521,7 @@ static const uint16 case_map[4778] =
* the offset into the mapping tables.
*/
static inline uint16
-case_index(pg_wchar cp)
+case_index(char32_t cp)
{
/* Fast path for codepoints < 0x0588 */
if (cp < 0x0588)
diff --git a/src/include/common/unicode_category.h b/src/include/common/unicode_category.h
index 8fd8b67a416..684143d3c8a 100644
--- a/src/include/common/unicode_category.h
+++ b/src/include/common/unicode_category.h
@@ -14,8 +14,6 @@
#ifndef UNICODE_CATEGORY_H
#define UNICODE_CATEGORY_H
-#include "mb/pg_wchar.h"
-
/*
* Unicode General Category Values
*
@@ -61,31 +59,31 @@ typedef enum pg_unicode_category
PG_U_FINAL_PUNCTUATION = 29 /* Pf */
} pg_unicode_category;
-extern pg_unicode_category unicode_category(pg_wchar code);
+extern pg_unicode_category unicode_category(char32_t code);
extern const char *unicode_category_string(pg_unicode_category category);
extern const char *unicode_category_abbrev(pg_unicode_category category);
-extern bool pg_u_prop_alphabetic(pg_wchar code);
-extern bool pg_u_prop_lowercase(pg_wchar code);
-extern bool pg_u_prop_uppercase(pg_wchar code);
-extern bool pg_u_prop_cased(pg_wchar code);
-extern bool pg_u_prop_case_ignorable(pg_wchar code);
-extern bool pg_u_prop_white_space(pg_wchar code);
-extern bool pg_u_prop_hex_digit(pg_wchar code);
-extern bool pg_u_prop_join_control(pg_wchar code);
+extern bool pg_u_prop_alphabetic(char32_t code);
+extern bool pg_u_prop_lowercase(char32_t code);
+extern bool pg_u_prop_uppercase(char32_t code);
+extern bool pg_u_prop_cased(char32_t code);
+extern bool pg_u_prop_case_ignorable(char32_t code);
+extern bool pg_u_prop_white_space(char32_t code);
+extern bool pg_u_prop_hex_digit(char32_t code);
+extern bool pg_u_prop_join_control(char32_t code);
-extern bool pg_u_isdigit(pg_wchar code, bool posix);
-extern bool pg_u_isalpha(pg_wchar code);
-extern bool pg_u_isalnum(pg_wchar code, bool posix);
-extern bool pg_u_isword(pg_wchar code);
-extern bool pg_u_isupper(pg_wchar code);
-extern bool pg_u_islower(pg_wchar code);
-extern bool pg_u_isblank(pg_wchar code);
-extern bool pg_u_iscntrl(pg_wchar code);
-extern bool pg_u_isgraph(pg_wchar code);
-extern bool pg_u_isprint(pg_wchar code);
-extern bool pg_u_ispunct(pg_wchar code, bool posix);
-extern bool pg_u_isspace(pg_wchar code);
-extern bool pg_u_isxdigit(pg_wchar code, bool posix);
+extern bool pg_u_isdigit(char32_t code, bool posix);
+extern bool pg_u_isalpha(char32_t code);
+extern bool pg_u_isalnum(char32_t code, bool posix);
+extern bool pg_u_isword(char32_t code);
+extern bool pg_u_isupper(char32_t code);
+extern bool pg_u_islower(char32_t code);
+extern bool pg_u_isblank(char32_t code);
+extern bool pg_u_iscntrl(char32_t code);
+extern bool pg_u_isgraph(char32_t code);
+extern bool pg_u_isprint(char32_t code);
+extern bool pg_u_ispunct(char32_t code, bool posix);
+extern bool pg_u_isspace(char32_t code);
+extern bool pg_u_isxdigit(char32_t code, bool posix);
#endif /* UNICODE_CATEGORY_H */
diff --git a/src/include/common/unicode_category_table.h b/src/include/common/unicode_category_table.h
index 95a1c65da7e..466a41b72b0 100644
--- a/src/include/common/unicode_category_table.h
+++ b/src/include/common/unicode_category_table.h
@@ -20,15 +20,15 @@
*/
typedef struct
{
- uint32 first; /* Unicode codepoint */
- uint32 last; /* Unicode codepoint */
+ char32_t first; /* Unicode codepoint */
+ char32_t last; /* Unicode codepoint */
uint8 category; /* General Category */
} pg_category_range;
typedef struct
{
- uint32 first; /* Unicode codepoint */
- uint32 last; /* Unicode codepoint */
+ char32_t first; /* Unicode codepoint */
+ char32_t last; /* Unicode codepoint */
} pg_unicode_range;
typedef struct
diff --git a/src/include/common/unicode_norm.h b/src/include/common/unicode_norm.h
index 5bc3b79e78e..516c192cc4c 100644
--- a/src/include/common/unicode_norm.h
+++ b/src/include/common/unicode_norm.h
@@ -14,8 +14,6 @@
#ifndef UNICODE_NORM_H
#define UNICODE_NORM_H
-#include "mb/pg_wchar.h"
-
typedef enum
{
UNICODE_NFC = 0,
@@ -32,8 +30,8 @@ typedef enum
UNICODE_NORM_QC_MAYBE = -1,
} UnicodeNormalizationQC;
-extern pg_wchar *unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input);
+extern char32_t *unicode_normalize(UnicodeNormalizationForm form, const char32_t *input);
-extern UnicodeNormalizationQC unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input);
+extern UnicodeNormalizationQC unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const char32_t *input);
#endif /* UNICODE_NORM_H */
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 4b4a9974b75..4d84bdc81e4 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -532,25 +532,25 @@ typedef uint32 (*utf_local_conversion_func) (uint32 code);
* Some handy functions for Unicode-specific tests.
*/
static inline bool
-is_valid_unicode_codepoint(pg_wchar c)
+is_valid_unicode_codepoint(char32_t c)
{
return (c > 0 && c <= 0x10FFFF);
}
static inline bool
-is_utf16_surrogate_first(pg_wchar c)
+is_utf16_surrogate_first(char32_t c)
{
return (c >= 0xD800 && c <= 0xDBFF);
}
static inline bool
-is_utf16_surrogate_second(pg_wchar c)
+is_utf16_surrogate_second(char32_t c)
{
return (c >= 0xDC00 && c <= 0xDFFF);
}
-static inline pg_wchar
-surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
+static inline char32_t
+surrogate_pair_to_codepoint(char16_t first, char16_t second)
{
return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
}
@@ -561,20 +561,20 @@ surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
*
* No error checks here, c must point to a long-enough string.
*/
-static inline pg_wchar
+static inline char32_t
utf8_to_unicode(const unsigned char *c)
{
if ((*c & 0x80) == 0)
- return (pg_wchar) c[0];
+ return (char32_t) c[0];
else if ((*c & 0xe0) == 0xc0)
- return (pg_wchar) (((c[0] & 0x1f) << 6) |
+ return (char32_t) (((c[0] & 0x1f) << 6) |
(c[1] & 0x3f));
else if ((*c & 0xf0) == 0xe0)
- return (pg_wchar) (((c[0] & 0x0f) << 12) |
+ return (char32_t) (((c[0] & 0x0f) << 12) |
((c[1] & 0x3f) << 6) |
(c[2] & 0x3f));
else if ((*c & 0xf8) == 0xf0)
- return (pg_wchar) (((c[0] & 0x07) << 18) |
+ return (char32_t) (((c[0] & 0x07) << 18) |
((c[1] & 0x3f) << 12) |
((c[2] & 0x3f) << 6) |
(c[3] & 0x3f));
@@ -588,7 +588,7 @@ utf8_to_unicode(const unsigned char *c)
* unicode_utf8len(c) bytes available.
*/
static inline unsigned char *
-unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
+unicode_to_utf8(char32_t c, unsigned char *utf8string)
{
if (c <= 0x7F)
{
@@ -620,7 +620,7 @@ unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
* Number of bytes needed to represent the given char in UTF8.
*/
static inline int
-unicode_utf8len(pg_wchar c)
+unicode_utf8len(char32_t c)
{
if (c <= 0x7F)
return 1;
@@ -676,8 +676,8 @@ extern int pg_valid_server_encoding(const char *name);
extern bool is_encoding_supported_by_icu(int encoding);
extern const char *get_encoding_name_for_icu(int encoding);
-extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string);
-extern pg_wchar utf8_to_unicode(const unsigned char *c);
+extern unsigned char *unicode_to_utf8(char32_t c, unsigned char *utf8string);
+extern char32_t utf8_to_unicode(const unsigned char *c);
extern bool pg_utf8_islegal(const unsigned char *source, int length);
extern int pg_utf_mblen(const unsigned char *s);
extern int pg_mule_mblen(const unsigned char *s);
@@ -739,8 +739,8 @@ extern char *pg_server_to_client(const char *s, int len);
extern char *pg_any_to_server(const char *s, int len, int encoding);
extern char *pg_server_to_any(const char *s, int len, int encoding);
-extern void pg_unicode_to_server(pg_wchar c, unsigned char *s);
-extern bool pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s);
+extern void pg_unicode_to_server(char32_t c, unsigned char *s);
+extern bool pg_unicode_to_server_noerror(char32_t c, unsigned char *s);
extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc);
extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc);
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 43fe3bcd593..ce382c546e2 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -3506,6 +3506,8 @@ cb_cleanup_dir
cb_options
cb_tablespace
cb_tablespace_mapping
+char16_t
+char32_t
check_agg_arguments_context
check_function_callback
check_network_data
--
2.43.0
On Mon, Oct 27, 2025 at 8:43 AM Jeff Davis <pgsql@j-davis.com> wrote:
What would be the problem if it were larger than 32 bits?
Hmm, OK fair question, I can't think of any, I was just working
through the standard and thinking myopically about the exact
definition, but I think it's actually already covered by other things
we assume/require (ie the existence of uint32_t forces the size of
char32_t if you follow the chain of definitions backwards), and as you
say it probably doesn't even matter. I suppose you could also skip
the __STDC_UTF_32__ assertion given that we already make a larger
assumption about wchar_t encoding, and it seems to be exhaustively
established that no implementation fails to conform to C23 for
char32_t (see earlier link to Meneide's blog). I don't personally
understand what C11 was smoking when it left that unspecified for
another 12 years.
I wonder if the XXX_libc_mb() functions that contain our hard-coded
assumptions that libc's wchar_t values are in UTF-16 or UTF-32 should
use your to_char32_t() too (probably with a longer name
pg_wchar_to_char32_t() if it's in a header for wider use).
I don't think those functions do depend on UTF-32. iswalpha(), etc.,
take a wint_t, which is just a wchar_t that can also be WEOF.
I was noticing that toupper_libc_mb() directly tests if a pg_wchar
value is in the ASCII range, which only makes sense given knowledge of
pg_wchar's encoding, so perhaps that should trigger this new coding
rule. But I agree that's pretty obscure... feel free to ignore that
suggestion.
Hmm, the comment at the top explains that we apply that special ASCII
treatment for default locales and not non-default locales, but it
doesn't explain *why* we make that distinction. Do you know?
One thing I never understood about this is that it's our code that
converts from the server encoding to pg_wchar (e.g.
pg_latin12wchar_with_len()), so we must understand the representation
of pg_wchar. And we cast directly from pg_wchar to wchar_t, so we
understand the encoding of wchar_t, too, right?
Right, we do know the encoding of pg_wchar in every case (assuming
that all pg_wchar values come from our transcoding routines). We just
don't know if that encoding is also the one used by libc's
locale-sensitive functions that deal in wchar_t, except when the
locale is one that uses UTF-8 for char encoding, in which case we
assume that every libc must surely use Unicode codepoints in wchar_t.
That probably covers the vast majority of real world databases in the
UTF-8 age, and no known system fails to meet this expectation. Of
course the encoding used by every libc for non-UTF-8 locales is
theoretically knowable too, but since they vary and in some cases are
not even documented, it would be too painful to contemplate any
dependency on that.
Let me try to work through this in more detail... corrections
welcome, but this is what I have managed to understand about this
module so far, in my quest to grok PostgreSQL's overall character
encoding model (and holes therein):
For locales that use UTF-8 for char, we expect libc to understand
pg_wchar/wchar_t/wint_t values as UTF-32 or at a stretch UTF-16. The
expected source of these pg_wchar values is our various regexp code
paths that will use our mbutils pg_wchar conversion to UTF-32, with a
reasonable copying strategy for sizeof(wchar_t) == 2 (that's Windows
and I think otherwise only AIX in 32 bit builds, if it comes back).
If any libc didn't use Unicode codepoints in its locale-sensitive
wchar_t functions for UTF-8 locales we'd get garbage results, but we
don't know of any such system. It's a bit of a shame that C11 didn't
introduce the obvious isualpha(char32_t) variants for a
standard-supported version of that realpolitik we depend on, but
perhaps one day...
There is one minor quirk here that it might be nice to document in top
comment section 2: on Windows we also expect wchar_t to be understood
by system wctype functions as UTF-16 for locales that *don't* use
UTF-8 for char (an assumption that definitely doesn't hold on many
Unixen). That is important because on Windows we allow non-UTF-8
locales to be used in UTF-8 databases for historical reasons.
For single-byte encodings: pg_latin12wchar_with_len() just
zero-extends the bytes to pg_wchar, so when the pg_locale_libc.c
functions truncate them and call 8-bit ctype stuff eg isalpha_l(), it
completes a perfect round trip inside our code. (BTW
pg_latin12wchar_with_len() has the same definition as
pg_ascii2wchar_with_len(), and is used for many single-byte encodings
other than LATIN1 which makes me wonder why we don't just have a
single function pg_char2wchar_with_len() that is used by all "simple
widening" cases.) We never know or care which encoding libc would
itself use for these locales' wchar_t, as we don't ever pass it a
wchar_t. Assuming I understood that correctly, I think it would be
nice if the "100% correct for LATINn" comment stated the reason for
that certainty explicitly, ie that it closes an information-preserving
round-trip beginning with the coercion in pg_latin12wchar_with_len()
and that libc never receives a wchar_t/wint_t that we fabricated.
A bit of a digression, which I *think* is out-of-scope for this
module, but just while I'm working through all the implications: This
could produce unspecified results if a wchar_t from another source
ever arrived into these functions eg wchar_t made by libc or
L"literal" made by the compiler, both unspecified. In practice, a
wchar_t of non-PostgreSQL origin that is truncated to 8 bits would
probably still give a sensible result for codepoints 0-127 (= 7 bit
subset of Unicode, and we require all server encodings to be supersets
of ASCII), and 0-255 for LATIN1 (= 8 bit subset of Unicode), because:
the two main approaches to single-byte char -> wchar_t conversion in
libc implementations seem to be conversion to Unicode (Windows,
glibc?), and simply casting char to wchar_t (I think this is probably
what *BSD and Solaris do for single-byte non-UTF-8 locales leading to
the complaint that wchar_t encoding is locale-dependent on those
systems, though I haven't checked in detail, and that's of course also
exactly what our own conversion does), so I think that means that
128-255 would give nonsense results for non-LATIN1 single byte
encodings on Windows or glibc (?) but perhaps not other Unixen. For
example, take ISO 8859-7, the legacy single byte encoding for Greek:
it encodes α as 0xe1, and Windows and glibc (?) would presumably
encode that as (wchar_t) 0x03b1 (the Unicode codepoint), and then
wc_isalpha_libc_sb() would truncate that to 0xb1 which is ± in ISO
8859-7, so isalpha_l() would return false, despite α being the OG
alpha (not tested, just a thought experiment looking at tables). But
since handling pg_wchar of non-PostgreSQL origin doesn't seem to be
one of our goals, there is no problem to fix here, it might just be
worthy of a note in that commentary: we don't try to deal with wchar_t
values not made by PostgreSQL, except where noted (non-escaping uses
of char2wchar() in controlled scopes).
For multi-byte encodings other than UTF-8, pg_locale_libc.c is
basically giving up almost completely, but could probably be tightened
up. I can't imagine we'll ever add another multibyte encoding, and I
believe we can ignore MULE internal, as no libc supports it (so you
could only get here with the C locale where you'll get the garbage
results you asked for... in fact I wonder why we need MULE internal at
all... it seems to be a sort of double-encoding for multiplexing other
encodings, so we can't exactly say it's not blessed by a standard,
it's indirectly defined by "all the standards" in a sense, but it's
also entirely obsoleted by Unicode's unification so I don't know what
problem it solves for anyone, or if anyone ever needed it in any
reasonable pg_upgrade window of history...). Of server-supported
encodings, that leaves only EUC_* to think about.
The EUC family has direct encoding of 7-bit ASCII and then 3
selectable character sets represented by sequences with the high bit
set, with details varying between the Chinese (simplified Chinese),
Taiwanese (traditional Chinese), Japanese (2 kinds) and Korean
variants. I don't know if the pg_wchar encoding we're producing in
pg_euc*2wchar_with_len() has a name, but it doesn't appear to match
the description of the standard "fixed" representation on the
Wikipedia page for Extended Unix Code (it's too wide for starters,
looking at the shift distances). The main thing seems to be that we
simply zero-extend the ASCII range into a pg_wchar directly, so when
we cast it down to call 8-bit ctype functions, I expect we produce
correct results for ASCII characters... and then I don't know what but
I guess nothing good for 128-255, and then surely hot garbage for
everything else, cycling through the 0-255 answers repeatedly as we
climb the pg_wchar value range. The key point being that it's *not* a
perfect information-preserving round-trip, as we achieve for
single-byte encodings. Some ideas for improvements:
1. Cheap but incomplete: use a different ctype method table that
short-circuits the results (false for isalpha et al, pass-through for
upper/lower) for pg_wchar >= 128 and uses the existing 8-bit ctype
functions for ASCII.
2. More expensive but complete: handle ASCII range with existing
8-bit ctype functions, and otherwise convert our pg_wchar back to MB
char format and then use libc's mbstowcs_l() to make a wchar_t that
libc's wchar_t-based functions should understand. To avoid doing hard
work for nothing (ideogram-based languages generally don't care about
ctype stuff so that'd be the vast majority of characters appearing in
Chinese/Japanese/Korean text) at the cost of having to do a bunch of
research, we could short-circuit the core CJK character ranges,
and do the extra CPU cycles for the rest, to catch the Latin +
accents, Greek, Cyrillic characters that are also supported in these
encodings for foreign names, variables in scientific language etc. I
guess that implies a classifier that would be associated with ... the
encoding? That would of course break if wchar_t values of
non-PostgreSQL origin arrive here, but see above note about nailing
down a contract that formally excludes that outside narrow
non-escaping sites.
3. I assume there are some good reasons we don't do this but... if we
used char2wchar() in the first place (= libc native wchar_t) for the
regexp stuff that calls this stuff (as we do already inside
whole-string upper/lower, just not character upper/lower or character
classification), then we could simply call the wchar_t libc functions
directly and unconditionally in the libc provider for all cases,
instead of the 8-bit variants with broken edge cases for non-UTF-8
databases. I didn't try to find the historical discussions, but I can
imagine already that we might not have done that because it has to
copy to cope with non-NULL-terminated strings, might perhaps have
weird incompatibilities with our own multibyte sequence detection,
might be slower (and/or might have been unusably broken ancient
libcs?), and it would only be appropriate for libc locales anyway and
yet now we have other locale providers that certainly don't want some
unspecified wchar_t encoding or libc involved. It's also likely that
non-UTF-8 systems are of dwindling interest to anyone outside perhaps
client encodings (hence my attempt to ram home some simplifying
assumptions about that in that project to nail down some rules where
the encoding is fuzzy that I mentioned in a thread from a few months
ago). So I'm not seriously suggesting this, just thinking out loud
about the corner we've painted ourselves into where idea #2's multiple
transcoding steps would be necessary to get the "right" answer for any
character in these encodings. Hnngh.
In passing, I wonder why _libc.c has that comment about ICU in
parentheses. Not relevant here. I haven't thought much about whether
it's relevant in the ICU provider code (it may come back to that
do-we-accept-pg_wchar-we-didn't-make? question), but if it is then it
also applies to Windows and probably glibc in the libc provider and I
don't immediately see any problem (assuming no-we-don't! answer).
The EUC family has direct encoding of 7-bit ASCII and then 3
selectable character sets represented by sequences with the high bit
set, with details varying between the Chinese (simplified Chinese),
Taiwanese (traditional Chinese), Japanese (2 kinds) and Korean
variants. I don't know if the pg_wchar encoding we're producing in
pg_euc*2wchar_with_len() has a name, but it doesn't appear to match
the description of the standard "fixed" representation on the
Wikipedia page for Extended Unix Code (it's too wide for starters,
looking at the shift distances).
Yes. pg_euc*2wchar_with_len() creates "variable length" representation
of EUC, 1 byte to 4 bytes range per character. Then, expands each
character into pg_wchar. Also it can be converted back to the
multibyte representation easily.
Note that the standard "fixed" representation of EUC includes
ASCII-range bytes in *non*-ASCII characters, so I think it is not easy
to use as a backend-safe encoding.
Best regards,
--
Tatsuo Ishii
SRA OSS K.K.
English: http://www.sraoss.co.jp/index_en/
Japanese:http://www.sraoss.co.jp
On Tue, 2025-10-28 at 15:40 +1300, Thomas Munro wrote:
I was noticing that toupper_libc_mb() directly tests if a pg_wchar
value is in the ASCII range, which only makes sense given knowledge
of
pg_wchar's encoding, so perhaps that should trigger this new coding
rule. But I agree that's pretty obscure... feel free to ignore that
suggestion.
I'm not sure that casting it to char32_t would be an improvement there.
Perhaps if we can find some ways to generally clarify things (some of
which you suggest below), that could be part of a follow-up.
It looks like the current patch is a step in the right direction, so
I'll commit that soon and see what the buildfarm says.
Hmm, the comment at the top explains that we apply that special ASCII
treatment for default locales and not non-default locales, but it
doesn't explain *why* we make that distinction. Do you know?
It makes some sense: I suppose someone thought that non-ASCII behavior
in the default locale is just too likely to cause problems. But the
non-ASCII behavior is allowed if you use a COLLATE clause.
But the pattern wasn't followed quite the same way with ICU, which uses
the given locale for UPPER()/LOWER() regardless of whether it's the
default locale or not. And for regexes, ICU doesn't use the locale at
all, it just uses u_isalpha(), etc., even if you use a COLLATE clause.
And there are still some places that call plain tolower()/toupper(),
such as fuzzystrmatch and ltree.
Right, we do know the encoding of pg_wchar in every case (assuming
that all pg_wchar values come from our transcoding routines). We
just
don't know if that encoding is also the one used by libc's
locale-sensitive functions that deal in wchar_t, except when the
locale is one that uses UTF-8 for char encoding, in which case we
assume that every libc must surely use Unicode codepoints in wchar_t.
Ah, right. We create pg_wchars for any encoding, but we only pass a
pg_wchar to a libc multibyte function in the UTF-8 encoding.
(Aside: we do pass pg_wchars directly to ICU as UTF-32 codepoints,
regardless of encoding, which is a bug.)
For locales that use UTF-8 for char, we expect libc to understand
pg_wchar/wchar_t/wint_t values as UTF-32 or at a stretch UTF-16. The
expected source of these pg_wchar values is our various regexp code
paths that will use our mbutils pg_wchar conversion to UTF-32, with a
reasonable copying strategy for sizeof(wchar_t) == 2 (that's Windows
and I think otherwise only AIX in 32 bit builds, if it comes back).
If any libc didn't use Unicode codepoints in its locale-sensitive
wchar_t functions for UTF-8 locales we'd get garbage results, but we
don't know of any such system.
Check.
It's a bit of a shame that C11 didn't
introduce the obvious isualpha(char32_t) variants for a
standard-supported version of that realpolitik we depend on, but
perhaps one day...
Yeah...
There is one minor quirk here that it might be nice to document in
top
comment section 2: on Windows we also expect wchar_t to be understood
by system wctype functions as UTF-16 for locales that *don't* use
UTF-8 for char (an assumption that definitely doesn't hold on many
Unixen). That is important because on Windows we allow non-UTF-8
locales to be used in UTF-8 databases for historical reasons.
Interesting.
For single-byte encodings: pg_latin12wchar_with_len() just
zero-extends the bytes to pg_wchar, so when the pg_locale_libc.c
functions truncate them and call 8-bit ctype stuff eg isalpha_l(), it
completes a perfect round trip inside our code.
So you're saying that pg_wchar is more like a union type?
typedef union pg_wchar
{
char ch; /* single-byte encodings or
non-UTF8 encodings on unix */
char16_t utf16; /* windows non-UTF8 encodings */
char32_t utf32; /* UTF-8 encoding */
} pg_wchar;
(we'd have to be careful about the memory layout if we're casting,
though)
(BTW
pg_latin12wchar_with_len() has the same definition as
pg_ascii2wchar_with_len(), and is used for many single-byte encodings
other than LATIN1 which makes me wonder why we don't just have a
single function pg_char2wchar_with_len() that is used by all "simple
widening" cases.)
Sounds like a nice simplification.
We never know or care which encoding libc would
itself use for these locales' wchar_t, as we don't ever pass it a
wchar_t.
Ah, that makes sense.
Assuming I understood that correctly, I think it would be
nice if the "100% correct for LATINn" comment stated the reason for
that certainty explicitly, ie that it closes an information-
preserving
round-trip beginning with the coercion in pg_latin12wchar_with_len()
and that libc never receives a wchar_t/wint_t that we fabricated.
Agreed, though I think some refactoring would be helpful to accompany
the comment. I've worked with this stuff a lot and I still find it hard
to keep everything in mind at once.
A bit of a digression, which I *think* is out-of-scope for this
module, but just while I'm working through all the implications:
This
could produce unspecified results if a wchar_t from another source
ever arrived into these functions
Ugh.
When I first started dealing with pg_wchar, I assumed it was just a
wider wchar_t to abstract away some of the complexity when
sizeof(wchar_t) == 2 (e.g. get rid of surrogate pairs). It's clearly
more complicated than that.
For multi-byte encodings other than UTF-8, pg_locale_libc.c is
basically giving up almost completely
Right.
I
believe we can ignore MULE internal, as no libc supports it (so you
could only get here with the C locale where you'll get the garbage
results you asked for... in fact I wonder why we need MULE internal at
all... it seems to be a sort of double-encoding for multiplexing
other
encodings, so we can't exactly say it's not blessed by a standard,
it's indirectly defined by "all the standards" in a sense, but it's
also entirely obsoleted by Unicode's unification so I don't know what
problem it solves for anyone, or if anyone ever needed it in any
reasonable pg_upgrade window of history...).
I have never heard of someone using it in production, and I wouldn't
object if someone wants to deprecate it.
2. More expensive but complete: handle ASCII range with existing
8-bit ctype functions, and otherwise convert our pg_wchar back to MB
char format and then use libc's mbstowcs_l() to make a wchar_t that
libc's wchar_t-based functions should understand.
Correct. Sounds painful, but perhaps we could just do it and measure
the performance.
To avoid doing hard
work for nothing (ideogram-based languages generally don't care about
ctype stuff so that'd be the vast majority of characters appearing in
Chinese/Japanese/Korean text) at the cost of having to do a bunch of
research, we could short-circuit the core CJK character
ranges,
and do the extra CPU cycles for the rest,
I don't think we should start making a bunch of assumptions like that.
3. I assume there are some good reasons we don't do this but... if
we
used char2wchar() in the first place (= libc native wchar_t) for the
regexp stuff that calls this stuff (as we do already inside
whole-string upper/lower, just not character upper/lower or character
classification), then we could simply call the wchar_t libc functions
directly and unconditionally in the libc provider for all cases,
instead of the 8-bit variants with broken edge cases for non-UTF-8
databases.
I'm not sure about that either, but I think it's because you can end up
with surrogate pairs, which can't be represented in UTF-8.
I didn't try to find the historical discussions, but I can
imagine already that we might not have done that because it has to
copy to cope with non-NULL-terminated strings,
That's probably another reason.
and it would only be appropriate for libc locales anyway and
yet now we have other locale providers that certainly don't want some
unspecified wchar_t encoding or libc involved.
We could fix that by making some of these APIs take a char pointer
instead. That would allow libc to decode to wchar_t, and other
providers to decode to UTF-32. Or, we could say that pg_wchar is an
opaque type that can only be created by the provider, and passed back
to the same provider.
It's also likely that
non-UTF-8 systems are of dwindling interest to anyone outside perhaps
client encodings
That's been my experience -- haven't run into many non-UTF8 server
encodings.
In passing, I wonder why _libc.c has that comment about ICU in
parentheses. Not relevant here.
I moved it in 4da12e9e2e.
I haven't thought much about whether
it's relevant in the ICU provider code (it may come back to that
do-we-accept-pg_wchar-we-didn't-make? question), but if it is then it
also applies to Windows and probably glibc in the libc provider and I
don't immediately see any problem (assuming no-we-don't! answer).
It's relevant for the regc_wc_isalpha(), etc. functions:
/messages/by-id/e7b67d24288f811aebada7c33f9ae629dde0def5.camel@j-davis.com
Regards,
Jeff Davis
This patch looks good to me overall, it's a nice improvement in clarity.
On 26.10.25 20:43, Jeff Davis wrote:
+/*
+ * char16_t and char32_t
+ *	  Unicode code points.
+ */
+#ifndef __cplusplus
+#ifdef HAVE_UCHAR_H
+#include <uchar.h>
+#ifndef __STDC_UTF_16__
+#error "char16_t must use UTF-16 encoding"
+#endif
+#ifndef __STDC_UTF_32__
+#error "char32_t must use UTF-32 encoding"
+#endif
+#else
+typedef uint16_t char16_t;
+typedef uint32_t char32_t;
+#endif
+#endif
This could be improved a bit. The reason for some of these conditionals
is not clear. Like, what does __cplusplus have to do with this? I
think it would be more correct to write a configure/meson check for the
actual types rather than depend indirectly on a header check.
The checks for __STDC_UTF_16__ and __STDC_UTF_32__ can be removed, as
was discussed elsewhere, since we don't use any standard library
functions that make use of these facts, and the need goes away with C23
anyway.
On Tue, 2025-10-28 at 19:45 +0100, Peter Eisentraut wrote:
This could be improved a bit. The reason for some of these
conditionals
is not clear. Like, what does __cplusplus have to do with this? I
think it would be more correct to write a configure/meson check for
the
actual types rather than depend indirectly on a header check.
Fixed, thank you.
The checks for __STDC_UTF_16__ and __STDC_UTF_32__ can be removed, as
was discussed elsewhere, since we don't use any standard library
functions that make use of these facts, and the need goes away with
C23
anyway.
Removed.
I also made the pg_config.h.in changes and ran autoconf.
Regards,
Jeff Davis
Attachments:
v3-0001-Use-C11-char16_t-and-char32_t-for-Unicode-code-po.patchtext/x-patch; charset=UTF-8; name=v3-0001-Use-C11-char16_t-and-char32_t-for-Unicode-code-po.patchDownload
From ef398d17fe36afdecc55390cf0f35d24a57e7fdf Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Tue, 21 Oct 2025 13:16:47 -0700
Subject: [PATCH v3] Use C11 char16_t and char32_t for Unicode code points.
Reviewed-by: Tatsuo Ishii <ishii@postgresql.org>
Reviewed-by: Thomas Munro <thomas.munro@gmail.com>
Reviewed-by: Peter Eisentraut <peter@eisentraut.org>
Discussion: https://postgr.es/m/bedcc93d06203dfd89815b10f815ca2de8626e85.camel%40j-davis.com
---
configure | 22 +++++++-
configure.ac | 3 +
meson.build | 11 ++++
src/backend/parser/parser.c | 8 +--
src/backend/parser/scan.l | 8 +--
src/backend/utils/adt/jsonpath_scan.l | 6 +-
src/backend/utils/adt/pg_locale_builtin.c | 44 ++++++++++-----
src/backend/utils/adt/varlena.c | 40 ++++++-------
src/backend/utils/mb/mbutils.c | 4 +-
src/common/saslprep.c | 48 ++++++++--------
src/common/unicode/case_test.c | 23 ++++----
src/common/unicode/category_test.c | 3 +-
.../unicode/generate-norm_test_table.pl | 4 +-
.../unicode/generate-unicode_case_table.pl | 7 +--
.../generate-unicode_category_table.pl | 8 +--
src/common/unicode/norm_test.c | 6 +-
src/common/unicode_case.c | 56 +++++++++----------
src/common/unicode_category.c | 50 ++++++++---------
src/common/unicode_norm.c | 56 +++++++++----------
src/fe_utils/mbprint.c | 10 ++--
src/include/c.h | 13 +++++
src/include/common/unicode_case.h | 10 ++--
src/include/common/unicode_case_table.h | 13 ++---
src/include/common/unicode_category.h | 46 ++++++++-------
src/include/common/unicode_category_table.h | 8 +--
src/include/common/unicode_norm.h | 6 +-
src/include/mb/pg_wchar.h | 32 +++++------
src/include/pg_config.h.in | 9 +++
src/tools/pgindent/typedefs.list | 2 +
29 files changed, 312 insertions(+), 244 deletions(-)
diff --git a/configure b/configure
index 22cd866147b..0f03b92ee37 100755
--- a/configure
+++ b/configure
@@ -13627,7 +13627,7 @@ fi
## Header files
##
-for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h mbarrier.h sys/epoll.h sys/event.h sys/personality.h sys/prctl.h sys/procctl.h sys/signalfd.h sys/ucred.h termios.h ucred.h xlocale.h
+for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h mbarrier.h sys/epoll.h sys/event.h sys/personality.h sys/prctl.h sys/procctl.h sys/signalfd.h sys/ucred.h termios.h uchar.h ucred.h xlocale.h
do :
as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default"
@@ -14996,6 +14996,26 @@ cat >>confdefs.h <<_ACEOF
_ACEOF
+fi
+
+ac_fn_c_check_type "$LINENO" "char16_t" "ac_cv_type_char16_t" "$ac_includes_default"
+if test "x$ac_cv_type_char16_t" = xyes; then :
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE_CHAR16_T 1
+_ACEOF
+
+
+fi
+
+ac_fn_c_check_type "$LINENO" "char32_t" "ac_cv_type_char32_t" "$ac_includes_default"
+if test "x$ac_cv_type_char32_t" = xyes; then :
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE_CHAR32_T 1
+_ACEOF
+
+
fi
diff --git a/configure.ac b/configure.ac
index e44943aa6fe..9e677d37dce 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1513,6 +1513,7 @@ AC_CHECK_HEADERS(m4_normalize([
sys/signalfd.h
sys/ucred.h
termios.h
+ uchar.h
ucred.h
xlocale.h
]))
@@ -1684,6 +1685,8 @@ PGAC_STRUCT_TIMEZONE
PGAC_UNION_SEMUN
AC_CHECK_TYPES(socklen_t, [], [], [#include <sys/socket.h>])
PGAC_STRUCT_SOCKADDR_SA_LEN
+AC_CHECK_TYPES(char16_t, [], [], [])
+AC_CHECK_TYPES(char32_t, [], [], [])
# MSVC doesn't cope well with defining restrict to __restrict, the
# spelling it understands, because it conflicts with
diff --git a/meson.build b/meson.build
index 395416a6060..a60635ff6a0 100644
--- a/meson.build
+++ b/meson.build
@@ -2613,6 +2613,7 @@ header_checks = [
'sys/signalfd.h',
'sys/ucred.h',
'termios.h',
+ 'uchar.h',
'ucred.h',
'xlocale.h',
]
@@ -2720,6 +2721,16 @@ if cc.has_type('socklen_t',
cdata.set('HAVE_SOCKLEN_T', 1)
endif
+if cc.has_type('char16_t',
+ args: test_c_args, include_directories: postgres_inc)
+ cdata.set('HAVE_CHAR16_T', 1)
+endif
+
+if cc.has_type('char32_t',
+ args: test_c_args, include_directories: postgres_inc)
+ cdata.set('HAVE_CHAR32_T', 1)
+endif
+
if cc.has_member('struct sockaddr', 'sa_len',
args: test_c_args, include_directories: postgres_inc,
prefix: '''
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
index 33a040506b4..a3679f8e86c 100644
--- a/src/backend/parser/parser.c
+++ b/src/backend/parser/parser.c
@@ -339,7 +339,7 @@ hexval(unsigned char c)
/* is Unicode code point acceptable? */
static void
-check_unicode_value(pg_wchar c)
+check_unicode_value(char32_t c)
{
if (!is_valid_unicode_codepoint(c))
ereport(ERROR,
@@ -376,7 +376,7 @@ str_udeescape(const char *str, char escape,
char *new,
*out;
size_t new_len;
- pg_wchar pair_first = 0;
+ char16_t pair_first = 0;
ScannerCallbackState scbstate;
/*
@@ -420,7 +420,7 @@ str_udeescape(const char *str, char escape,
isxdigit((unsigned char) in[3]) &&
isxdigit((unsigned char) in[4]))
{
- pg_wchar unicode;
+ char32_t unicode;
unicode = (hexval(in[1]) << 12) +
(hexval(in[2]) << 8) +
@@ -457,7 +457,7 @@ str_udeescape(const char *str, char escape,
isxdigit((unsigned char) in[6]) &&
isxdigit((unsigned char) in[7]))
{
- pg_wchar unicode;
+ char32_t unicode;
unicode = (hexval(in[2]) << 20) +
(hexval(in[3]) << 16) +
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index 08990831fe8..a67815339b7 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -121,7 +121,7 @@ static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
static char *litbufdup(core_yyscan_t yyscanner);
static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner);
static int process_integer_literal(const char *token, YYSTYPE *lval, int base);
-static void addunicode(pg_wchar c, yyscan_t yyscanner);
+static void addunicode(char32_t c, yyscan_t yyscanner);
#define yyerror(msg) scanner_yyerror(msg, yyscanner)
@@ -640,7 +640,7 @@ other .
addlit(yytext, yyleng, yyscanner);
}
<xe>{xeunicode} {
- pg_wchar c = strtoul(yytext + 2, NULL, 16);
+ char32_t c = strtoul(yytext + 2, NULL, 16);
/*
* For consistency with other productions, issue any
@@ -668,7 +668,7 @@ other .
POP_YYLLOC();
}
<xeu>{xeunicode} {
- pg_wchar c = strtoul(yytext + 2, NULL, 16);
+ char32_t c = strtoul(yytext + 2, NULL, 16);
/* Remember start of overall string token ... */
PUSH_YYLLOC();
@@ -1376,7 +1376,7 @@ process_integer_literal(const char *token, YYSTYPE *lval, int base)
}
static void
-addunicode(pg_wchar c, core_yyscan_t yyscanner)
+addunicode(char32_t c, core_yyscan_t yyscanner)
{
ScannerCallbackState scbstate;
char buf[MAX_UNICODE_EQUIVALENT_STRING + 1];
diff --git a/src/backend/utils/adt/jsonpath_scan.l b/src/backend/utils/adt/jsonpath_scan.l
index c7aab83eeb4..8c3a0a9c642 100644
--- a/src/backend/utils/adt/jsonpath_scan.l
+++ b/src/backend/utils/adt/jsonpath_scan.l
@@ -574,7 +574,7 @@ hexval(char c, int *result, struct Node *escontext, yyscan_t yyscanner)
/* Add given unicode character to scanstring */
static bool
-addUnicodeChar(int ch, struct Node *escontext, yyscan_t yyscanner)
+addUnicodeChar(char32_t ch, struct Node *escontext, yyscan_t yyscanner)
{
if (ch == 0)
{
@@ -607,7 +607,7 @@ addUnicodeChar(int ch, struct Node *escontext, yyscan_t yyscanner)
/* Add unicode character, processing any surrogate pairs */
static bool
-addUnicode(int ch, int *hi_surrogate, struct Node *escontext, yyscan_t yyscanner)
+addUnicode(char32_t ch, int *hi_surrogate, struct Node *escontext, yyscan_t yyscanner)
{
if (is_utf16_surrogate_first(ch))
{
@@ -655,7 +655,7 @@ parseUnicode(char *s, int l, struct Node *escontext, yyscan_t yyscanner)
for (i = 2; i < l; i += 2) /* skip '\u' */
{
- int ch = 0;
+ char32_t ch = 0;
int j,
si;
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 3dc611b50e1..1021e0d129b 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -15,7 +15,6 @@
#include "catalog/pg_collation.h"
#include "common/unicode_case.h"
#include "common/unicode_category.h"
-#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "utils/builtins.h"
#include "utils/pg_locale.h"
@@ -35,6 +34,23 @@ struct WordBoundaryState
bool prev_alnum;
};
+/*
+ * In UTF-8, pg_wchar is guaranteed to be the code point value.
+ */
+static inline char32_t
+to_char32(pg_wchar wc)
+{
+ Assert(GetDatabaseEncoding() == PG_UTF8);
+ return (char32_t) wc;
+}
+
+static inline pg_wchar
+to_pg_wchar(char32_t c32)
+{
+ Assert(GetDatabaseEncoding() == PG_UTF8);
+ return (pg_wchar) c32;
+}
+
/*
* Simple word boundary iterator that draws boundaries each time the result of
* pg_u_isalnum() changes.
@@ -47,7 +63,7 @@ initcap_wbnext(void *state)
while (wbstate->offset < wbstate->len &&
wbstate->str[wbstate->offset] != '\0')
{
- pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
+ char32_t u = utf8_to_unicode((unsigned char *) wbstate->str +
wbstate->offset);
bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
@@ -112,61 +128,61 @@ strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
static bool
wc_isdigit_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isdigit(wc, !locale->builtin.casemap_full);
+ return pg_u_isdigit(to_char32(wc), !locale->builtin.casemap_full);
}
static bool
wc_isalpha_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isalpha(wc);
+ return pg_u_isalpha(to_char32(wc));
}
static bool
wc_isalnum_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isalnum(wc, !locale->builtin.casemap_full);
+ return pg_u_isalnum(to_char32(wc), !locale->builtin.casemap_full);
}
static bool
wc_isupper_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isupper(wc);
+ return pg_u_isupper(to_char32(wc));
}
static bool
wc_islower_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_islower(wc);
+ return pg_u_islower(to_char32(wc));
}
static bool
wc_isgraph_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isgraph(wc);
+ return pg_u_isgraph(to_char32(wc));
}
static bool
wc_isprint_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isprint(wc);
+ return pg_u_isprint(to_char32(wc));
}
static bool
wc_ispunct_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_ispunct(wc, !locale->builtin.casemap_full);
+ return pg_u_ispunct(to_char32(wc), !locale->builtin.casemap_full);
}
static bool
wc_isspace_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isspace(wc);
+ return pg_u_isspace(to_char32(wc));
}
static bool
wc_isxdigit_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isxdigit(wc, !locale->builtin.casemap_full);
+ return pg_u_isxdigit(to_char32(wc), !locale->builtin.casemap_full);
}
static bool
@@ -179,13 +195,13 @@ char_is_cased_builtin(char ch, pg_locale_t locale)
static pg_wchar
wc_toupper_builtin(pg_wchar wc, pg_locale_t locale)
{
- return unicode_uppercase_simple(wc);
+ return to_pg_wchar(unicode_uppercase_simple(to_char32(wc)));
}
static pg_wchar
wc_tolower_builtin(pg_wchar wc, pg_locale_t locale)
{
- return unicode_lowercase_simple(wc);
+ return to_pg_wchar(unicode_lowercase_simple(to_char32(wc)));
}
static const struct ctype_methods ctype_methods_builtin = {
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 2c398cd9e5c..8d735786e51 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -5419,12 +5419,12 @@ unicode_assigned(PG_FUNCTION_ARGS)
ereport(ERROR,
(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
- /* convert to pg_wchar */
+ /* convert to char32_t */
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
p = (unsigned char *) VARDATA_ANY(input);
for (int i = 0; i < size; i++)
{
- pg_wchar uchar = utf8_to_unicode(p);
+ char32_t uchar = utf8_to_unicode(p);
int category = unicode_category(uchar);
if (category == PG_U_UNASSIGNED)
@@ -5443,24 +5443,24 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
UnicodeNormalizationForm form;
int size;
- pg_wchar *input_chars;
- pg_wchar *output_chars;
+ char32_t *input_chars;
+ char32_t *output_chars;
unsigned char *p;
text *result;
int i;
form = unicode_norm_form_from_string(formstr);
- /* convert to pg_wchar */
+ /* convert to char32_t */
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
- input_chars = palloc((size + 1) * sizeof(pg_wchar));
+ input_chars = palloc((size + 1) * sizeof(char32_t));
p = (unsigned char *) VARDATA_ANY(input);
for (i = 0; i < size; i++)
{
input_chars[i] = utf8_to_unicode(p);
p += pg_utf_mblen(p);
}
- input_chars[i] = (pg_wchar) '\0';
+ input_chars[i] = (char32_t) '\0';
Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
/* action */
@@ -5468,7 +5468,7 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
/* convert back to UTF-8 string */
size = 0;
- for (pg_wchar *wp = output_chars; *wp; wp++)
+ for (char32_t *wp = output_chars; *wp; wp++)
{
unsigned char buf[4];
@@ -5480,7 +5480,7 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
SET_VARSIZE(result, size + VARHDRSZ);
p = (unsigned char *) VARDATA_ANY(result);
- for (pg_wchar *wp = output_chars; *wp; wp++)
+ for (char32_t *wp = output_chars; *wp; wp++)
{
unicode_to_utf8(*wp, p);
p += pg_utf_mblen(p);
@@ -5509,8 +5509,8 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
UnicodeNormalizationForm form;
int size;
- pg_wchar *input_chars;
- pg_wchar *output_chars;
+ char32_t *input_chars;
+ char32_t *output_chars;
unsigned char *p;
int i;
UnicodeNormalizationQC quickcheck;
@@ -5519,16 +5519,16 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
form = unicode_norm_form_from_string(formstr);
- /* convert to pg_wchar */
+ /* convert to char32_t */
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
- input_chars = palloc((size + 1) * sizeof(pg_wchar));
+ input_chars = palloc((size + 1) * sizeof(char32_t));
p = (unsigned char *) VARDATA_ANY(input);
for (i = 0; i < size; i++)
{
input_chars[i] = utf8_to_unicode(p);
p += pg_utf_mblen(p);
}
- input_chars[i] = (pg_wchar) '\0';
+ input_chars[i] = (char32_t) '\0';
Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
/* quick check (see UAX #15) */
@@ -5542,11 +5542,11 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
output_chars = unicode_normalize(form, input_chars);
output_size = 0;
- for (pg_wchar *wp = output_chars; *wp; wp++)
+ for (char32_t *wp = output_chars; *wp; wp++)
output_size++;
result = (size == output_size) &&
- (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
+ (memcmp(input_chars, output_chars, size * sizeof(char32_t)) == 0);
PG_RETURN_BOOL(result);
}
@@ -5602,7 +5602,7 @@ unistr(PG_FUNCTION_ARGS)
int len;
StringInfoData str;
text *result;
- pg_wchar pair_first = 0;
+ char16_t pair_first = 0;
char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
instr = VARDATA_ANY(input_text);
@@ -5626,7 +5626,7 @@ unistr(PG_FUNCTION_ARGS)
else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
(len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
{
- pg_wchar unicode;
+ char32_t unicode;
int offset = instr[1] == 'u' ? 2 : 1;
unicode = hexval_n(instr + offset, 4);
@@ -5662,7 +5662,7 @@ unistr(PG_FUNCTION_ARGS)
}
else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
{
- pg_wchar unicode;
+ char32_t unicode;
unicode = hexval_n(instr + 2, 6);
@@ -5697,7 +5697,7 @@ unistr(PG_FUNCTION_ARGS)
}
else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
{
- pg_wchar unicode;
+ char32_t unicode;
unicode = hexval_n(instr + 2, 8);
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index 886ecbad871..fb629ed5c8f 100644
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -862,7 +862,7 @@ perform_default_encoding_conversion(const char *src, int len,
* may call this outside any transaction, or in an aborted transaction.
*/
void
-pg_unicode_to_server(pg_wchar c, unsigned char *s)
+pg_unicode_to_server(char32_t c, unsigned char *s)
{
unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
int c_as_utf8_len;
@@ -924,7 +924,7 @@ pg_unicode_to_server(pg_wchar c, unsigned char *s)
* but simply return false on conversion failure.
*/
bool
-pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s)
+pg_unicode_to_server_noerror(char32_t c, unsigned char *s)
{
unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
int c_as_utf8_len;
diff --git a/src/common/saslprep.c b/src/common/saslprep.c
index 97beb47940b..101e8d65a4d 100644
--- a/src/common/saslprep.c
+++ b/src/common/saslprep.c
@@ -47,7 +47,7 @@
/* Prototypes for local functions */
static int codepoint_range_cmp(const void *a, const void *b);
-static bool is_code_in_table(pg_wchar code, const pg_wchar *map, int mapsize);
+static bool is_code_in_table(char32_t code, const char32_t *map, int mapsize);
static int pg_utf8_string_len(const char *source);
/*
@@ -64,7 +64,7 @@ static int pg_utf8_string_len(const char *source);
*
* These are all mapped to the ASCII space character (U+00A0).
*/
-static const pg_wchar non_ascii_space_ranges[] =
+static const char32_t non_ascii_space_ranges[] =
{
0x00A0, 0x00A0,
0x1680, 0x1680,
@@ -79,7 +79,7 @@ static const pg_wchar non_ascii_space_ranges[] =
*
* If any of these appear in the input, they are removed.
*/
-static const pg_wchar commonly_mapped_to_nothing_ranges[] =
+static const char32_t commonly_mapped_to_nothing_ranges[] =
{
0x00AD, 0x00AD,
0x034F, 0x034F,
@@ -114,7 +114,7 @@ static const pg_wchar commonly_mapped_to_nothing_ranges[] =
* tables, so one code might originate from multiple source tables.
* Adjacent ranges have also been merged together, to save space.
*/
-static const pg_wchar prohibited_output_ranges[] =
+static const char32_t prohibited_output_ranges[] =
{
0x0000, 0x001F, /* C.2.1 */
0x007F, 0x00A0, /* C.1.2, C.2.1, C.2.2 */
@@ -155,7 +155,7 @@ static const pg_wchar prohibited_output_ranges[] =
};
/* A.1 Unassigned code points in Unicode 3.2 */
-static const pg_wchar unassigned_codepoint_ranges[] =
+static const char32_t unassigned_codepoint_ranges[] =
{
0x0221, 0x0221,
0x0234, 0x024F,
@@ -556,7 +556,7 @@ static const pg_wchar unassigned_codepoint_ranges[] =
};
/* D.1 Characters with bidirectional property "R" or "AL" */
-static const pg_wchar RandALCat_codepoint_ranges[] =
+static const char32_t RandALCat_codepoint_ranges[] =
{
0x05BE, 0x05BE,
0x05C0, 0x05C0,
@@ -595,7 +595,7 @@ static const pg_wchar RandALCat_codepoint_ranges[] =
};
/* D.2 Characters with bidirectional property "L" */
-static const pg_wchar LCat_codepoint_ranges[] =
+static const char32_t LCat_codepoint_ranges[] =
{
0x0041, 0x005A,
0x0061, 0x007A,
@@ -968,8 +968,8 @@ static const pg_wchar LCat_codepoint_ranges[] =
static int
codepoint_range_cmp(const void *a, const void *b)
{
- const pg_wchar *key = (const pg_wchar *) a;
- const pg_wchar *range = (const pg_wchar *) b;
+ const char32_t *key = (const char32_t *) a;
+ const char32_t *range = (const char32_t *) b;
if (*key < range[0])
return -1; /* less than lower bound */
@@ -980,14 +980,14 @@ codepoint_range_cmp(const void *a, const void *b)
}
static bool
-is_code_in_table(pg_wchar code, const pg_wchar *map, int mapsize)
+is_code_in_table(char32_t code, const char32_t *map, int mapsize)
{
Assert(mapsize % 2 == 0);
if (code < map[0] || code > map[mapsize - 1])
return false;
- if (bsearch(&code, map, mapsize / 2, sizeof(pg_wchar) * 2,
+ if (bsearch(&code, map, mapsize / 2, sizeof(char32_t) * 2,
codepoint_range_cmp))
return true;
else
@@ -1046,8 +1046,8 @@ pg_utf8_string_len(const char *source)
pg_saslprep_rc
pg_saslprep(const char *input, char **output)
{
- pg_wchar *input_chars = NULL;
- pg_wchar *output_chars = NULL;
+ char32_t *input_chars = NULL;
+ char32_t *output_chars = NULL;
int input_size;
char *result;
int result_size;
@@ -1055,7 +1055,7 @@ pg_saslprep(const char *input, char **output)
int i;
bool contains_RandALCat;
unsigned char *p;
- pg_wchar *wp;
+ char32_t *wp;
/* Ensure we return *output as NULL on failure */
*output = NULL;
@@ -1080,10 +1080,10 @@ pg_saslprep(const char *input, char **output)
input_size = pg_utf8_string_len(input);
if (input_size < 0)
return SASLPREP_INVALID_UTF8;
- if (input_size >= MaxAllocSize / sizeof(pg_wchar))
+ if (input_size >= MaxAllocSize / sizeof(char32_t))
goto oom;
- input_chars = ALLOC((input_size + 1) * sizeof(pg_wchar));
+ input_chars = ALLOC((input_size + 1) * sizeof(char32_t));
if (!input_chars)
goto oom;
@@ -1093,7 +1093,7 @@ pg_saslprep(const char *input, char **output)
input_chars[i] = utf8_to_unicode(p);
p += pg_utf_mblen(p);
}
- input_chars[i] = (pg_wchar) '\0';
+ input_chars[i] = (char32_t) '\0';
/*
* The steps below correspond to the steps listed in [RFC3454], Section
@@ -1107,7 +1107,7 @@ pg_saslprep(const char *input, char **output)
count = 0;
for (i = 0; i < input_size; i++)
{
- pg_wchar code = input_chars[i];
+ char32_t code = input_chars[i];
if (IS_CODE_IN_TABLE(code, non_ascii_space_ranges))
input_chars[count++] = 0x0020;
@@ -1118,7 +1118,7 @@ pg_saslprep(const char *input, char **output)
else
input_chars[count++] = code;
}
- input_chars[count] = (pg_wchar) '\0';
+ input_chars[count] = (char32_t) '\0';
input_size = count;
if (input_size == 0)
@@ -1138,7 +1138,7 @@ pg_saslprep(const char *input, char **output)
*/
for (i = 0; i < input_size; i++)
{
- pg_wchar code = input_chars[i];
+ char32_t code = input_chars[i];
if (IS_CODE_IN_TABLE(code, prohibited_output_ranges))
goto prohibited;
@@ -1170,7 +1170,7 @@ pg_saslprep(const char *input, char **output)
contains_RandALCat = false;
for (i = 0; i < input_size; i++)
{
- pg_wchar code = input_chars[i];
+ char32_t code = input_chars[i];
if (IS_CODE_IN_TABLE(code, RandALCat_codepoint_ranges))
{
@@ -1181,12 +1181,12 @@ pg_saslprep(const char *input, char **output)
if (contains_RandALCat)
{
- pg_wchar first = input_chars[0];
- pg_wchar last = input_chars[input_size - 1];
+ char32_t first = input_chars[0];
+ char32_t last = input_chars[input_size - 1];
for (i = 0; i < input_size; i++)
{
- pg_wchar code = input_chars[i];
+ char32_t code = input_chars[i];
if (IS_CODE_IN_TABLE(code, LCat_codepoint_ranges))
goto prohibited;
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index fdfb62e8552..00d4f85e5a5 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -24,6 +24,7 @@
#include "common/unicode_case.h"
#include "common/unicode_category.h"
#include "common/unicode_version.h"
+#include "mb/pg_wchar.h"
/* enough to hold largest source or result string, including NUL */
#define BUFSZ 256
@@ -54,7 +55,7 @@ initcap_wbnext(void *state)
while (wbstate->offset < wbstate->len &&
wbstate->str[wbstate->offset] != '\0')
{
- pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
+ char32_t u = utf8_to_unicode((unsigned char *) wbstate->str +
wbstate->offset);
bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
@@ -77,16 +78,16 @@ initcap_wbnext(void *state)
#ifdef USE_ICU
static void
-icu_test_simple(pg_wchar code)
+icu_test_simple(char32_t code)
{
- pg_wchar lower = unicode_lowercase_simple(code);
- pg_wchar title = unicode_titlecase_simple(code);
- pg_wchar upper = unicode_uppercase_simple(code);
- pg_wchar fold = unicode_casefold_simple(code);
- pg_wchar iculower = u_tolower(code);
- pg_wchar icutitle = u_totitle(code);
- pg_wchar icuupper = u_toupper(code);
- pg_wchar icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
+ char32_t lower = unicode_lowercase_simple(code);
+ char32_t title = unicode_titlecase_simple(code);
+ char32_t upper = unicode_uppercase_simple(code);
+ char32_t fold = unicode_casefold_simple(code);
+ char32_t iculower = u_tolower(code);
+ char32_t icutitle = u_totitle(code);
+ char32_t icuupper = u_toupper(code);
+ char32_t icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
if (lower != iculower || title != icutitle || upper != icuupper ||
fold != icufold)
@@ -172,7 +173,7 @@ test_icu(void)
int successful = 0;
int skipped_mismatch = 0;
- for (pg_wchar code = 0; code <= 0x10ffff; code++)
+ for (char32_t code = 0; code <= 0x10ffff; code++)
{
pg_unicode_category category = unicode_category(code);
diff --git a/src/common/unicode/category_test.c b/src/common/unicode/category_test.c
index 5d37ba39196..1e8c1f7905f 100644
--- a/src/common/unicode/category_test.c
+++ b/src/common/unicode/category_test.c
@@ -22,6 +22,7 @@
#include "common/unicode_category.h"
#include "common/unicode_version.h"
+#include "mb/pg_wchar.h"
static int pg_unicode_version = 0;
#ifdef USE_ICU
@@ -59,7 +60,7 @@ icu_test()
int pg_skipped_codepoints = 0;
int icu_skipped_codepoints = 0;
- for (pg_wchar code = 0; code <= 0x10ffff; code++)
+ for (char32_t code = 0; code <= 0x10ffff; code++)
{
uint8_t pg_category = unicode_category(code);
uint8_t icu_category = u_charType(code);
diff --git a/src/common/unicode/generate-norm_test_table.pl b/src/common/unicode/generate-norm_test_table.pl
index 1b401be9409..1a8b908ff33 100644
--- a/src/common/unicode/generate-norm_test_table.pl
+++ b/src/common/unicode/generate-norm_test_table.pl
@@ -47,8 +47,8 @@ print $OUTPUT <<HEADER;
typedef struct
{
int linenum;
- pg_wchar input[50];
- pg_wchar output[4][50];
+ char32_t input[50];
+ char32_t output[4][50];
} pg_unicode_test;
/* test table */
diff --git a/src/common/unicode/generate-unicode_case_table.pl b/src/common/unicode/generate-unicode_case_table.pl
index 5d9ddd62803..f71eb25c94e 100644
--- a/src/common/unicode/generate-unicode_case_table.pl
+++ b/src/common/unicode/generate-unicode_case_table.pl
@@ -270,7 +270,6 @@ print $OT <<"EOS";
*/
#include "common/unicode_case.h"
-#include "mb/pg_wchar.h"
/*
* The maximum number of codepoints that can result from case mapping
@@ -297,7 +296,7 @@ typedef enum
typedef struct
{
int16 conditions;
- pg_wchar map[NCaseKind][MAX_CASE_EXPANSION];
+ char32_t map[NCaseKind][MAX_CASE_EXPANSION];
} pg_special_case;
/*
@@ -430,7 +429,7 @@ foreach my $kind ('lower', 'title', 'upper', 'fold')
* The entry case_map_${kind}[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_$kind\[$index\] =
+static const char32_t case_map_$kind\[$index\] =
{
EOS
@@ -502,7 +501,7 @@ print $OT <<"EOS";
* the offset into the mapping tables.
*/
static inline uint16
-case_index(pg_wchar cp)
+case_index(char32_t cp)
{
/* Fast path for codepoints < $fastpath_limit */
if (cp < $fastpath_limit)
diff --git a/src/common/unicode/generate-unicode_category_table.pl b/src/common/unicode/generate-unicode_category_table.pl
index abab5cd9696..7e094b13720 100644
--- a/src/common/unicode/generate-unicode_category_table.pl
+++ b/src/common/unicode/generate-unicode_category_table.pl
@@ -366,15 +366,15 @@ print $OT <<"EOS";
*/
typedef struct
{
- uint32 first; /* Unicode codepoint */
- uint32 last; /* Unicode codepoint */
+ char32_t first; /* Unicode codepoint */
+ char32_t last; /* Unicode codepoint */
uint8 category; /* General Category */
} pg_category_range;
typedef struct
{
- uint32 first; /* Unicode codepoint */
- uint32 last; /* Unicode codepoint */
+ char32_t first; /* Unicode codepoint */
+ char32_t last; /* Unicode codepoint */
} pg_unicode_range;
typedef struct
diff --git a/src/common/unicode/norm_test.c b/src/common/unicode/norm_test.c
index 25bc59463f2..058817f1719 100644
--- a/src/common/unicode/norm_test.c
+++ b/src/common/unicode/norm_test.c
@@ -20,7 +20,7 @@
#include "norm_test_table.h"
static char *
-print_wchar_str(const pg_wchar *s)
+print_wchar_str(const char32_t *s)
{
#define BUF_DIGITS 50
static char buf[BUF_DIGITS * 11 + 1];
@@ -41,7 +41,7 @@ print_wchar_str(const pg_wchar *s)
}
static int
-pg_wcscmp(const pg_wchar *s1, const pg_wchar *s2)
+pg_wcscmp(const char32_t *s1, const char32_t *s2)
{
for (;;)
{
@@ -65,7 +65,7 @@ main(int argc, char **argv)
{
for (int form = 0; form < 4; form++)
{
- pg_wchar *result;
+ char32_t *result;
result = unicode_normalize(form, test->input);
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index 073faf6a0d5..e5e494db43c 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -30,7 +30,7 @@ enum CaseMapResult
/*
* Map for each case kind.
*/
-static const pg_wchar *const casekind_map[NCaseKind] =
+static const char32_t *const casekind_map[NCaseKind] =
{
[CaseLower] = case_map_lower,
[CaseTitle] = case_map_title,
@@ -38,42 +38,42 @@ static const pg_wchar *const casekind_map[NCaseKind] =
[CaseFold] = case_map_fold,
};
-static pg_wchar find_case_map(pg_wchar ucs, const pg_wchar *map);
+static char32_t find_case_map(char32_t ucs, const char32_t *map);
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
void *wbstate);
-static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
+static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full,
const char *src, size_t srclen, size_t srcoff,
- pg_wchar *simple, const pg_wchar **special);
+ char32_t *simple, const char32_t **special);
-pg_wchar
-unicode_lowercase_simple(pg_wchar code)
+char32_t
+unicode_lowercase_simple(char32_t code)
{
- pg_wchar cp = find_case_map(code, case_map_lower);
+ char32_t cp = find_case_map(code, case_map_lower);
return cp != 0 ? cp : code;
}
-pg_wchar
-unicode_titlecase_simple(pg_wchar code)
+char32_t
+unicode_titlecase_simple(char32_t code)
{
- pg_wchar cp = find_case_map(code, case_map_title);
+ char32_t cp = find_case_map(code, case_map_title);
return cp != 0 ? cp : code;
}
-pg_wchar
-unicode_uppercase_simple(pg_wchar code)
+char32_t
+unicode_uppercase_simple(char32_t code)
{
- pg_wchar cp = find_case_map(code, case_map_upper);
+ char32_t cp = find_case_map(code, case_map_upper);
return cp != 0 ? cp : code;
}
-pg_wchar
-unicode_casefold_simple(pg_wchar code)
+char32_t
+unicode_casefold_simple(char32_t code)
{
- pg_wchar cp = find_case_map(code, case_map_fold);
+ char32_t cp = find_case_map(code, case_map_fold);
return cp != 0 ? cp : code;
}
@@ -231,10 +231,10 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
{
- pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
+ char32_t u1 = utf8_to_unicode((unsigned char *) src + srcoff);
int u1len = unicode_utf8len(u1);
- pg_wchar simple = 0;
- const pg_wchar *special = NULL;
+ char32_t simple = 0;
+ const char32_t *special = NULL;
enum CaseMapResult casemap_result;
if (str_casekind == CaseTitle)
@@ -265,8 +265,8 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
case CASEMAP_SIMPLE:
{
/* replace with single character */
- pg_wchar u2 = simple;
- pg_wchar u2len = unicode_utf8len(u2);
+ char32_t u2 = simple;
+ size_t u2len = unicode_utf8len(u2);
Assert(special == NULL);
if (result_len + u2len <= dstsize)
@@ -280,7 +280,7 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
Assert(simple == 0);
for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
{
- pg_wchar u2 = special[i];
+ char32_t u2 = special[i];
size_t u2len = unicode_utf8len(u2);
if (result_len + u2len <= dstsize)
@@ -320,7 +320,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
{
- pg_wchar curr = utf8_to_unicode(str + i);
+ char32_t curr = utf8_to_unicode(str + i);
if (pg_u_prop_case_ignorable(curr))
continue;
@@ -344,7 +344,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
{
- pg_wchar curr = utf8_to_unicode(str + i);
+ char32_t curr = utf8_to_unicode(str + i);
if (pg_u_prop_case_ignorable(curr))
continue;
@@ -394,9 +394,9 @@ check_special_conditions(int conditions, const char *str, size_t len,
* character without modification.
*/
static enum CaseMapResult
-casemap(pg_wchar u1, CaseKind casekind, bool full,
+casemap(char32_t u1, CaseKind casekind, bool full,
const char *src, size_t srclen, size_t srcoff,
- pg_wchar *simple, const pg_wchar **special)
+ char32_t *simple, const char32_t **special)
{
uint16 idx;
@@ -434,8 +434,8 @@ casemap(pg_wchar u1, CaseKind casekind, bool full,
* Find entry in simple case map.
* If the entry does not exist, 0 will be returned.
*/
-static pg_wchar
-find_case_map(pg_wchar ucs, const pg_wchar *map)
+static char32_t
+find_case_map(char32_t ucs, const char32_t *map)
{
/* Fast path for codepoints < 0x80 */
if (ucs < 0x80)
diff --git a/src/common/unicode_category.c b/src/common/unicode_category.c
index 4136c4d4f92..aab667a7bb4 100644
--- a/src/common/unicode_category.c
+++ b/src/common/unicode_category.c
@@ -1,7 +1,7 @@
/*-------------------------------------------------------------------------
* unicode_category.c
* Determine general category and character properties of Unicode
- * characters. Encoding must be UTF8, where we assume that the pg_wchar
+ * characters. Encoding must be UTF8, where we assume that the char32_t
* representation is a code point.
*
* Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
@@ -76,13 +76,13 @@
#define PG_U_CHARACTER_TAB 0x09
static bool range_search(const pg_unicode_range *tbl, size_t size,
- pg_wchar code);
+ char32_t code);
/*
* Unicode general category for the given codepoint.
*/
pg_unicode_category
-unicode_category(pg_wchar code)
+unicode_category(char32_t code)
{
int min = 0;
int mid;
@@ -108,7 +108,7 @@ unicode_category(pg_wchar code)
}
bool
-pg_u_prop_alphabetic(pg_wchar code)
+pg_u_prop_alphabetic(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_ALPHABETIC;
@@ -119,7 +119,7 @@ pg_u_prop_alphabetic(pg_wchar code)
}
bool
-pg_u_prop_lowercase(pg_wchar code)
+pg_u_prop_lowercase(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_LOWERCASE;
@@ -130,7 +130,7 @@ pg_u_prop_lowercase(pg_wchar code)
}
bool
-pg_u_prop_uppercase(pg_wchar code)
+pg_u_prop_uppercase(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_UPPERCASE;
@@ -141,7 +141,7 @@ pg_u_prop_uppercase(pg_wchar code)
}
bool
-pg_u_prop_cased(pg_wchar code)
+pg_u_prop_cased(char32_t code)
{
uint32 category_mask;
@@ -156,7 +156,7 @@ pg_u_prop_cased(pg_wchar code)
}
bool
-pg_u_prop_case_ignorable(pg_wchar code)
+pg_u_prop_case_ignorable(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_CASE_IGNORABLE;
@@ -167,7 +167,7 @@ pg_u_prop_case_ignorable(pg_wchar code)
}
bool
-pg_u_prop_white_space(pg_wchar code)
+pg_u_prop_white_space(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_WHITE_SPACE;
@@ -178,7 +178,7 @@ pg_u_prop_white_space(pg_wchar code)
}
bool
-pg_u_prop_hex_digit(pg_wchar code)
+pg_u_prop_hex_digit(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_HEX_DIGIT;
@@ -189,7 +189,7 @@ pg_u_prop_hex_digit(pg_wchar code)
}
bool
-pg_u_prop_join_control(pg_wchar code)
+pg_u_prop_join_control(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_JOIN_CONTROL;
@@ -208,7 +208,7 @@ pg_u_prop_join_control(pg_wchar code)
*/
bool
-pg_u_isdigit(pg_wchar code, bool posix)
+pg_u_isdigit(char32_t code, bool posix)
{
if (posix)
return ('0' <= code && code <= '9');
@@ -217,19 +217,19 @@ pg_u_isdigit(pg_wchar code, bool posix)
}
bool
-pg_u_isalpha(pg_wchar code)
+pg_u_isalpha(char32_t code)
{
return pg_u_prop_alphabetic(code);
}
bool
-pg_u_isalnum(pg_wchar code, bool posix)
+pg_u_isalnum(char32_t code, bool posix)
{
return pg_u_isalpha(code) || pg_u_isdigit(code, posix);
}
bool
-pg_u_isword(pg_wchar code)
+pg_u_isword(char32_t code)
{
uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
@@ -240,32 +240,32 @@ pg_u_isword(pg_wchar code)
}
bool
-pg_u_isupper(pg_wchar code)
+pg_u_isupper(char32_t code)
{
return pg_u_prop_uppercase(code);
}
bool
-pg_u_islower(pg_wchar code)
+pg_u_islower(char32_t code)
{
return pg_u_prop_lowercase(code);
}
bool
-pg_u_isblank(pg_wchar code)
+pg_u_isblank(char32_t code)
{
return code == PG_U_CHARACTER_TAB ||
unicode_category(code) == PG_U_SPACE_SEPARATOR;
}
bool
-pg_u_iscntrl(pg_wchar code)
+pg_u_iscntrl(char32_t code)
{
return unicode_category(code) == PG_U_CONTROL;
}
bool
-pg_u_isgraph(pg_wchar code)
+pg_u_isgraph(char32_t code)
{
uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
@@ -276,7 +276,7 @@ pg_u_isgraph(pg_wchar code)
}
bool
-pg_u_isprint(pg_wchar code)
+pg_u_isprint(char32_t code)
{
pg_unicode_category category = unicode_category(code);
@@ -287,7 +287,7 @@ pg_u_isprint(pg_wchar code)
}
bool
-pg_u_ispunct(pg_wchar code, bool posix)
+pg_u_ispunct(char32_t code, bool posix)
{
uint32 category_mask;
@@ -308,13 +308,13 @@ pg_u_ispunct(pg_wchar code, bool posix)
}
bool
-pg_u_isspace(pg_wchar code)
+pg_u_isspace(char32_t code)
{
return pg_u_prop_white_space(code);
}
bool
-pg_u_isxdigit(pg_wchar code, bool posix)
+pg_u_isxdigit(char32_t code, bool posix)
{
if (posix)
return (('0' <= code && code <= '9') ||
@@ -478,7 +478,7 @@ unicode_category_abbrev(pg_unicode_category category)
* given table.
*/
static bool
-range_search(const pg_unicode_range *tbl, size_t size, pg_wchar code)
+range_search(const pg_unicode_range *tbl, size_t size, char32_t code)
{
int min = 0;
int mid;
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c
index 6654b4cbc49..489d99cd5ab 100644
--- a/src/common/unicode_norm.c
+++ b/src/common/unicode_norm.c
@@ -69,7 +69,7 @@ conv_compare(const void *p1, const void *p2)
* lookup, while the frontend version uses a binary search.
*/
static const pg_unicode_decomposition *
-get_code_entry(pg_wchar code)
+get_code_entry(char32_t code)
{
#ifndef FRONTEND
int h;
@@ -109,7 +109,7 @@ get_code_entry(pg_wchar code)
* Get the combining class of the given codepoint.
*/
static uint8
-get_canonical_class(pg_wchar code)
+get_canonical_class(char32_t code)
{
const pg_unicode_decomposition *entry = get_code_entry(code);
@@ -130,15 +130,15 @@ get_canonical_class(pg_wchar code)
* Note: the returned pointer can point to statically allocated buffer, and
* is only valid until next call to this function!
*/
-static const pg_wchar *
+static const char32_t *
get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
{
- static pg_wchar x;
+ static char32_t x;
if (DECOMPOSITION_IS_INLINE(entry))
{
Assert(DECOMPOSITION_SIZE(entry) == 1);
- x = (pg_wchar) entry->dec_index;
+ x = (char32_t) entry->dec_index;
*dec_size = 1;
return &x;
}
@@ -156,7 +156,7 @@ get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
* are, in turn, decomposable.
*/
static int
-get_decomposed_size(pg_wchar code, bool compat)
+get_decomposed_size(char32_t code, bool compat)
{
const pg_unicode_decomposition *entry;
int size = 0;
@@ -318,7 +318,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
* in the array result.
*/
static void
-decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
+decompose_code(char32_t code, bool compat, char32_t **result, int *current)
{
const pg_unicode_decomposition *entry;
int i;
@@ -337,7 +337,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
v,
tindex,
sindex;
- pg_wchar *res = *result;
+ char32_t *res = *result;
sindex = code - SBASE;
l = LBASE + sindex / (VCOUNT * TCOUNT);
@@ -369,7 +369,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
(!compat && DECOMPOSITION_IS_COMPAT(entry)))
{
- pg_wchar *res = *result;
+ char32_t *res = *result;
res[*current] = code;
(*current)++;
@@ -382,7 +382,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
decomp = get_code_decomposition(entry, &dec_size);
for (i = 0; i < dec_size; i++)
{
- pg_wchar lcode = (pg_wchar) decomp[i];
+ char32_t lcode = (char32_t) decomp[i];
/* Leave if no more decompositions */
decompose_code(lcode, compat, result, current);
@@ -398,17 +398,17 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
* malloc. Or NULL if we run out of memory. In backend, the returned
* string is palloc'd instead, and OOM is reported with ereport().
*/
-pg_wchar *
-unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
+char32_t *
+unicode_normalize(UnicodeNormalizationForm form, const char32_t *input)
{
bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
- pg_wchar *decomp_chars;
- pg_wchar *recomp_chars;
+ char32_t *decomp_chars;
+ char32_t *recomp_chars;
int decomp_size,
current_size;
int count;
- const pg_wchar *p;
+ const char32_t *p;
/* variables for recomposition */
int last_class;
@@ -425,7 +425,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
for (p = input; *p; p++)
decomp_size += get_decomposed_size(*p, compat);
- decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
+ decomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
if (decomp_chars == NULL)
return NULL;
@@ -448,9 +448,9 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
*/
for (count = 1; count < decomp_size; count++)
{
- pg_wchar prev = decomp_chars[count - 1];
- pg_wchar next = decomp_chars[count];
- pg_wchar tmp;
+ char32_t prev = decomp_chars[count - 1];
+ char32_t next = decomp_chars[count];
+ char32_t tmp;
const uint8 prevClass = get_canonical_class(prev);
const uint8 nextClass = get_canonical_class(next);
@@ -487,7 +487,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
* longer than the decomposed one, so make the allocation of the output
* string based on that assumption.
*/
- recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
+ recomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
if (!recomp_chars)
{
FREE(decomp_chars);
@@ -501,9 +501,9 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
for (count = 1; count < decomp_size; count++)
{
- pg_wchar ch = decomp_chars[count];
+ char32_t ch = decomp_chars[count];
int ch_class = get_canonical_class(ch);
- pg_wchar composite;
+ char32_t composite;
if (last_class < ch_class &&
recompose_code(starter_ch, ch, &composite))
@@ -524,7 +524,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
recomp_chars[target_pos++] = ch;
}
}
- recomp_chars[target_pos] = (pg_wchar) '\0';
+ recomp_chars[target_pos] = (char32_t) '\0';
FREE(decomp_chars);
@@ -540,7 +540,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
#ifndef FRONTEND
static const pg_unicode_normprops *
-qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
+qc_hash_lookup(char32_t ch, const pg_unicode_norminfo *norminfo)
{
int h;
uint32 hashkey;
@@ -571,7 +571,7 @@ qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
* Look up the normalization quick check character property
*/
static UnicodeNormalizationQC
-qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
+qc_is_allowed(UnicodeNormalizationForm form, char32_t ch)
{
const pg_unicode_normprops *found = NULL;
@@ -595,7 +595,7 @@ qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
}
UnicodeNormalizationQC
-unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
+unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const char32_t *input)
{
uint8 lastCanonicalClass = 0;
UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
@@ -610,9 +610,9 @@ unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *
if (form == UNICODE_NFD || form == UNICODE_NFKD)
return UNICODE_NORM_QC_MAYBE;
- for (const pg_wchar *p = input; *p; p++)
+ for (const char32_t *p = input; *p; p++)
{
- pg_wchar ch = *p;
+ char32_t ch = *p;
uint8 canonicalClass;
UnicodeNormalizationQC check;
diff --git a/src/fe_utils/mbprint.c b/src/fe_utils/mbprint.c
index eb3eeee9925..abffdbe18a2 100644
--- a/src/fe_utils/mbprint.c
+++ b/src/fe_utils/mbprint.c
@@ -49,20 +49,20 @@ pg_get_utf8_id(void)
*
* No error checks here, c must point to a long-enough string.
*/
-static pg_wchar
+static char32_t
utf8_to_unicode(const unsigned char *c)
{
if ((*c & 0x80) == 0)
- return (pg_wchar) c[0];
+ return (char32_t) c[0];
else if ((*c & 0xe0) == 0xc0)
- return (pg_wchar) (((c[0] & 0x1f) << 6) |
+ return (char32_t) (((c[0] & 0x1f) << 6) |
(c[1] & 0x3f));
else if ((*c & 0xf0) == 0xe0)
- return (pg_wchar) (((c[0] & 0x0f) << 12) |
+ return (char32_t) (((c[0] & 0x0f) << 12) |
((c[1] & 0x3f) << 6) |
(c[2] & 0x3f));
else if ((*c & 0xf8) == 0xf0)
- return (pg_wchar) (((c[0] & 0x07) << 18) |
+ return (char32_t) (((c[0] & 0x07) << 18) |
((c[1] & 0x3f) << 12) |
((c[2] & 0x3f) << 6) |
(c[3] & 0x3f));
diff --git a/src/include/c.h b/src/include/c.h
index 9ab5e617995..15e33ae650d 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -513,6 +513,19 @@ typedef void (*pg_funcptr_t) (void);
#include <stdbool.h>
+/*
+ * char16_t and char32_t
+ * char16_t holds a UTF-16 code unit; char32_t holds a Unicode code point.
+ */
+#if !defined(HAVE_CHAR16_T) || !defined(HAVE_CHAR32_T)
+#ifdef HAVE_UCHAR_H
+#include <uchar.h>
+#else
+typedef uint16_t char16_t;
+typedef uint32_t char32_t;
+#endif
+#endif
+
/* ----------------------------------------------------------------
* Section 3: standard system types
diff --git a/src/include/common/unicode_case.h b/src/include/common/unicode_case.h
index 41e2c1f4b33..6bcffd349c2 100644
--- a/src/include/common/unicode_case.h
+++ b/src/include/common/unicode_case.h
@@ -14,14 +14,12 @@
#ifndef UNICODE_CASE_H
#define UNICODE_CASE_H
-#include "mb/pg_wchar.h"
-
typedef size_t (*WordBoundaryNext) (void *wbstate);
-pg_wchar unicode_lowercase_simple(pg_wchar code);
-pg_wchar unicode_titlecase_simple(pg_wchar code);
-pg_wchar unicode_uppercase_simple(pg_wchar code);
-pg_wchar unicode_casefold_simple(pg_wchar code);
+char32_t unicode_lowercase_simple(char32_t code);
+char32_t unicode_titlecase_simple(char32_t code);
+char32_t unicode_uppercase_simple(char32_t code);
+char32_t unicode_casefold_simple(char32_t code);
size_t unicode_strlower(char *dst, size_t dstsize, const char *src,
ssize_t srclen, bool full);
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src,
diff --git a/src/include/common/unicode_case_table.h b/src/include/common/unicode_case_table.h
index d5311786582..0a14fb2d97b 100644
--- a/src/include/common/unicode_case_table.h
+++ b/src/include/common/unicode_case_table.h
@@ -18,7 +18,6 @@
*/
#include "common/unicode_case.h"
-#include "mb/pg_wchar.h"
/*
* The maximum number of codepoints that can result from case mapping
@@ -45,7 +44,7 @@ typedef enum
typedef struct
{
int16 conditions;
- pg_wchar map[NCaseKind][MAX_CASE_EXPANSION];
+ char32_t map[NCaseKind][MAX_CASE_EXPANSION];
} pg_special_case;
/*
@@ -166,7 +165,7 @@ static const pg_special_case special_case[106] =
* The entry case_map_lower[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_lower[1704] =
+static const char32_t case_map_lower[1704] =
{
0x000000, /* reserved */
0x000000, /* U+000000 */
@@ -1879,7 +1878,7 @@ static const pg_wchar case_map_lower[1704] =
* The entry case_map_title[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_title[1704] =
+static const char32_t case_map_title[1704] =
{
0x000000, /* reserved */
0x000000, /* U+000000 */
@@ -3592,7 +3591,7 @@ static const pg_wchar case_map_title[1704] =
* The entry case_map_upper[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_upper[1704] =
+static const char32_t case_map_upper[1704] =
{
0x000000, /* reserved */
0x000000, /* U+000000 */
@@ -5305,7 +5304,7 @@ static const pg_wchar case_map_upper[1704] =
* The entry case_map_fold[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_fold[1704] =
+static const char32_t case_map_fold[1704] =
{
0x000000, /* reserved */
0x000000, /* U+000000 */
@@ -13522,7 +13521,7 @@ static const uint16 case_map[4778] =
* the offset into the mapping tables.
*/
static inline uint16
-case_index(pg_wchar cp)
+case_index(char32_t cp)
{
/* Fast path for codepoints < 0x0588 */
if (cp < 0x0588)
diff --git a/src/include/common/unicode_category.h b/src/include/common/unicode_category.h
index 8fd8b67a416..684143d3c8a 100644
--- a/src/include/common/unicode_category.h
+++ b/src/include/common/unicode_category.h
@@ -14,8 +14,6 @@
#ifndef UNICODE_CATEGORY_H
#define UNICODE_CATEGORY_H
-#include "mb/pg_wchar.h"
-
/*
* Unicode General Category Values
*
@@ -61,31 +59,31 @@ typedef enum pg_unicode_category
PG_U_FINAL_PUNCTUATION = 29 /* Pf */
} pg_unicode_category;
-extern pg_unicode_category unicode_category(pg_wchar code);
+extern pg_unicode_category unicode_category(char32_t code);
extern const char *unicode_category_string(pg_unicode_category category);
extern const char *unicode_category_abbrev(pg_unicode_category category);
-extern bool pg_u_prop_alphabetic(pg_wchar code);
-extern bool pg_u_prop_lowercase(pg_wchar code);
-extern bool pg_u_prop_uppercase(pg_wchar code);
-extern bool pg_u_prop_cased(pg_wchar code);
-extern bool pg_u_prop_case_ignorable(pg_wchar code);
-extern bool pg_u_prop_white_space(pg_wchar code);
-extern bool pg_u_prop_hex_digit(pg_wchar code);
-extern bool pg_u_prop_join_control(pg_wchar code);
+extern bool pg_u_prop_alphabetic(char32_t code);
+extern bool pg_u_prop_lowercase(char32_t code);
+extern bool pg_u_prop_uppercase(char32_t code);
+extern bool pg_u_prop_cased(char32_t code);
+extern bool pg_u_prop_case_ignorable(char32_t code);
+extern bool pg_u_prop_white_space(char32_t code);
+extern bool pg_u_prop_hex_digit(char32_t code);
+extern bool pg_u_prop_join_control(char32_t code);
-extern bool pg_u_isdigit(pg_wchar code, bool posix);
-extern bool pg_u_isalpha(pg_wchar code);
-extern bool pg_u_isalnum(pg_wchar code, bool posix);
-extern bool pg_u_isword(pg_wchar code);
-extern bool pg_u_isupper(pg_wchar code);
-extern bool pg_u_islower(pg_wchar code);
-extern bool pg_u_isblank(pg_wchar code);
-extern bool pg_u_iscntrl(pg_wchar code);
-extern bool pg_u_isgraph(pg_wchar code);
-extern bool pg_u_isprint(pg_wchar code);
-extern bool pg_u_ispunct(pg_wchar code, bool posix);
-extern bool pg_u_isspace(pg_wchar code);
-extern bool pg_u_isxdigit(pg_wchar code, bool posix);
+extern bool pg_u_isdigit(char32_t code, bool posix);
+extern bool pg_u_isalpha(char32_t code);
+extern bool pg_u_isalnum(char32_t code, bool posix);
+extern bool pg_u_isword(char32_t code);
+extern bool pg_u_isupper(char32_t code);
+extern bool pg_u_islower(char32_t code);
+extern bool pg_u_isblank(char32_t code);
+extern bool pg_u_iscntrl(char32_t code);
+extern bool pg_u_isgraph(char32_t code);
+extern bool pg_u_isprint(char32_t code);
+extern bool pg_u_ispunct(char32_t code, bool posix);
+extern bool pg_u_isspace(char32_t code);
+extern bool pg_u_isxdigit(char32_t code, bool posix);
#endif /* UNICODE_CATEGORY_H */
diff --git a/src/include/common/unicode_category_table.h b/src/include/common/unicode_category_table.h
index 95a1c65da7e..466a41b72b0 100644
--- a/src/include/common/unicode_category_table.h
+++ b/src/include/common/unicode_category_table.h
@@ -20,15 +20,15 @@
*/
typedef struct
{
- uint32 first; /* Unicode codepoint */
- uint32 last; /* Unicode codepoint */
+ char32_t first; /* Unicode codepoint */
+ char32_t last; /* Unicode codepoint */
uint8 category; /* General Category */
} pg_category_range;
typedef struct
{
- uint32 first; /* Unicode codepoint */
- uint32 last; /* Unicode codepoint */
+ char32_t first; /* Unicode codepoint */
+ char32_t last; /* Unicode codepoint */
} pg_unicode_range;
typedef struct
diff --git a/src/include/common/unicode_norm.h b/src/include/common/unicode_norm.h
index 5bc3b79e78e..516c192cc4c 100644
--- a/src/include/common/unicode_norm.h
+++ b/src/include/common/unicode_norm.h
@@ -14,8 +14,6 @@
#ifndef UNICODE_NORM_H
#define UNICODE_NORM_H
-#include "mb/pg_wchar.h"
-
typedef enum
{
UNICODE_NFC = 0,
@@ -32,8 +30,8 @@ typedef enum
UNICODE_NORM_QC_MAYBE = -1,
} UnicodeNormalizationQC;
-extern pg_wchar *unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input);
+extern char32_t *unicode_normalize(UnicodeNormalizationForm form, const char32_t *input);
-extern UnicodeNormalizationQC unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input);
+extern UnicodeNormalizationQC unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const char32_t *input);
#endif /* UNICODE_NORM_H */
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 4b4a9974b75..4d84bdc81e4 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -532,25 +532,25 @@ typedef uint32 (*utf_local_conversion_func) (uint32 code);
* Some handy functions for Unicode-specific tests.
*/
static inline bool
-is_valid_unicode_codepoint(pg_wchar c)
+is_valid_unicode_codepoint(char32_t c)
{
return (c > 0 && c <= 0x10FFFF);
}
static inline bool
-is_utf16_surrogate_first(pg_wchar c)
+is_utf16_surrogate_first(char32_t c)
{
return (c >= 0xD800 && c <= 0xDBFF);
}
static inline bool
-is_utf16_surrogate_second(pg_wchar c)
+is_utf16_surrogate_second(char32_t c)
{
return (c >= 0xDC00 && c <= 0xDFFF);
}
-static inline pg_wchar
-surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
+static inline char32_t
+surrogate_pair_to_codepoint(char16_t first, char16_t second)
{
return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
}
@@ -561,20 +561,20 @@ surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
*
* No error checks here, c must point to a long-enough string.
*/
-static inline pg_wchar
+static inline char32_t
utf8_to_unicode(const unsigned char *c)
{
if ((*c & 0x80) == 0)
- return (pg_wchar) c[0];
+ return (char32_t) c[0];
else if ((*c & 0xe0) == 0xc0)
- return (pg_wchar) (((c[0] & 0x1f) << 6) |
+ return (char32_t) (((c[0] & 0x1f) << 6) |
(c[1] & 0x3f));
else if ((*c & 0xf0) == 0xe0)
- return (pg_wchar) (((c[0] & 0x0f) << 12) |
+ return (char32_t) (((c[0] & 0x0f) << 12) |
((c[1] & 0x3f) << 6) |
(c[2] & 0x3f));
else if ((*c & 0xf8) == 0xf0)
- return (pg_wchar) (((c[0] & 0x07) << 18) |
+ return (char32_t) (((c[0] & 0x07) << 18) |
((c[1] & 0x3f) << 12) |
((c[2] & 0x3f) << 6) |
(c[3] & 0x3f));
@@ -588,7 +588,7 @@ utf8_to_unicode(const unsigned char *c)
* unicode_utf8len(c) bytes available.
*/
static inline unsigned char *
-unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
+unicode_to_utf8(char32_t c, unsigned char *utf8string)
{
if (c <= 0x7F)
{
@@ -620,7 +620,7 @@ unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
* Number of bytes needed to represent the given char in UTF8.
*/
static inline int
-unicode_utf8len(pg_wchar c)
+unicode_utf8len(char32_t c)
{
if (c <= 0x7F)
return 1;
@@ -676,8 +676,8 @@ extern int pg_valid_server_encoding(const char *name);
extern bool is_encoding_supported_by_icu(int encoding);
extern const char *get_encoding_name_for_icu(int encoding);
-extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string);
-extern pg_wchar utf8_to_unicode(const unsigned char *c);
+extern unsigned char *unicode_to_utf8(char32_t c, unsigned char *utf8string);
+extern char32_t utf8_to_unicode(const unsigned char *c);
extern bool pg_utf8_islegal(const unsigned char *source, int length);
extern int pg_utf_mblen(const unsigned char *s);
extern int pg_mule_mblen(const unsigned char *s);
@@ -739,8 +739,8 @@ extern char *pg_server_to_client(const char *s, int len);
extern char *pg_any_to_server(const char *s, int len, int encoding);
extern char *pg_server_to_any(const char *s, int len, int encoding);
-extern void pg_unicode_to_server(pg_wchar c, unsigned char *s);
-extern bool pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s);
+extern void pg_unicode_to_server(char32_t c, unsigned char *s);
+extern bool pg_unicode_to_server_noerror(char32_t c, unsigned char *s);
extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc);
extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc);
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index c4dc5d72bdb..4a150fbd551 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -373,6 +373,12 @@
/* Define to 1 if the system has the type `socklen_t'. */
#undef HAVE_SOCKLEN_T
+/* Define to 1 if the system has the type `char16_t'. */
+#undef HAVE_CHAR16_T
+
+/* Define to 1 if the system has the type `char32_t'. */
+#undef HAVE_CHAR32_T
+
/* Define to 1 if you have the `SSL_CTX_set_cert_cb' function. */
#undef HAVE_SSL_CTX_SET_CERT_CB
@@ -463,6 +469,9 @@
/* Define to 1 if you have the <termios.h> header file. */
#undef HAVE_TERMIOS_H
+/* Define to 1 if you have the <uchar.h> header file. */
+#undef HAVE_UCHAR_H
+
/* Define to 1 if curl_global_init() is guaranteed to be thread-safe. */
#undef HAVE_THREADSAFE_CURL_GLOBAL_INIT
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index bb4e1b37005..790be386d75 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -3505,6 +3505,8 @@ cb_cleanup_dir
cb_options
cb_tablespace
cb_tablespace_mapping
+char16_t
+char32_t
check_agg_arguments_context
check_function_callback
check_network_data
--
2.43.0
On Wed, Oct 29, 2025 at 7:45 AM Peter Eisentraut <peter@eisentraut.org> wrote:
On 26.10.25 20:43, Jeff Davis wrote:
+/*
+ * char16_t and char32_t
+ * Unicode code points.
+ */
+#ifndef __cplusplus
+#ifdef HAVE_UCHAR_H
+#include <uchar.h>
+#ifndef __STDC_UTF_16__
+#error "char16_t must use UTF-16 encoding"
+#endif
+#ifndef __STDC_UTF_32__
+#error "char32_t must use UTF-32 encoding"
+#endif
+#else
+typedef uint16_t char16_t;
+typedef uint32_t char32_t;
+#endif
+#endif

This could be improved a bit. The reason for some of these conditionals
is not clear. Like, what does __cplusplus have to do with this? I
think it would be more correct to write a configure/meson check for the
actual types rather than depend indirectly on a header check.
I suggested testing __cplusplus because I predicted that that typedef
would fail on a C++ compiler (since C++11), where char32_t is a
language keyword identifying a distinct type requiring no #include.
This is an Apple-only problem, without which we could just include
<uchar.h> unconditionally, and presumably will eventually when Apple
supplies this non-optional-per-C11 header. On a Mac, #include
<uchar.h> fails for C (there is no $SDK/usr/include/uchar.h) but works
for C++ (it finds $SDK/usr/include/c++/v1/uchar.h), and since we'd
probe for HAVE_UCHAR_H with the C compiler, we'd not find it and thus
also need to exclude __cplusplus at compile time. Otherwise, let's
see what the error looks like...
test.cpp:2:22: error: cannot combine with previous 'int' declaration specifier
2 | typedef unsigned int char32_t;
| ^
test.cpp:2:1: warning: typedef requires a name [-Wmissing-declarations]
2 | typedef unsigned int char32_t;
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1 warning and 1 error generated.
GCC has a clearer message:
test.cpp:2:22: error: redeclaration of C++ built-in type 'char32_t'
[-fpermissive]
2 | typedef unsigned int char32_t;
| ^~~~~~~~
If you try to test for the existence of the type rather than the
header in meson/configure, won't you still have the configure-with-C
compile-with-C++ problem, with no way to resolve it except by keeping
the test for __cplusplus that you're trying to get rid of? So what do
you gain other than more lines of configure stuff?
Out of curiosity, even with -std=c++03 (old C++ standard that might
not work for PostgreSQL for other reasons, but I wanted to see what
would happen with a standard before char32_t became a fundamental
language type) I was surprised to see that the standard library
supplied char32_t. It incorrectly(?) imports a typename from the
future standards using an internal type, so our typedef still fails,
just with a different Clang error:
test.cpp:2:22: error: typedef redefinition with different types
('unsigned int' vs 'char32_t')
2 | typedef unsigned int char32_t;
| ^
/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/usr/include/c++/v1/__config:320:20:
note: previous definition is here
320 | typedef __char32_t char32_t;
| ^
The checks for __STDC_UTF_16__ and __STDC_UTF_32__ can be removed, as
was discussed elsewhere, since we don't use any standard library
functions that make use of these facts, and the need goes away with C23
anyway.
+1
On Wed, 2025-10-29 at 09:03 +1300, Thomas Munro wrote:
If you try to test for the existence of the type rather than the
header in meson/configure, won't you still have the configure-with-C
compile-with-C++ problem
I must have misunderstood the first time. If we depend on
HAVE_CHAR32_T, then it will be set in stone in pg_config.h, and if C++
tries to include the file then it will try the typedef again and fail.
I tried with headerscheck --cplusplus before posting it, but because my
machine has uchar.h, it didn't fail.
I went back to using the check for __cplusplus, and added a comment
that hopefully clarifies things.
I also reordered the checks so that it prefers to include uchar.h if
available, even when using C++, because that seems like the cleaner end
goal. However, that caused another problem in CI (mingw_cross_warning),
apparently due to a conflict between uchar.h and win32_port.h on that
platform:
[21:48:21.794] ../../src/include/port/win32_port.h: At top level:
[21:48:21.794] ../../src/include/port/win32_port.h:254:8: error: redefinition of ‘struct stat’
[21:48:21.794] 254 | struct stat /* This should match struct __stat64 */
[21:48:21.794] | ^~~~
[21:48:21.794] In file included from /usr/share/mingw-w64/include/wchar.h:413,
[21:48:21.794] from /usr/share/mingw-w64/include/uchar.h:28,
[21:48:21.794] from ../../src/include/c.h:526:
[21:48:21.794] /usr/share/mingw-w64/include/_mingw_stat64.h:40:10: note: originally defined here
[21:48:21.794] 40 | struct stat {
[21:48:21.794] | ^~~~
https://cirrus-ci.com/task/4849300577976320
I could reverse the checks again and I think it will work, but let me
know if you have an idea for a better fix.
I never thought it would be so much trouble just to get a suitable type
for a UTF-32 code point...
Regards,
Jeff Davis
Attachments:
v4-0001-Use-C11-char16_t-and-char32_t-for-Unicode-code-po.patchtext/x-patch; charset=UTF-8; name=v4-0001-Use-C11-char16_t-and-char32_t-for-Unicode-code-po.patchDownload
From ceecbc970e7fcd026edbbe7cc324ea8fe74c571c Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Tue, 21 Oct 2025 13:16:47 -0700
Subject: [PATCH v4] Use C11 char16_t and char32_t for Unicode code points.
Reviewed-by: Tatsuo Ishii <ishii@postgresql.org>
Reviewed-by: Thomas Munro <thomas.munro@gmail.com>
Reviewed-by: Peter Eisentraut <peter@eisentraut.org>
Discussion: https://postgr.es/m/bedcc93d06203dfd89815b10f815ca2de8626e85.camel%40j-davis.com
---
configure | 2 +-
configure.ac | 1 +
meson.build | 1 +
src/backend/parser/parser.c | 8 +--
src/backend/parser/scan.l | 8 +--
src/backend/utils/adt/jsonpath_scan.l | 6 +-
src/backend/utils/adt/pg_locale_builtin.c | 44 ++++++++++-----
src/backend/utils/adt/varlena.c | 40 ++++++-------
src/backend/utils/mb/mbutils.c | 4 +-
src/common/saslprep.c | 48 ++++++++--------
src/common/unicode/case_test.c | 23 ++++----
src/common/unicode/category_test.c | 3 +-
.../unicode/generate-norm_test_table.pl | 4 +-
.../unicode/generate-unicode_case_table.pl | 7 +--
.../generate-unicode_category_table.pl | 8 +--
src/common/unicode/norm_test.c | 6 +-
src/common/unicode_case.c | 56 +++++++++----------
src/common/unicode_category.c | 50 ++++++++---------
src/common/unicode_norm.c | 56 +++++++++----------
src/fe_utils/mbprint.c | 10 ++--
src/include/c.h | 18 ++++++
src/include/common/unicode_case.h | 10 ++--
src/include/common/unicode_case_table.h | 13 ++---
src/include/common/unicode_category.h | 46 ++++++++-------
src/include/common/unicode_category_table.h | 8 +--
src/include/common/unicode_norm.h | 6 +-
src/include/mb/pg_wchar.h | 32 +++++------
src/include/pg_config.h.in | 3 +
src/tools/pgindent/typedefs.list | 2 +
29 files changed, 279 insertions(+), 244 deletions(-)
diff --git a/configure b/configure
index 22cd866147b..600a56f91a3 100755
--- a/configure
+++ b/configure
@@ -13627,7 +13627,7 @@ fi
## Header files
##
-for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h mbarrier.h sys/epoll.h sys/event.h sys/personality.h sys/prctl.h sys/procctl.h sys/signalfd.h sys/ucred.h termios.h ucred.h xlocale.h
+for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h mbarrier.h sys/epoll.h sys/event.h sys/personality.h sys/prctl.h sys/procctl.h sys/signalfd.h sys/ucred.h termios.h uchar.h ucred.h xlocale.h
do :
as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default"
diff --git a/configure.ac b/configure.ac
index e44943aa6fe..6ab2e157531 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1513,6 +1513,7 @@ AC_CHECK_HEADERS(m4_normalize([
sys/signalfd.h
sys/ucred.h
termios.h
+ uchar.h
ucred.h
xlocale.h
]))
diff --git a/meson.build b/meson.build
index 395416a6060..c3128c7554f 100644
--- a/meson.build
+++ b/meson.build
@@ -2613,6 +2613,7 @@ header_checks = [
'sys/signalfd.h',
'sys/ucred.h',
'termios.h',
+ 'uchar.h',
'ucred.h',
'xlocale.h',
]
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
index 33a040506b4..a3679f8e86c 100644
--- a/src/backend/parser/parser.c
+++ b/src/backend/parser/parser.c
@@ -339,7 +339,7 @@ hexval(unsigned char c)
/* is Unicode code point acceptable? */
static void
-check_unicode_value(pg_wchar c)
+check_unicode_value(char32_t c)
{
if (!is_valid_unicode_codepoint(c))
ereport(ERROR,
@@ -376,7 +376,7 @@ str_udeescape(const char *str, char escape,
char *new,
*out;
size_t new_len;
- pg_wchar pair_first = 0;
+ char16_t pair_first = 0;
ScannerCallbackState scbstate;
/*
@@ -420,7 +420,7 @@ str_udeescape(const char *str, char escape,
isxdigit((unsigned char) in[3]) &&
isxdigit((unsigned char) in[4]))
{
- pg_wchar unicode;
+ char32_t unicode;
unicode = (hexval(in[1]) << 12) +
(hexval(in[2]) << 8) +
@@ -457,7 +457,7 @@ str_udeescape(const char *str, char escape,
isxdigit((unsigned char) in[6]) &&
isxdigit((unsigned char) in[7]))
{
- pg_wchar unicode;
+ char32_t unicode;
unicode = (hexval(in[2]) << 20) +
(hexval(in[3]) << 16) +
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index 08990831fe8..a67815339b7 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -121,7 +121,7 @@ static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
static char *litbufdup(core_yyscan_t yyscanner);
static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner);
static int process_integer_literal(const char *token, YYSTYPE *lval, int base);
-static void addunicode(pg_wchar c, yyscan_t yyscanner);
+static void addunicode(char32_t c, yyscan_t yyscanner);
#define yyerror(msg) scanner_yyerror(msg, yyscanner)
@@ -640,7 +640,7 @@ other .
addlit(yytext, yyleng, yyscanner);
}
<xe>{xeunicode} {
- pg_wchar c = strtoul(yytext + 2, NULL, 16);
+ char32_t c = strtoul(yytext + 2, NULL, 16);
/*
* For consistency with other productions, issue any
@@ -668,7 +668,7 @@ other .
POP_YYLLOC();
}
<xeu>{xeunicode} {
- pg_wchar c = strtoul(yytext + 2, NULL, 16);
+ char32_t c = strtoul(yytext + 2, NULL, 16);
/* Remember start of overall string token ... */
PUSH_YYLLOC();
@@ -1376,7 +1376,7 @@ process_integer_literal(const char *token, YYSTYPE *lval, int base)
}
static void
-addunicode(pg_wchar c, core_yyscan_t yyscanner)
+addunicode(char32_t c, core_yyscan_t yyscanner)
{
ScannerCallbackState scbstate;
char buf[MAX_UNICODE_EQUIVALENT_STRING + 1];
diff --git a/src/backend/utils/adt/jsonpath_scan.l b/src/backend/utils/adt/jsonpath_scan.l
index c7aab83eeb4..8c3a0a9c642 100644
--- a/src/backend/utils/adt/jsonpath_scan.l
+++ b/src/backend/utils/adt/jsonpath_scan.l
@@ -574,7 +574,7 @@ hexval(char c, int *result, struct Node *escontext, yyscan_t yyscanner)
/* Add given unicode character to scanstring */
static bool
-addUnicodeChar(int ch, struct Node *escontext, yyscan_t yyscanner)
+addUnicodeChar(char32_t ch, struct Node *escontext, yyscan_t yyscanner)
{
if (ch == 0)
{
@@ -607,7 +607,7 @@ addUnicodeChar(int ch, struct Node *escontext, yyscan_t yyscanner)
/* Add unicode character, processing any surrogate pairs */
static bool
-addUnicode(int ch, int *hi_surrogate, struct Node *escontext, yyscan_t yyscanner)
+addUnicode(char32_t ch, int *hi_surrogate, struct Node *escontext, yyscan_t yyscanner)
{
if (is_utf16_surrogate_first(ch))
{
@@ -655,7 +655,7 @@ parseUnicode(char *s, int l, struct Node *escontext, yyscan_t yyscanner)
for (i = 2; i < l; i += 2) /* skip '\u' */
{
- int ch = 0;
+ char32_t ch = 0;
int j,
si;
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 3dc611b50e1..1021e0d129b 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -15,7 +15,6 @@
#include "catalog/pg_collation.h"
#include "common/unicode_case.h"
#include "common/unicode_category.h"
-#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "utils/builtins.h"
#include "utils/pg_locale.h"
@@ -35,6 +34,23 @@ struct WordBoundaryState
bool prev_alnum;
};
+/*
+ * In UTF-8, pg_wchar is guaranteed to be the code point value.
+ */
+static inline char32_t
+to_char32(pg_wchar wc)
+{
+ Assert(GetDatabaseEncoding() == PG_UTF8);
+ return (char32_t) wc;
+}
+
+static inline pg_wchar
+to_pg_wchar(char32_t c32)
+{
+ Assert(GetDatabaseEncoding() == PG_UTF8);
+ return (pg_wchar) c32;
+}
+
/*
* Simple word boundary iterator that draws boundaries each time the result of
* pg_u_isalnum() changes.
@@ -47,7 +63,7 @@ initcap_wbnext(void *state)
while (wbstate->offset < wbstate->len &&
wbstate->str[wbstate->offset] != '\0')
{
- pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
+ char32_t u = utf8_to_unicode((unsigned char *) wbstate->str +
wbstate->offset);
bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
@@ -112,61 +128,61 @@ strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
static bool
wc_isdigit_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isdigit(wc, !locale->builtin.casemap_full);
+ return pg_u_isdigit(to_char32(wc), !locale->builtin.casemap_full);
}
static bool
wc_isalpha_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isalpha(wc);
+ return pg_u_isalpha(to_char32(wc));
}
static bool
wc_isalnum_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isalnum(wc, !locale->builtin.casemap_full);
+ return pg_u_isalnum(to_char32(wc), !locale->builtin.casemap_full);
}
static bool
wc_isupper_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isupper(wc);
+ return pg_u_isupper(to_char32(wc));
}
static bool
wc_islower_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_islower(wc);
+ return pg_u_islower(to_char32(wc));
}
static bool
wc_isgraph_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isgraph(wc);
+ return pg_u_isgraph(to_char32(wc));
}
static bool
wc_isprint_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isprint(wc);
+ return pg_u_isprint(to_char32(wc));
}
static bool
wc_ispunct_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_ispunct(wc, !locale->builtin.casemap_full);
+ return pg_u_ispunct(to_char32(wc), !locale->builtin.casemap_full);
}
static bool
wc_isspace_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isspace(wc);
+ return pg_u_isspace(to_char32(wc));
}
static bool
wc_isxdigit_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isxdigit(wc, !locale->builtin.casemap_full);
+ return pg_u_isxdigit(to_char32(wc), !locale->builtin.casemap_full);
}
static bool
@@ -179,13 +195,13 @@ char_is_cased_builtin(char ch, pg_locale_t locale)
static pg_wchar
wc_toupper_builtin(pg_wchar wc, pg_locale_t locale)
{
- return unicode_uppercase_simple(wc);
+ return to_pg_wchar(unicode_uppercase_simple(to_char32(wc)));
}
static pg_wchar
wc_tolower_builtin(pg_wchar wc, pg_locale_t locale)
{
- return unicode_lowercase_simple(wc);
+ return to_pg_wchar(unicode_lowercase_simple(to_char32(wc)));
}
static const struct ctype_methods ctype_methods_builtin = {
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 2c398cd9e5c..8d735786e51 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -5419,12 +5419,12 @@ unicode_assigned(PG_FUNCTION_ARGS)
ereport(ERROR,
(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
- /* convert to pg_wchar */
+ /* convert to char32_t */
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
p = (unsigned char *) VARDATA_ANY(input);
for (int i = 0; i < size; i++)
{
- pg_wchar uchar = utf8_to_unicode(p);
+ char32_t uchar = utf8_to_unicode(p);
int category = unicode_category(uchar);
if (category == PG_U_UNASSIGNED)
@@ -5443,24 +5443,24 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
UnicodeNormalizationForm form;
int size;
- pg_wchar *input_chars;
- pg_wchar *output_chars;
+ char32_t *input_chars;
+ char32_t *output_chars;
unsigned char *p;
text *result;
int i;
form = unicode_norm_form_from_string(formstr);
- /* convert to pg_wchar */
+ /* convert to char32_t */
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
- input_chars = palloc((size + 1) * sizeof(pg_wchar));
+ input_chars = palloc((size + 1) * sizeof(char32_t));
p = (unsigned char *) VARDATA_ANY(input);
for (i = 0; i < size; i++)
{
input_chars[i] = utf8_to_unicode(p);
p += pg_utf_mblen(p);
}
- input_chars[i] = (pg_wchar) '\0';
+ input_chars[i] = (char32_t) '\0';
Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
/* action */
@@ -5468,7 +5468,7 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
/* convert back to UTF-8 string */
size = 0;
- for (pg_wchar *wp = output_chars; *wp; wp++)
+ for (char32_t *wp = output_chars; *wp; wp++)
{
unsigned char buf[4];
@@ -5480,7 +5480,7 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
SET_VARSIZE(result, size + VARHDRSZ);
p = (unsigned char *) VARDATA_ANY(result);
- for (pg_wchar *wp = output_chars; *wp; wp++)
+ for (char32_t *wp = output_chars; *wp; wp++)
{
unicode_to_utf8(*wp, p);
p += pg_utf_mblen(p);
@@ -5509,8 +5509,8 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
UnicodeNormalizationForm form;
int size;
- pg_wchar *input_chars;
- pg_wchar *output_chars;
+ char32_t *input_chars;
+ char32_t *output_chars;
unsigned char *p;
int i;
UnicodeNormalizationQC quickcheck;
@@ -5519,16 +5519,16 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
form = unicode_norm_form_from_string(formstr);
- /* convert to pg_wchar */
+ /* convert to char32_t */
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
- input_chars = palloc((size + 1) * sizeof(pg_wchar));
+ input_chars = palloc((size + 1) * sizeof(char32_t));
p = (unsigned char *) VARDATA_ANY(input);
for (i = 0; i < size; i++)
{
input_chars[i] = utf8_to_unicode(p);
p += pg_utf_mblen(p);
}
- input_chars[i] = (pg_wchar) '\0';
+ input_chars[i] = (char32_t) '\0';
Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
/* quick check (see UAX #15) */
@@ -5542,11 +5542,11 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
output_chars = unicode_normalize(form, input_chars);
output_size = 0;
- for (pg_wchar *wp = output_chars; *wp; wp++)
+ for (char32_t *wp = output_chars; *wp; wp++)
output_size++;
result = (size == output_size) &&
- (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
+ (memcmp(input_chars, output_chars, size * sizeof(char32_t)) == 0);
PG_RETURN_BOOL(result);
}
@@ -5602,7 +5602,7 @@ unistr(PG_FUNCTION_ARGS)
int len;
StringInfoData str;
text *result;
- pg_wchar pair_first = 0;
+ char16_t pair_first = 0;
char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
instr = VARDATA_ANY(input_text);
@@ -5626,7 +5626,7 @@ unistr(PG_FUNCTION_ARGS)
else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
(len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
{
- pg_wchar unicode;
+ char32_t unicode;
int offset = instr[1] == 'u' ? 2 : 1;
unicode = hexval_n(instr + offset, 4);
@@ -5662,7 +5662,7 @@ unistr(PG_FUNCTION_ARGS)
}
else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
{
- pg_wchar unicode;
+ char32_t unicode;
unicode = hexval_n(instr + 2, 6);
@@ -5697,7 +5697,7 @@ unistr(PG_FUNCTION_ARGS)
}
else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
{
- pg_wchar unicode;
+ char32_t unicode;
unicode = hexval_n(instr + 2, 8);
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index 886ecbad871..fb629ed5c8f 100644
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -862,7 +862,7 @@ perform_default_encoding_conversion(const char *src, int len,
* may call this outside any transaction, or in an aborted transaction.
*/
void
-pg_unicode_to_server(pg_wchar c, unsigned char *s)
+pg_unicode_to_server(char32_t c, unsigned char *s)
{
unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
int c_as_utf8_len;
@@ -924,7 +924,7 @@ pg_unicode_to_server(pg_wchar c, unsigned char *s)
* but simply return false on conversion failure.
*/
bool
-pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s)
+pg_unicode_to_server_noerror(char32_t c, unsigned char *s)
{
unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
int c_as_utf8_len;
diff --git a/src/common/saslprep.c b/src/common/saslprep.c
index 97beb47940b..101e8d65a4d 100644
--- a/src/common/saslprep.c
+++ b/src/common/saslprep.c
@@ -47,7 +47,7 @@
/* Prototypes for local functions */
static int codepoint_range_cmp(const void *a, const void *b);
-static bool is_code_in_table(pg_wchar code, const pg_wchar *map, int mapsize);
+static bool is_code_in_table(char32_t code, const char32_t *map, int mapsize);
static int pg_utf8_string_len(const char *source);
/*
@@ -64,7 +64,7 @@ static int pg_utf8_string_len(const char *source);
*
* These are all mapped to the ASCII space character (U+00A0).
*/
-static const pg_wchar non_ascii_space_ranges[] =
+static const char32_t non_ascii_space_ranges[] =
{
0x00A0, 0x00A0,
0x1680, 0x1680,
@@ -79,7 +79,7 @@ static const pg_wchar non_ascii_space_ranges[] =
*
* If any of these appear in the input, they are removed.
*/
-static const pg_wchar commonly_mapped_to_nothing_ranges[] =
+static const char32_t commonly_mapped_to_nothing_ranges[] =
{
0x00AD, 0x00AD,
0x034F, 0x034F,
@@ -114,7 +114,7 @@ static const pg_wchar commonly_mapped_to_nothing_ranges[] =
* tables, so one code might originate from multiple source tables.
* Adjacent ranges have also been merged together, to save space.
*/
-static const pg_wchar prohibited_output_ranges[] =
+static const char32_t prohibited_output_ranges[] =
{
0x0000, 0x001F, /* C.2.1 */
0x007F, 0x00A0, /* C.1.2, C.2.1, C.2.2 */
@@ -155,7 +155,7 @@ static const pg_wchar prohibited_output_ranges[] =
};
/* A.1 Unassigned code points in Unicode 3.2 */
-static const pg_wchar unassigned_codepoint_ranges[] =
+static const char32_t unassigned_codepoint_ranges[] =
{
0x0221, 0x0221,
0x0234, 0x024F,
@@ -556,7 +556,7 @@ static const pg_wchar unassigned_codepoint_ranges[] =
};
/* D.1 Characters with bidirectional property "R" or "AL" */
-static const pg_wchar RandALCat_codepoint_ranges[] =
+static const char32_t RandALCat_codepoint_ranges[] =
{
0x05BE, 0x05BE,
0x05C0, 0x05C0,
@@ -595,7 +595,7 @@ static const pg_wchar RandALCat_codepoint_ranges[] =
};
/* D.2 Characters with bidirectional property "L" */
-static const pg_wchar LCat_codepoint_ranges[] =
+static const char32_t LCat_codepoint_ranges[] =
{
0x0041, 0x005A,
0x0061, 0x007A,
@@ -968,8 +968,8 @@ static const pg_wchar LCat_codepoint_ranges[] =
static int
codepoint_range_cmp(const void *a, const void *b)
{
- const pg_wchar *key = (const pg_wchar *) a;
- const pg_wchar *range = (const pg_wchar *) b;
+ const char32_t *key = (const char32_t *) a;
+ const char32_t *range = (const char32_t *) b;
if (*key < range[0])
return -1; /* less than lower bound */
@@ -980,14 +980,14 @@ codepoint_range_cmp(const void *a, const void *b)
}
static bool
-is_code_in_table(pg_wchar code, const pg_wchar *map, int mapsize)
+is_code_in_table(char32_t code, const char32_t *map, int mapsize)
{
Assert(mapsize % 2 == 0);
if (code < map[0] || code > map[mapsize - 1])
return false;
- if (bsearch(&code, map, mapsize / 2, sizeof(pg_wchar) * 2,
+ if (bsearch(&code, map, mapsize / 2, sizeof(char32_t) * 2,
codepoint_range_cmp))
return true;
else
@@ -1046,8 +1046,8 @@ pg_utf8_string_len(const char *source)
pg_saslprep_rc
pg_saslprep(const char *input, char **output)
{
- pg_wchar *input_chars = NULL;
- pg_wchar *output_chars = NULL;
+ char32_t *input_chars = NULL;
+ char32_t *output_chars = NULL;
int input_size;
char *result;
int result_size;
@@ -1055,7 +1055,7 @@ pg_saslprep(const char *input, char **output)
int i;
bool contains_RandALCat;
unsigned char *p;
- pg_wchar *wp;
+ char32_t *wp;
/* Ensure we return *output as NULL on failure */
*output = NULL;
@@ -1080,10 +1080,10 @@ pg_saslprep(const char *input, char **output)
input_size = pg_utf8_string_len(input);
if (input_size < 0)
return SASLPREP_INVALID_UTF8;
- if (input_size >= MaxAllocSize / sizeof(pg_wchar))
+ if (input_size >= MaxAllocSize / sizeof(char32_t))
goto oom;
- input_chars = ALLOC((input_size + 1) * sizeof(pg_wchar));
+ input_chars = ALLOC((input_size + 1) * sizeof(char32_t));
if (!input_chars)
goto oom;
@@ -1093,7 +1093,7 @@ pg_saslprep(const char *input, char **output)
input_chars[i] = utf8_to_unicode(p);
p += pg_utf_mblen(p);
}
- input_chars[i] = (pg_wchar) '\0';
+ input_chars[i] = (char32_t) '\0';
/*
* The steps below correspond to the steps listed in [RFC3454], Section
@@ -1107,7 +1107,7 @@ pg_saslprep(const char *input, char **output)
count = 0;
for (i = 0; i < input_size; i++)
{
- pg_wchar code = input_chars[i];
+ char32_t code = input_chars[i];
if (IS_CODE_IN_TABLE(code, non_ascii_space_ranges))
input_chars[count++] = 0x0020;
@@ -1118,7 +1118,7 @@ pg_saslprep(const char *input, char **output)
else
input_chars[count++] = code;
}
- input_chars[count] = (pg_wchar) '\0';
+ input_chars[count] = (char32_t) '\0';
input_size = count;
if (input_size == 0)
@@ -1138,7 +1138,7 @@ pg_saslprep(const char *input, char **output)
*/
for (i = 0; i < input_size; i++)
{
- pg_wchar code = input_chars[i];
+ char32_t code = input_chars[i];
if (IS_CODE_IN_TABLE(code, prohibited_output_ranges))
goto prohibited;
@@ -1170,7 +1170,7 @@ pg_saslprep(const char *input, char **output)
contains_RandALCat = false;
for (i = 0; i < input_size; i++)
{
- pg_wchar code = input_chars[i];
+ char32_t code = input_chars[i];
if (IS_CODE_IN_TABLE(code, RandALCat_codepoint_ranges))
{
@@ -1181,12 +1181,12 @@ pg_saslprep(const char *input, char **output)
if (contains_RandALCat)
{
- pg_wchar first = input_chars[0];
- pg_wchar last = input_chars[input_size - 1];
+ char32_t first = input_chars[0];
+ char32_t last = input_chars[input_size - 1];
for (i = 0; i < input_size; i++)
{
- pg_wchar code = input_chars[i];
+ char32_t code = input_chars[i];
if (IS_CODE_IN_TABLE(code, LCat_codepoint_ranges))
goto prohibited;
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index fdfb62e8552..00d4f85e5a5 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -24,6 +24,7 @@
#include "common/unicode_case.h"
#include "common/unicode_category.h"
#include "common/unicode_version.h"
+#include "mb/pg_wchar.h"
/* enough to hold largest source or result string, including NUL */
#define BUFSZ 256
@@ -54,7 +55,7 @@ initcap_wbnext(void *state)
while (wbstate->offset < wbstate->len &&
wbstate->str[wbstate->offset] != '\0')
{
- pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
+ char32_t u = utf8_to_unicode((unsigned char *) wbstate->str +
wbstate->offset);
bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
@@ -77,16 +78,16 @@ initcap_wbnext(void *state)
#ifdef USE_ICU
static void
-icu_test_simple(pg_wchar code)
+icu_test_simple(char32_t code)
{
- pg_wchar lower = unicode_lowercase_simple(code);
- pg_wchar title = unicode_titlecase_simple(code);
- pg_wchar upper = unicode_uppercase_simple(code);
- pg_wchar fold = unicode_casefold_simple(code);
- pg_wchar iculower = u_tolower(code);
- pg_wchar icutitle = u_totitle(code);
- pg_wchar icuupper = u_toupper(code);
- pg_wchar icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
+ char32_t lower = unicode_lowercase_simple(code);
+ char32_t title = unicode_titlecase_simple(code);
+ char32_t upper = unicode_uppercase_simple(code);
+ char32_t fold = unicode_casefold_simple(code);
+ char32_t iculower = u_tolower(code);
+ char32_t icutitle = u_totitle(code);
+ char32_t icuupper = u_toupper(code);
+ char32_t icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
if (lower != iculower || title != icutitle || upper != icuupper ||
fold != icufold)
@@ -172,7 +173,7 @@ test_icu(void)
int successful = 0;
int skipped_mismatch = 0;
- for (pg_wchar code = 0; code <= 0x10ffff; code++)
+ for (char32_t code = 0; code <= 0x10ffff; code++)
{
pg_unicode_category category = unicode_category(code);
diff --git a/src/common/unicode/category_test.c b/src/common/unicode/category_test.c
index 5d37ba39196..1e8c1f7905f 100644
--- a/src/common/unicode/category_test.c
+++ b/src/common/unicode/category_test.c
@@ -22,6 +22,7 @@
#include "common/unicode_category.h"
#include "common/unicode_version.h"
+#include "mb/pg_wchar.h"
static int pg_unicode_version = 0;
#ifdef USE_ICU
@@ -59,7 +60,7 @@ icu_test()
int pg_skipped_codepoints = 0;
int icu_skipped_codepoints = 0;
- for (pg_wchar code = 0; code <= 0x10ffff; code++)
+ for (char32_t code = 0; code <= 0x10ffff; code++)
{
uint8_t pg_category = unicode_category(code);
uint8_t icu_category = u_charType(code);
diff --git a/src/common/unicode/generate-norm_test_table.pl b/src/common/unicode/generate-norm_test_table.pl
index 1b401be9409..1a8b908ff33 100644
--- a/src/common/unicode/generate-norm_test_table.pl
+++ b/src/common/unicode/generate-norm_test_table.pl
@@ -47,8 +47,8 @@ print $OUTPUT <<HEADER;
typedef struct
{
int linenum;
- pg_wchar input[50];
- pg_wchar output[4][50];
+ char32_t input[50];
+ char32_t output[4][50];
} pg_unicode_test;
/* test table */
diff --git a/src/common/unicode/generate-unicode_case_table.pl b/src/common/unicode/generate-unicode_case_table.pl
index 5d9ddd62803..f71eb25c94e 100644
--- a/src/common/unicode/generate-unicode_case_table.pl
+++ b/src/common/unicode/generate-unicode_case_table.pl
@@ -270,7 +270,6 @@ print $OT <<"EOS";
*/
#include "common/unicode_case.h"
-#include "mb/pg_wchar.h"
/*
* The maximum number of codepoints that can result from case mapping
@@ -297,7 +296,7 @@ typedef enum
typedef struct
{
int16 conditions;
- pg_wchar map[NCaseKind][MAX_CASE_EXPANSION];
+ char32_t map[NCaseKind][MAX_CASE_EXPANSION];
} pg_special_case;
/*
@@ -430,7 +429,7 @@ foreach my $kind ('lower', 'title', 'upper', 'fold')
* The entry case_map_${kind}[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_$kind\[$index\] =
+static const char32_t case_map_$kind\[$index\] =
{
EOS
@@ -502,7 +501,7 @@ print $OT <<"EOS";
* the offset into the mapping tables.
*/
static inline uint16
-case_index(pg_wchar cp)
+case_index(char32_t cp)
{
/* Fast path for codepoints < $fastpath_limit */
if (cp < $fastpath_limit)
diff --git a/src/common/unicode/generate-unicode_category_table.pl b/src/common/unicode/generate-unicode_category_table.pl
index abab5cd9696..7e094b13720 100644
--- a/src/common/unicode/generate-unicode_category_table.pl
+++ b/src/common/unicode/generate-unicode_category_table.pl
@@ -366,15 +366,15 @@ print $OT <<"EOS";
*/
typedef struct
{
- uint32 first; /* Unicode codepoint */
- uint32 last; /* Unicode codepoint */
+ char32_t first; /* Unicode codepoint */
+ char32_t last; /* Unicode codepoint */
uint8 category; /* General Category */
} pg_category_range;
typedef struct
{
- uint32 first; /* Unicode codepoint */
- uint32 last; /* Unicode codepoint */
+ char32_t first; /* Unicode codepoint */
+ char32_t last; /* Unicode codepoint */
} pg_unicode_range;
typedef struct
diff --git a/src/common/unicode/norm_test.c b/src/common/unicode/norm_test.c
index 25bc59463f2..058817f1719 100644
--- a/src/common/unicode/norm_test.c
+++ b/src/common/unicode/norm_test.c
@@ -20,7 +20,7 @@
#include "norm_test_table.h"
static char *
-print_wchar_str(const pg_wchar *s)
+print_wchar_str(const char32_t *s)
{
#define BUF_DIGITS 50
static char buf[BUF_DIGITS * 11 + 1];
@@ -41,7 +41,7 @@ print_wchar_str(const pg_wchar *s)
}
static int
-pg_wcscmp(const pg_wchar *s1, const pg_wchar *s2)
+pg_wcscmp(const char32_t *s1, const char32_t *s2)
{
for (;;)
{
@@ -65,7 +65,7 @@ main(int argc, char **argv)
{
for (int form = 0; form < 4; form++)
{
- pg_wchar *result;
+ char32_t *result;
result = unicode_normalize(form, test->input);
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index 073faf6a0d5..e5e494db43c 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -30,7 +30,7 @@ enum CaseMapResult
/*
* Map for each case kind.
*/
-static const pg_wchar *const casekind_map[NCaseKind] =
+static const char32_t *const casekind_map[NCaseKind] =
{
[CaseLower] = case_map_lower,
[CaseTitle] = case_map_title,
@@ -38,42 +38,42 @@ static const pg_wchar *const casekind_map[NCaseKind] =
[CaseFold] = case_map_fold,
};
-static pg_wchar find_case_map(pg_wchar ucs, const pg_wchar *map);
+static char32_t find_case_map(char32_t ucs, const char32_t *map);
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
void *wbstate);
-static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
+static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full,
const char *src, size_t srclen, size_t srcoff,
- pg_wchar *simple, const pg_wchar **special);
+ char32_t *simple, const char32_t **special);
-pg_wchar
-unicode_lowercase_simple(pg_wchar code)
+char32_t
+unicode_lowercase_simple(char32_t code)
{
- pg_wchar cp = find_case_map(code, case_map_lower);
+ char32_t cp = find_case_map(code, case_map_lower);
return cp != 0 ? cp : code;
}
-pg_wchar
-unicode_titlecase_simple(pg_wchar code)
+char32_t
+unicode_titlecase_simple(char32_t code)
{
- pg_wchar cp = find_case_map(code, case_map_title);
+ char32_t cp = find_case_map(code, case_map_title);
return cp != 0 ? cp : code;
}
-pg_wchar
-unicode_uppercase_simple(pg_wchar code)
+char32_t
+unicode_uppercase_simple(char32_t code)
{
- pg_wchar cp = find_case_map(code, case_map_upper);
+ char32_t cp = find_case_map(code, case_map_upper);
return cp != 0 ? cp : code;
}
-pg_wchar
-unicode_casefold_simple(pg_wchar code)
+char32_t
+unicode_casefold_simple(char32_t code)
{
- pg_wchar cp = find_case_map(code, case_map_fold);
+ char32_t cp = find_case_map(code, case_map_fold);
return cp != 0 ? cp : code;
}
@@ -231,10 +231,10 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
{
- pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
+ char32_t u1 = utf8_to_unicode((unsigned char *) src + srcoff);
int u1len = unicode_utf8len(u1);
- pg_wchar simple = 0;
- const pg_wchar *special = NULL;
+ char32_t simple = 0;
+ const char32_t *special = NULL;
enum CaseMapResult casemap_result;
if (str_casekind == CaseTitle)
@@ -265,8 +265,8 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
case CASEMAP_SIMPLE:
{
/* replace with single character */
- pg_wchar u2 = simple;
- pg_wchar u2len = unicode_utf8len(u2);
+ char32_t u2 = simple;
+ char32_t u2len = unicode_utf8len(u2);
Assert(special == NULL);
if (result_len + u2len <= dstsize)
@@ -280,7 +280,7 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
Assert(simple == 0);
for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
{
- pg_wchar u2 = special[i];
+ char32_t u2 = special[i];
size_t u2len = unicode_utf8len(u2);
if (result_len + u2len <= dstsize)
@@ -320,7 +320,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
{
- pg_wchar curr = utf8_to_unicode(str + i);
+ char32_t curr = utf8_to_unicode(str + i);
if (pg_u_prop_case_ignorable(curr))
continue;
@@ -344,7 +344,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
{
- pg_wchar curr = utf8_to_unicode(str + i);
+ char32_t curr = utf8_to_unicode(str + i);
if (pg_u_prop_case_ignorable(curr))
continue;
@@ -394,9 +394,9 @@ check_special_conditions(int conditions, const char *str, size_t len,
* character without modification.
*/
static enum CaseMapResult
-casemap(pg_wchar u1, CaseKind casekind, bool full,
+casemap(char32_t u1, CaseKind casekind, bool full,
const char *src, size_t srclen, size_t srcoff,
- pg_wchar *simple, const pg_wchar **special)
+ char32_t *simple, const char32_t **special)
{
uint16 idx;
@@ -434,8 +434,8 @@ casemap(pg_wchar u1, CaseKind casekind, bool full,
* Find entry in simple case map.
* If the entry does not exist, 0 will be returned.
*/
-static pg_wchar
-find_case_map(pg_wchar ucs, const pg_wchar *map)
+static char32_t
+find_case_map(char32_t ucs, const char32_t *map)
{
/* Fast path for codepoints < 0x80 */
if (ucs < 0x80)
diff --git a/src/common/unicode_category.c b/src/common/unicode_category.c
index 4136c4d4f92..aab667a7bb4 100644
--- a/src/common/unicode_category.c
+++ b/src/common/unicode_category.c
@@ -1,7 +1,7 @@
/*-------------------------------------------------------------------------
* unicode_category.c
* Determine general category and character properties of Unicode
- * characters. Encoding must be UTF8, where we assume that the pg_wchar
+ * characters. Encoding must be UTF8, where we assume that the char32_t
* representation is a code point.
*
* Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
@@ -76,13 +76,13 @@
#define PG_U_CHARACTER_TAB 0x09
static bool range_search(const pg_unicode_range *tbl, size_t size,
- pg_wchar code);
+ char32_t code);
/*
* Unicode general category for the given codepoint.
*/
pg_unicode_category
-unicode_category(pg_wchar code)
+unicode_category(char32_t code)
{
int min = 0;
int mid;
@@ -108,7 +108,7 @@ unicode_category(pg_wchar code)
}
bool
-pg_u_prop_alphabetic(pg_wchar code)
+pg_u_prop_alphabetic(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_ALPHABETIC;
@@ -119,7 +119,7 @@ pg_u_prop_alphabetic(pg_wchar code)
}
bool
-pg_u_prop_lowercase(pg_wchar code)
+pg_u_prop_lowercase(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_LOWERCASE;
@@ -130,7 +130,7 @@ pg_u_prop_lowercase(pg_wchar code)
}
bool
-pg_u_prop_uppercase(pg_wchar code)
+pg_u_prop_uppercase(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_UPPERCASE;
@@ -141,7 +141,7 @@ pg_u_prop_uppercase(pg_wchar code)
}
bool
-pg_u_prop_cased(pg_wchar code)
+pg_u_prop_cased(char32_t code)
{
uint32 category_mask;
@@ -156,7 +156,7 @@ pg_u_prop_cased(pg_wchar code)
}
bool
-pg_u_prop_case_ignorable(pg_wchar code)
+pg_u_prop_case_ignorable(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_CASE_IGNORABLE;
@@ -167,7 +167,7 @@ pg_u_prop_case_ignorable(pg_wchar code)
}
bool
-pg_u_prop_white_space(pg_wchar code)
+pg_u_prop_white_space(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_WHITE_SPACE;
@@ -178,7 +178,7 @@ pg_u_prop_white_space(pg_wchar code)
}
bool
-pg_u_prop_hex_digit(pg_wchar code)
+pg_u_prop_hex_digit(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_HEX_DIGIT;
@@ -189,7 +189,7 @@ pg_u_prop_hex_digit(pg_wchar code)
}
bool
-pg_u_prop_join_control(pg_wchar code)
+pg_u_prop_join_control(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_JOIN_CONTROL;
@@ -208,7 +208,7 @@ pg_u_prop_join_control(pg_wchar code)
*/
bool
-pg_u_isdigit(pg_wchar code, bool posix)
+pg_u_isdigit(char32_t code, bool posix)
{
if (posix)
return ('0' <= code && code <= '9');
@@ -217,19 +217,19 @@ pg_u_isdigit(pg_wchar code, bool posix)
}
bool
-pg_u_isalpha(pg_wchar code)
+pg_u_isalpha(char32_t code)
{
return pg_u_prop_alphabetic(code);
}
bool
-pg_u_isalnum(pg_wchar code, bool posix)
+pg_u_isalnum(char32_t code, bool posix)
{
return pg_u_isalpha(code) || pg_u_isdigit(code, posix);
}
bool
-pg_u_isword(pg_wchar code)
+pg_u_isword(char32_t code)
{
uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
@@ -240,32 +240,32 @@ pg_u_isword(pg_wchar code)
}
bool
-pg_u_isupper(pg_wchar code)
+pg_u_isupper(char32_t code)
{
return pg_u_prop_uppercase(code);
}
bool
-pg_u_islower(pg_wchar code)
+pg_u_islower(char32_t code)
{
return pg_u_prop_lowercase(code);
}
bool
-pg_u_isblank(pg_wchar code)
+pg_u_isblank(char32_t code)
{
return code == PG_U_CHARACTER_TAB ||
unicode_category(code) == PG_U_SPACE_SEPARATOR;
}
bool
-pg_u_iscntrl(pg_wchar code)
+pg_u_iscntrl(char32_t code)
{
return unicode_category(code) == PG_U_CONTROL;
}
bool
-pg_u_isgraph(pg_wchar code)
+pg_u_isgraph(char32_t code)
{
uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
@@ -276,7 +276,7 @@ pg_u_isgraph(pg_wchar code)
}
bool
-pg_u_isprint(pg_wchar code)
+pg_u_isprint(char32_t code)
{
pg_unicode_category category = unicode_category(code);
@@ -287,7 +287,7 @@ pg_u_isprint(pg_wchar code)
}
bool
-pg_u_ispunct(pg_wchar code, bool posix)
+pg_u_ispunct(char32_t code, bool posix)
{
uint32 category_mask;
@@ -308,13 +308,13 @@ pg_u_ispunct(pg_wchar code, bool posix)
}
bool
-pg_u_isspace(pg_wchar code)
+pg_u_isspace(char32_t code)
{
return pg_u_prop_white_space(code);
}
bool
-pg_u_isxdigit(pg_wchar code, bool posix)
+pg_u_isxdigit(char32_t code, bool posix)
{
if (posix)
return (('0' <= code && code <= '9') ||
@@ -478,7 +478,7 @@ unicode_category_abbrev(pg_unicode_category category)
* given table.
*/
static bool
-range_search(const pg_unicode_range *tbl, size_t size, pg_wchar code)
+range_search(const pg_unicode_range *tbl, size_t size, char32_t code)
{
int min = 0;
int mid;
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c
index 6654b4cbc49..489d99cd5ab 100644
--- a/src/common/unicode_norm.c
+++ b/src/common/unicode_norm.c
@@ -69,7 +69,7 @@ conv_compare(const void *p1, const void *p2)
* lookup, while the frontend version uses a binary search.
*/
static const pg_unicode_decomposition *
-get_code_entry(pg_wchar code)
+get_code_entry(char32_t code)
{
#ifndef FRONTEND
int h;
@@ -109,7 +109,7 @@ get_code_entry(pg_wchar code)
* Get the combining class of the given codepoint.
*/
static uint8
-get_canonical_class(pg_wchar code)
+get_canonical_class(char32_t code)
{
const pg_unicode_decomposition *entry = get_code_entry(code);
@@ -130,15 +130,15 @@ get_canonical_class(pg_wchar code)
* Note: the returned pointer can point to statically allocated buffer, and
* is only valid until next call to this function!
*/
-static const pg_wchar *
+static const char32_t *
get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
{
- static pg_wchar x;
+ static char32_t x;
if (DECOMPOSITION_IS_INLINE(entry))
{
Assert(DECOMPOSITION_SIZE(entry) == 1);
- x = (pg_wchar) entry->dec_index;
+ x = (char32_t) entry->dec_index;
*dec_size = 1;
return &x;
}
@@ -156,7 +156,7 @@ get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
* are, in turn, decomposable.
*/
static int
-get_decomposed_size(pg_wchar code, bool compat)
+get_decomposed_size(char32_t code, bool compat)
{
const pg_unicode_decomposition *entry;
int size = 0;
@@ -318,7 +318,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
* in the array result.
*/
static void
-decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
+decompose_code(char32_t code, bool compat, char32_t **result, int *current)
{
const pg_unicode_decomposition *entry;
int i;
@@ -337,7 +337,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
v,
tindex,
sindex;
- pg_wchar *res = *result;
+ char32_t *res = *result;
sindex = code - SBASE;
l = LBASE + sindex / (VCOUNT * TCOUNT);
@@ -369,7 +369,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
(!compat && DECOMPOSITION_IS_COMPAT(entry)))
{
- pg_wchar *res = *result;
+ char32_t *res = *result;
res[*current] = code;
(*current)++;
@@ -382,7 +382,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
decomp = get_code_decomposition(entry, &dec_size);
for (i = 0; i < dec_size; i++)
{
- pg_wchar lcode = (pg_wchar) decomp[i];
+ char32_t lcode = (char32_t) decomp[i];
/* Leave if no more decompositions */
decompose_code(lcode, compat, result, current);
@@ -398,17 +398,17 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
* malloc. Or NULL if we run out of memory. In backend, the returned
* string is palloc'd instead, and OOM is reported with ereport().
*/
-pg_wchar *
-unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
+char32_t *
+unicode_normalize(UnicodeNormalizationForm form, const char32_t *input)
{
bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
- pg_wchar *decomp_chars;
- pg_wchar *recomp_chars;
+ char32_t *decomp_chars;
+ char32_t *recomp_chars;
int decomp_size,
current_size;
int count;
- const pg_wchar *p;
+ const char32_t *p;
/* variables for recomposition */
int last_class;
@@ -425,7 +425,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
for (p = input; *p; p++)
decomp_size += get_decomposed_size(*p, compat);
- decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
+ decomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
if (decomp_chars == NULL)
return NULL;
@@ -448,9 +448,9 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
*/
for (count = 1; count < decomp_size; count++)
{
- pg_wchar prev = decomp_chars[count - 1];
- pg_wchar next = decomp_chars[count];
- pg_wchar tmp;
+ char32_t prev = decomp_chars[count - 1];
+ char32_t next = decomp_chars[count];
+ char32_t tmp;
const uint8 prevClass = get_canonical_class(prev);
const uint8 nextClass = get_canonical_class(next);
@@ -487,7 +487,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
* longer than the decomposed one, so make the allocation of the output
* string based on that assumption.
*/
- recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
+ recomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
if (!recomp_chars)
{
FREE(decomp_chars);
@@ -501,9 +501,9 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
for (count = 1; count < decomp_size; count++)
{
- pg_wchar ch = decomp_chars[count];
+ char32_t ch = decomp_chars[count];
int ch_class = get_canonical_class(ch);
- pg_wchar composite;
+ char32_t composite;
if (last_class < ch_class &&
recompose_code(starter_ch, ch, &composite))
@@ -524,7 +524,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
recomp_chars[target_pos++] = ch;
}
}
- recomp_chars[target_pos] = (pg_wchar) '\0';
+ recomp_chars[target_pos] = (char32_t) '\0';
FREE(decomp_chars);
@@ -540,7 +540,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
#ifndef FRONTEND
static const pg_unicode_normprops *
-qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
+qc_hash_lookup(char32_t ch, const pg_unicode_norminfo *norminfo)
{
int h;
uint32 hashkey;
@@ -571,7 +571,7 @@ qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
* Look up the normalization quick check character property
*/
static UnicodeNormalizationQC
-qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
+qc_is_allowed(UnicodeNormalizationForm form, char32_t ch)
{
const pg_unicode_normprops *found = NULL;
@@ -595,7 +595,7 @@ qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
}
UnicodeNormalizationQC
-unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
+unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const char32_t *input)
{
uint8 lastCanonicalClass = 0;
UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
@@ -610,9 +610,9 @@ unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *
if (form == UNICODE_NFD || form == UNICODE_NFKD)
return UNICODE_NORM_QC_MAYBE;
- for (const pg_wchar *p = input; *p; p++)
+ for (const char32_t *p = input; *p; p++)
{
- pg_wchar ch = *p;
+ char32_t ch = *p;
uint8 canonicalClass;
UnicodeNormalizationQC check;
diff --git a/src/fe_utils/mbprint.c b/src/fe_utils/mbprint.c
index eb3eeee9925..abffdbe18a2 100644
--- a/src/fe_utils/mbprint.c
+++ b/src/fe_utils/mbprint.c
@@ -49,20 +49,20 @@ pg_get_utf8_id(void)
*
* No error checks here, c must point to a long-enough string.
*/
-static pg_wchar
+static char32_t
utf8_to_unicode(const unsigned char *c)
{
if ((*c & 0x80) == 0)
- return (pg_wchar) c[0];
+ return (char32_t) c[0];
else if ((*c & 0xe0) == 0xc0)
- return (pg_wchar) (((c[0] & 0x1f) << 6) |
+ return (char32_t) (((c[0] & 0x1f) << 6) |
(c[1] & 0x3f));
else if ((*c & 0xf0) == 0xe0)
- return (pg_wchar) (((c[0] & 0x0f) << 12) |
+ return (char32_t) (((c[0] & 0x0f) << 12) |
((c[1] & 0x3f) << 6) |
(c[2] & 0x3f));
else if ((*c & 0xf8) == 0xf0)
- return (pg_wchar) (((c[0] & 0x07) << 18) |
+ return (char32_t) (((c[0] & 0x07) << 18) |
((c[1] & 0x3f) << 12) |
((c[2] & 0x3f) << 6) |
(c[3] & 0x3f));
diff --git a/src/include/c.h b/src/include/c.h
index 9ab5e617995..ce48732311e 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -513,6 +513,24 @@ typedef void (*pg_funcptr_t) (void);
#include <stdbool.h>
+/*
+ * char16_t and char32_t
+ * UTF-16 code units and Unicode code points (UTF-32), respectively.
+ *
+ * uchar.h should always be available in C11, but it's missing on macOS, so
+ * define the types ourselves there -- except in C++11, where they are
+ * keywords and must not be redefined. XXX: when uchar.h is available
+ * everywhere, remove this check and include uchar.h unconditionally.
+ */
+#ifdef HAVE_UCHAR_H
+#include <uchar.h>
+#else
+#ifndef __cplusplus
+typedef uint16_t char16_t;
+typedef uint32_t char32_t;
+#endif
+#endif
+
/* ----------------------------------------------------------------
* Section 3: standard system types
diff --git a/src/include/common/unicode_case.h b/src/include/common/unicode_case.h
index 41e2c1f4b33..6bcffd349c2 100644
--- a/src/include/common/unicode_case.h
+++ b/src/include/common/unicode_case.h
@@ -14,14 +14,12 @@
#ifndef UNICODE_CASE_H
#define UNICODE_CASE_H
-#include "mb/pg_wchar.h"
-
typedef size_t (*WordBoundaryNext) (void *wbstate);
-pg_wchar unicode_lowercase_simple(pg_wchar code);
-pg_wchar unicode_titlecase_simple(pg_wchar code);
-pg_wchar unicode_uppercase_simple(pg_wchar code);
-pg_wchar unicode_casefold_simple(pg_wchar code);
+char32_t unicode_lowercase_simple(char32_t code);
+char32_t unicode_titlecase_simple(char32_t code);
+char32_t unicode_uppercase_simple(char32_t code);
+char32_t unicode_casefold_simple(char32_t code);
size_t unicode_strlower(char *dst, size_t dstsize, const char *src,
ssize_t srclen, bool full);
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src,
diff --git a/src/include/common/unicode_case_table.h b/src/include/common/unicode_case_table.h
index d5311786582..0a14fb2d97b 100644
--- a/src/include/common/unicode_case_table.h
+++ b/src/include/common/unicode_case_table.h
@@ -18,7 +18,6 @@
*/
#include "common/unicode_case.h"
-#include "mb/pg_wchar.h"
/*
* The maximum number of codepoints that can result from case mapping
@@ -45,7 +44,7 @@ typedef enum
typedef struct
{
int16 conditions;
- pg_wchar map[NCaseKind][MAX_CASE_EXPANSION];
+ char32_t map[NCaseKind][MAX_CASE_EXPANSION];
} pg_special_case;
/*
@@ -166,7 +165,7 @@ static const pg_special_case special_case[106] =
* The entry case_map_lower[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_lower[1704] =
+static const char32_t case_map_lower[1704] =
{
0x000000, /* reserved */
0x000000, /* U+000000 */
@@ -1879,7 +1878,7 @@ static const pg_wchar case_map_lower[1704] =
* The entry case_map_title[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_title[1704] =
+static const char32_t case_map_title[1704] =
{
0x000000, /* reserved */
0x000000, /* U+000000 */
@@ -3592,7 +3591,7 @@ static const pg_wchar case_map_title[1704] =
* The entry case_map_upper[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_upper[1704] =
+static const char32_t case_map_upper[1704] =
{
0x000000, /* reserved */
0x000000, /* U+000000 */
@@ -5305,7 +5304,7 @@ static const pg_wchar case_map_upper[1704] =
* The entry case_map_fold[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_fold[1704] =
+static const char32_t case_map_fold[1704] =
{
0x000000, /* reserved */
0x000000, /* U+000000 */
@@ -13522,7 +13521,7 @@ static const uint16 case_map[4778] =
* the offset into the mapping tables.
*/
static inline uint16
-case_index(pg_wchar cp)
+case_index(char32_t cp)
{
/* Fast path for codepoints < 0x0588 */
if (cp < 0x0588)
diff --git a/src/include/common/unicode_category.h b/src/include/common/unicode_category.h
index 8fd8b67a416..684143d3c8a 100644
--- a/src/include/common/unicode_category.h
+++ b/src/include/common/unicode_category.h
@@ -14,8 +14,6 @@
#ifndef UNICODE_CATEGORY_H
#define UNICODE_CATEGORY_H
-#include "mb/pg_wchar.h"
-
/*
* Unicode General Category Values
*
@@ -61,31 +59,31 @@ typedef enum pg_unicode_category
PG_U_FINAL_PUNCTUATION = 29 /* Pf */
} pg_unicode_category;
-extern pg_unicode_category unicode_category(pg_wchar code);
+extern pg_unicode_category unicode_category(char32_t code);
extern const char *unicode_category_string(pg_unicode_category category);
extern const char *unicode_category_abbrev(pg_unicode_category category);
-extern bool pg_u_prop_alphabetic(pg_wchar code);
-extern bool pg_u_prop_lowercase(pg_wchar code);
-extern bool pg_u_prop_uppercase(pg_wchar code);
-extern bool pg_u_prop_cased(pg_wchar code);
-extern bool pg_u_prop_case_ignorable(pg_wchar code);
-extern bool pg_u_prop_white_space(pg_wchar code);
-extern bool pg_u_prop_hex_digit(pg_wchar code);
-extern bool pg_u_prop_join_control(pg_wchar code);
+extern bool pg_u_prop_alphabetic(char32_t code);
+extern bool pg_u_prop_lowercase(char32_t code);
+extern bool pg_u_prop_uppercase(char32_t code);
+extern bool pg_u_prop_cased(char32_t code);
+extern bool pg_u_prop_case_ignorable(char32_t code);
+extern bool pg_u_prop_white_space(char32_t code);
+extern bool pg_u_prop_hex_digit(char32_t code);
+extern bool pg_u_prop_join_control(char32_t code);
-extern bool pg_u_isdigit(pg_wchar code, bool posix);
-extern bool pg_u_isalpha(pg_wchar code);
-extern bool pg_u_isalnum(pg_wchar code, bool posix);
-extern bool pg_u_isword(pg_wchar code);
-extern bool pg_u_isupper(pg_wchar code);
-extern bool pg_u_islower(pg_wchar code);
-extern bool pg_u_isblank(pg_wchar code);
-extern bool pg_u_iscntrl(pg_wchar code);
-extern bool pg_u_isgraph(pg_wchar code);
-extern bool pg_u_isprint(pg_wchar code);
-extern bool pg_u_ispunct(pg_wchar code, bool posix);
-extern bool pg_u_isspace(pg_wchar code);
-extern bool pg_u_isxdigit(pg_wchar code, bool posix);
+extern bool pg_u_isdigit(char32_t code, bool posix);
+extern bool pg_u_isalpha(char32_t code);
+extern bool pg_u_isalnum(char32_t code, bool posix);
+extern bool pg_u_isword(char32_t code);
+extern bool pg_u_isupper(char32_t code);
+extern bool pg_u_islower(char32_t code);
+extern bool pg_u_isblank(char32_t code);
+extern bool pg_u_iscntrl(char32_t code);
+extern bool pg_u_isgraph(char32_t code);
+extern bool pg_u_isprint(char32_t code);
+extern bool pg_u_ispunct(char32_t code, bool posix);
+extern bool pg_u_isspace(char32_t code);
+extern bool pg_u_isxdigit(char32_t code, bool posix);
#endif /* UNICODE_CATEGORY_H */
diff --git a/src/include/common/unicode_category_table.h b/src/include/common/unicode_category_table.h
index 95a1c65da7e..466a41b72b0 100644
--- a/src/include/common/unicode_category_table.h
+++ b/src/include/common/unicode_category_table.h
@@ -20,15 +20,15 @@
*/
typedef struct
{
- uint32 first; /* Unicode codepoint */
- uint32 last; /* Unicode codepoint */
+ char32_t first; /* Unicode codepoint */
+ char32_t last; /* Unicode codepoint */
uint8 category; /* General Category */
} pg_category_range;
typedef struct
{
- uint32 first; /* Unicode codepoint */
- uint32 last; /* Unicode codepoint */
+ char32_t first; /* Unicode codepoint */
+ char32_t last; /* Unicode codepoint */
} pg_unicode_range;
typedef struct
diff --git a/src/include/common/unicode_norm.h b/src/include/common/unicode_norm.h
index 5bc3b79e78e..516c192cc4c 100644
--- a/src/include/common/unicode_norm.h
+++ b/src/include/common/unicode_norm.h
@@ -14,8 +14,6 @@
#ifndef UNICODE_NORM_H
#define UNICODE_NORM_H
-#include "mb/pg_wchar.h"
-
typedef enum
{
UNICODE_NFC = 0,
@@ -32,8 +30,8 @@ typedef enum
UNICODE_NORM_QC_MAYBE = -1,
} UnicodeNormalizationQC;
-extern pg_wchar *unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input);
+extern char32_t *unicode_normalize(UnicodeNormalizationForm form, const char32_t *input);
-extern UnicodeNormalizationQC unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input);
+extern UnicodeNormalizationQC unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const char32_t *input);
#endif /* UNICODE_NORM_H */
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 4b4a9974b75..4d84bdc81e4 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -532,25 +532,25 @@ typedef uint32 (*utf_local_conversion_func) (uint32 code);
* Some handy functions for Unicode-specific tests.
*/
static inline bool
-is_valid_unicode_codepoint(pg_wchar c)
+is_valid_unicode_codepoint(char32_t c)
{
return (c > 0 && c <= 0x10FFFF);
}
static inline bool
-is_utf16_surrogate_first(pg_wchar c)
+is_utf16_surrogate_first(char32_t c)
{
return (c >= 0xD800 && c <= 0xDBFF);
}
static inline bool
-is_utf16_surrogate_second(pg_wchar c)
+is_utf16_surrogate_second(char32_t c)
{
return (c >= 0xDC00 && c <= 0xDFFF);
}
-static inline pg_wchar
-surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
+static inline char32_t
+surrogate_pair_to_codepoint(char16_t first, char16_t second)
{
return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
}
@@ -561,20 +561,20 @@ surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
*
* No error checks here, c must point to a long-enough string.
*/
-static inline pg_wchar
+static inline char32_t
utf8_to_unicode(const unsigned char *c)
{
if ((*c & 0x80) == 0)
- return (pg_wchar) c[0];
+ return (char32_t) c[0];
else if ((*c & 0xe0) == 0xc0)
- return (pg_wchar) (((c[0] & 0x1f) << 6) |
+ return (char32_t) (((c[0] & 0x1f) << 6) |
(c[1] & 0x3f));
else if ((*c & 0xf0) == 0xe0)
- return (pg_wchar) (((c[0] & 0x0f) << 12) |
+ return (char32_t) (((c[0] & 0x0f) << 12) |
((c[1] & 0x3f) << 6) |
(c[2] & 0x3f));
else if ((*c & 0xf8) == 0xf0)
- return (pg_wchar) (((c[0] & 0x07) << 18) |
+ return (char32_t) (((c[0] & 0x07) << 18) |
((c[1] & 0x3f) << 12) |
((c[2] & 0x3f) << 6) |
(c[3] & 0x3f));
@@ -588,7 +588,7 @@ utf8_to_unicode(const unsigned char *c)
* unicode_utf8len(c) bytes available.
*/
static inline unsigned char *
-unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
+unicode_to_utf8(char32_t c, unsigned char *utf8string)
{
if (c <= 0x7F)
{
@@ -620,7 +620,7 @@ unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
* Number of bytes needed to represent the given char in UTF8.
*/
static inline int
-unicode_utf8len(pg_wchar c)
+unicode_utf8len(char32_t c)
{
if (c <= 0x7F)
return 1;
@@ -676,8 +676,8 @@ extern int pg_valid_server_encoding(const char *name);
extern bool is_encoding_supported_by_icu(int encoding);
extern const char *get_encoding_name_for_icu(int encoding);
-extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string);
-extern pg_wchar utf8_to_unicode(const unsigned char *c);
+extern unsigned char *unicode_to_utf8(char32_t c, unsigned char *utf8string);
+extern char32_t utf8_to_unicode(const unsigned char *c);
extern bool pg_utf8_islegal(const unsigned char *source, int length);
extern int pg_utf_mblen(const unsigned char *s);
extern int pg_mule_mblen(const unsigned char *s);
@@ -739,8 +739,8 @@ extern char *pg_server_to_client(const char *s, int len);
extern char *pg_any_to_server(const char *s, int len, int encoding);
extern char *pg_server_to_any(const char *s, int len, int encoding);
-extern void pg_unicode_to_server(pg_wchar c, unsigned char *s);
-extern bool pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s);
+extern void pg_unicode_to_server(char32_t c, unsigned char *s);
+extern bool pg_unicode_to_server_noerror(char32_t c, unsigned char *s);
extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc);
extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc);
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index c4dc5d72bdb..fab8ba4aa6a 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -463,6 +463,9 @@
/* Define to 1 if you have the <termios.h> header file. */
#undef HAVE_TERMIOS_H
+/* Define to 1 if you have the <uchar.h> header file. */
+#undef HAVE_UCHAR_H
+
/* Define to 1 if curl_global_init() is guaranteed to be thread-safe. */
#undef HAVE_THREADSAFE_CURL_GLOBAL_INIT
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index bb4e1b37005..790be386d75 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -3505,6 +3505,8 @@ cb_cleanup_dir
cb_options
cb_tablespace
cb_tablespace_mapping
+char16_t
+char32_t
check_agg_arguments_context
check_function_callback
check_network_data
--
2.43.0
On Wed, Oct 29, 2025 at 6:59 AM Jeff Davis <pgsql@j-davis.com> wrote:
So you're saying that pg_wchar is more like a union type?
typedef pg_wchar
{
char ch; /* single-byte encodings or
non-UTF8 encodings on unix */
char16_t utf16; /* windows non-UTF8 encodings */
char32_t utf32; /* UTF-8 encoding */
} pg_wchar;
(we'd have to be careful about the memory layout if we're casting,
though)
Interesting idea. I think it'd have to be something like:
typedef union
{
unsigned char ch; /* (1) single-byte encoding databases */
char32_t utf32; /* (2) UTF-8 databases */
uint32_t ascii_or_custom; /* (3) MULE, EUC_XX databases */
} pg_wchar;
Dunno if it's worth actually doing, but it's a good illustration and a
better way to explain all this than the wall of text I wrote
yesterday. The collusion between common/wchar.c and pg_locale_libc.c
is made more explicit.
I wonder if the logic to select the member/semantics could be turned
into an enum in the encoding table, to make it even clearer, and then
that could be used as an index into a table of ctype methods objects
in _libc.c. The encoding module would be declaring which pg_wchar
semantics it uses, instead of having the _libc.c module infer it from
other properties, for a more explicit contract. Or since they are
inferrable, perhaps a function in the mb module could do that and
return the enum. Hmm, perhaps that alone would be clarifying enough,
without the union type. I'm picturing something like PG_WCHAR_CHAR
(directly usable with ctype.h), PG_WCHAR_UTF32 (self-explanatory, also
assumed to be compatible with UTF-8 locales' wchar_t), PG_WCHAR_CUSTOM
(we only know that ASCII range is sane as Ishii-san explained, and for
anything else you'd need to re-encode via libc or give up, but
preferably not go nuts and return junk). The enum would create a new
central place to document the cross-module semantics.
You showed char16_t for Windows, but we don't ever get char16_t out of
wchar.c, it's always char32_t for UTF-8 input. It's just that _libc.c
truncates to UTF-16 or short-circuits to avoid overflow on that
platform (and in the past AIX 32-bit and maybe more), so it wouldn't
belong in a hypothetical union or enum.
To avoid doing hard
work for nothing (ideogram-based languages generally don't care about
ctype stuff so that'd be the vast majority of characters appearing in
Chinese/Japanese/Korean text) at the cost of having to do a bunch of
research, we could short-circuit the core CJK character
ranges,
and do the extra CPU cycles for the rest,
I don't think we should start making a bunch of assumptions like that.
Yeah, maybe not. Thought process: I had noticed that EUC was the only
relevant encoding family, and it has a character set selector, CS0 =
ASCII, and CS1, CS2, CS3 defined appropriately by the national
variants. I had noticed that at least the Japanese one can encode
Latin with accents, Greek etc (non-ASCII stuff that has a meaningful
isalpha() etc) and I took a wild guess that it might be easy to
distinguish them if they'd chosen to put those under a different CS
number. But I see now that they actually stuffed them all into CS1
along with kanji and kana, making it slightly more difficult: they're
still in different assigned "rows" though. At a guess, you can
probably identify extra punctuation (huh, that's surely relevant even
for pure Japanese text if we want ispunct to work?) and foreign
alphabets with some bitmasks. There might be something similar for
the other EUCs.
It's true that it's really not nice to carry special knowledge like
that (it's not just "assumptions", it's a set of black and white
published standards), and we should probably try hard to avoid that.
Perhaps we could at least put the conversion in a new encoding table
function pointer "pg_wchar_custom_to_wchar_t", so we could reserve a
place to put that sort of optimisation in (as opposed to making
_libc.c call char2wchar() with no hope of fast path)... that is, if
we want to do any of this at all and not just make new ctype functions
that return false for PG_WCHAR_CUSTOM with value >= 128 and call it a
day...
If we do develop this idea though, one issue to contemplate is that
EUC code points might generate more than one wchar_t, looking at
EUC_JIS_2004 [1] (https://en.wikipedia.org/wiki/JIS_X_0213). We'd need a pg_wchar_custom_to_wchar_t() signature
that takes a single pg_wchar and writes to an output array and returns
the count, and then we'd have to decide what to do if we get more than
one. Surrogates are trivial under the existing "punt" doctrine:
Windows went big on Unicode before it grew, C doesn't do wctype for
multi-wchar_t sequences, and we can't fix any of that. If it's a
(rare?) combining character sequence then uhh... same problem one
level up, I think, even on Unix? I'm not sure if we could do much
better than the "punt" path in both cases: return either false or the
input character as appropriate.
3. I assume there are some good reasons we don't do this but... if
we
used char2wchar() in the first place (= libc native wchar_t) for the
regexp stuff that calls this stuff (as we do already inside
whole-string upper/lower, just not character upper/lower or character
classification), then we could simply call the wchar_t libc functions
directly and unconditionally in the libc provider for all cases,
instead of the 8-bit variants with broken edge cases for non-UTF-8
databases.
I'm not sure about that either, but I think it's because you can end up
with surrogate pairs, which can't be represented in UTF-8.
Yeah, I think that alone is a good reason. We need PG_WCHAR_UTF32 (in
the sketch terminology above).
I wondered about PG_WCHAR_SYSTEM_WCHAR_T, that could potentially
replace PG_WCHAR_CUSTOM, in other words using system wchar_t but only
for EUC_*. The point of this would be for eg regexes to be able to
convert whole strings up-front with one libc call, rather than calling
for each character. The problem seems to be that you'd lose any
ability to deal with surrogates and combining characters as discussed
above, as you'd lose character synchronisation for want of a better
word. So I just can't see how to make this work. Which leads back to
the do-it-one-by-one idea, which then leads back to the
maybe-try-to-make-a-fast-path-for-kanji-etc idea 'cos otherwise it
sounds too expensive...
On Wed, 2025-10-29 at 14:00 +1300, Thomas Munro wrote:
I wonder if the logic to select the member/semantics could be turned
into an enum in the encoding table, to make it even clearer, and then
that could be used as an index into a table of ctype methods objects
in _libc.c.
As long as we're able to isolate that logic in the libc provider,
that's reasonable. The other providers don't need that complexity, they
just need to decode straight to UTF-32.
You showed char16_t for Windows, but we don't ever get char16_t out
of
wchar.c, it's always char32_t for UTF-8 input. It's just that
_libc.c
truncates to UTF-16 or short-circuits to avoid overflow on that
platform (and in the past AIX 32-bit and maybe more), so it wouldn't
belong in a hypothetical union or enum.
Oh, I see.
Perhaps we could at least put the conversion in a new encoding table
function pointer "pg_wchar_custom_to_wchar_t", so we could reserve a
place to put that sort of optimisation in
That sounds like a good step forward. And maybe one to convert to UTF-
32 for ICU, also?
If we do develop this idea though, one issue to contemplate is that
EUC code points might generate more than one wchar_t, looking at
EUC_JIS_2004[1].
Wow, that's unfortunate.
Regards,
Jeff Davis
On Wed, Oct 29, 2025 at 2:00 PM Thomas Munro <thomas.munro@gmail.com> wrote:
I'm picturing something like PG_WCHAR_CHAR
(directly usable with ctype.h), PG_WCHAR_UTF32 (self-explanatory, also
assumed to be compatible with UTF-8 locales' wchar_t), PG_WCHAR_CUSTOM
(we only know that ASCII range is sane as Ishii-san explained, and for
anything else you'd need to re-encode via libc or give up, but
preferably not go nuts and return junk). The enum would create a new
central place to document the cross-module semantics.
Here are some sketch-quality patches to try out some of these ideas,
for discussion. I gave them .txt endings so as not to hijack your
thread's CI.
* Fixing a different but related bug spotted in passing: we truncate
codepoints passed to Windows' iswalpha_l() et al, instead of detecting
overflow like some other places do. Not tested on Windows, but it
seemed pretty obviously wrong?
* Classifying all pg_wchar encodings as producing PG_WCHAR_CHAR,
PG_WCHAR_UTF32 or PG_WCHAR_CUSTOM, and dispatching to libc ctype
methods based on that.
* Easy EUC change: filtering out non-ASCII for _CUSTOM. I can't seem
to convince SQL-level regexes to expose bogus results on master
though... maybe the pg_wchar encoding actively avoids that by shifting
values up so you often or always cast to a harmless value? Still
better to formalise that I think, if we don't move ahead with the more
ambitious plan...
* More ambitious re-encoding strategy, replacing previous change, with
apparently plausible results.
* Various refactorings with helper macros to avoid making mistakes in
all that repetitive wrapper stuff.
Here's what my ja_JP.eucJP database shows, on FreeBSD. BTW in my
earlier emails I was confused and thought that kanji would not be in
class [[:alpha:]], but that's wrong: Unicode calls it "other letter",
and it looks like that makes all modern libcs return true for
iswalpha():
postgres=# select regexp_replace('1234 Постгрес 5678', '[[:alpha:]]+', '象');
regexp_replace
----------------
1234 象 5678
(1 row)
postgres=# select regexp_replace('1234 ポスグレ 5678', '[[:alpha:]]+', '象');
regexp_replace
----------------
1234 象 5678
(1 row)
postgres=# select regexp_replace('1234 ポスグレ? 5678', '[[:punct:]]+', '。');
regexp_replace
----------------------
1234 ポスグレ。 5678
(1 row)
(That's not an ASCII question mark, it's one of the kanji-box sized
punctuation characters.)
I had to hack regc_pg_locale.c slightly to teach it that just because
I set max_chr to 127 it doesn't mean I want it to turn locale support
off. Haven't looked into that code to figure out what it should do
instead, but it definitely shouldn't be allowed to probe made up
pg_wchar values, because EUC's pg_wchar encoding is sparse and
transcoding can error out.
A mystery that blocked me for too long: regexp_match('café', 'CAFÉ',
'i') and regexp_match('Αθήνα', 'ΑΘΉΝΑ', 'i') match with Apple's
ja_JP.eucJP as do the examples above, but mysteriously didn't on
FreeBSD's where this code started, could be a bug in its ja_JP.eucJP
locale affecting toupper/tolower... Wish I could get that time back.
I imagine that for the ICU + non-UTF-8 locale bug you mentioned, we
might need a very similar set of re-encoding wrappers: something like
pg_wchar -> mb -> UTF-8 -> UTF-32. All this re-encoding sounds
pretty bad, but I can't see any way around the re-encoding with these
edge-case configurations, and we're still supposed to spit out correct
answers...
Attachments:
0001-Fix-Windows-wctype.h-usage-for-codepoints-outside-Unic.txttext/plain; charset=US-ASCII; name=0001-Fix-Windows-wctype.h-usage-for-codepoints-outside-Unic.txtDownload
From 5525b5e35121bdfd5eb566b7a08916fe90822422 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Wed, 29 Oct 2025 15:53:46 +1300
Subject: [PATCH 1/8] Fix Windows wctype.h usage for codepoints outside Unicode
BMP.
Windows' wchar_t is only 16 bits wide. As established by the
towupper_l()/towlower_l() wrapper functions, we should avoid truncating
overflowing code points when calling wctype.h functions, and just return
false. Windows just can't answer that question, but it didn't make
sense to return the answer for a totally different character.
---
src/backend/utils/adt/pg_locale_libc.c | 27 +++++++++++++++++---------
1 file changed, 18 insertions(+), 9 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 9c7fcd1fc7a..761ed1a0603 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -187,55 +187,64 @@ wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool
wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
{
- return iswdigit_l((wint_t) wc, locale->lt);
+ return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
+ iswdigit_l((wint_t) wc, locale->lt);
}
static bool
wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale)
{
- return iswalpha_l((wint_t) wc, locale->lt);
+ return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
+ iswalpha_l((wint_t) wc, locale->lt);
}
static bool
wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale)
{
- return iswalnum_l((wint_t) wc, locale->lt);
+ return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
+ iswalnum_l((wint_t) wc, locale->lt);
}
static bool
wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale)
{
- return iswupper_l((wint_t) wc, locale->lt);
+ return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
+ iswupper_l((wint_t) wc, locale->lt);
}
static bool
wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale)
{
- return iswlower_l((wint_t) wc, locale->lt);
+ return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
+ iswlower_l((wint_t) wc, locale->lt);
}
static bool
wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale)
{
- return iswgraph_l((wint_t) wc, locale->lt);
+ return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
+ iswgraph_l((wint_t) wc, locale->lt);
}
static bool
wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale)
{
- return iswprint_l((wint_t) wc, locale->lt);
+ return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
+ iswprint_l((wint_t) wc, locale->lt);
}
static bool
wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale)
{
- return iswpunct_l((wint_t) wc, locale->lt);
+ return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
+ iswpunct_l((wint_t) wc, locale->lt);
}
static bool
wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
{
- return iswspace_l((wint_t) wc, locale->lt);
+ return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
+ iswspace_l((wint_t) wc, locale->lt);
}
static bool
--
2.50.1 (Apple Git-155)
0002-Formalize-pg_wchar-encoding-schemes.txttext/plain; charset=US-ASCII; name=0002-Formalize-pg_wchar-encoding-schemes.txtDownload
From 9a9026c29f3e9cd3c1b7fd92e053bcb5ecc5f6ae Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Wed, 29 Oct 2025 15:14:13 +1300
Subject: [PATCH 2/8] Formalize pg_wchar encoding schemes.
Create a bit more clarity about the different ways that pg_wchar can be
encoded, by naming the three schemes in use. This also allows a
dispatch-table format in pg_locale_libc.c.
Discussion: https://www.postgresql.org/message-id/flat/CA%2BhUKG%2BhDkp1etcfy%3DtaxJ8ybf8KapyOjqdBRPF7yaoSoSj1_w%40mail.gmail.com
---
src/backend/utils/adt/pg_locale_libc.c | 163 +++++++++++++------------
src/common/wchar.c | 94 +++++++-------
src/include/mb/pg_wchar.h | 51 ++++++++
src/tools/pgindent/typedefs.list | 1 +
4 files changed, 187 insertions(+), 122 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 761ed1a0603..1892ed3c5ce 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -43,20 +43,25 @@
* the <ctype.h> functions since those will obey LC_CTYPE. Note that these
* collations don't give a fig about multibyte characters.
*
- * 2. When working in UTF8 encoding, we use the <wctype.h> functions.
+ * 2. PG_WCHAR_UTF32 encoding scheme:
+ *
+ * When working in UTF8 encoding, we use the <wctype.h> functions.
* This assumes that every platform uses Unicode codepoints directly
* as the wchar_t representation of Unicode. On some platforms
* wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
*
- * 3. In all other encodings, we use the <ctype.h> functions for pg_wchar
+ * 3. PG_WCHAR_CHAR and PG_WCHAR_CUSTOM encoding schemes:
+ *
+ * In all other encodings, we use the <ctype.h> functions for pg_wchar
* values up to 255, and punt for values above that. This is 100% correct
- * only in single-byte encodings such as LATINn. However, non-Unicode
- * multibyte encodings are mostly Far Eastern character sets for which the
- * properties being tested here aren't very relevant for higher code values
- * anyway. The difficulty with using the <wctype.h> functions with
- * non-Unicode multibyte encodings is that we can have no certainty that
- * the platform's wchar_t representation matches what we do in pg_wchar
- * conversions.
+ * only in single-byte encodings such as LATINn (PG_WCHAR_CHAR). However,
+ * non-Unicode multibyte encodings (PG_WCHAR_CUSTOM) are all Far Eastern
+ * character sets for which the properties being tested here aren't very
+ * relevant for higher code values anyway. The difficulty with using the
+ * <wctype.h> functions with non-Unicode multibyte encodings is that we can
+ * have no certainty that the platform's wchar_t representation matches what we
+ * do in pg_wchar conversions. (MULE is also declared PG_WCHAR_CUSTOM but is
+ * not available as a multi-byte encoding in any known libc.)
*
* As a special case, in the "default" collation, (2) and (3) force ASCII
* letters to follow ASCII upcase/downcase rules, while in a non-default
@@ -331,70 +336,75 @@ tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
return wc;
}
-static const struct ctype_methods ctype_methods_libc_sb = {
- .strlower = strlower_libc_sb,
- .strtitle = strtitle_libc_sb,
- .strupper = strupper_libc_sb,
- .wc_isdigit = wc_isdigit_libc_sb,
- .wc_isalpha = wc_isalpha_libc_sb,
- .wc_isalnum = wc_isalnum_libc_sb,
- .wc_isupper = wc_isupper_libc_sb,
- .wc_islower = wc_islower_libc_sb,
- .wc_isgraph = wc_isgraph_libc_sb,
- .wc_isprint = wc_isprint_libc_sb,
- .wc_ispunct = wc_ispunct_libc_sb,
- .wc_isspace = wc_isspace_libc_sb,
- .wc_isxdigit = wc_isxdigit_libc_sb,
- .char_is_cased = char_is_cased_libc,
- .char_tolower = char_tolower_libc,
- .wc_toupper = toupper_libc_sb,
- .wc_tolower = tolower_libc_sb,
- .max_chr = UCHAR_MAX,
-};
-
-/*
- * Non-UTF8 multibyte encodings use multibyte semantics for case mapping, but
- * single-byte semantics for pattern matching.
- */
-static const struct ctype_methods ctype_methods_libc_other_mb = {
- .strlower = strlower_libc_mb,
- .strtitle = strtitle_libc_mb,
- .strupper = strupper_libc_mb,
- .wc_isdigit = wc_isdigit_libc_sb,
- .wc_isalpha = wc_isalpha_libc_sb,
- .wc_isalnum = wc_isalnum_libc_sb,
- .wc_isupper = wc_isupper_libc_sb,
- .wc_islower = wc_islower_libc_sb,
- .wc_isgraph = wc_isgraph_libc_sb,
- .wc_isprint = wc_isprint_libc_sb,
- .wc_ispunct = wc_ispunct_libc_sb,
- .wc_isspace = wc_isspace_libc_sb,
- .wc_isxdigit = wc_isxdigit_libc_sb,
- .char_is_cased = char_is_cased_libc,
- .char_tolower = char_tolower_libc,
- .wc_toupper = toupper_libc_sb,
- .wc_tolower = tolower_libc_sb,
- .max_chr = UCHAR_MAX,
-};
+static const struct ctype_methods ctype_methods_libc[] = {
+ [PG_WCHAR_CHAR] = {
+ .strlower = strlower_libc_sb,
+ .strtitle = strtitle_libc_sb,
+ .strupper = strupper_libc_sb,
+ .wc_isdigit = wc_isdigit_libc_sb,
+ .wc_isalpha = wc_isalpha_libc_sb,
+ .wc_isalnum = wc_isalnum_libc_sb,
+ .wc_isupper = wc_isupper_libc_sb,
+ .wc_islower = wc_islower_libc_sb,
+ .wc_isgraph = wc_isgraph_libc_sb,
+ .wc_isprint = wc_isprint_libc_sb,
+ .wc_ispunct = wc_ispunct_libc_sb,
+ .wc_isspace = wc_isspace_libc_sb,
+ .wc_isxdigit = wc_isxdigit_libc_sb,
+ .char_is_cased = char_is_cased_libc,
+ .char_tolower = char_tolower_libc,
+ .wc_toupper = toupper_libc_sb,
+ .wc_tolower = tolower_libc_sb,
+ .max_chr = UCHAR_MAX,
+ },
+ [PG_WCHAR_UTF32] = {
+ .strlower = strlower_libc_mb,
+ .strtitle = strtitle_libc_mb,
+ .strupper = strupper_libc_mb,
+ .wc_isdigit = wc_isdigit_libc_mb,
+ .wc_isalpha = wc_isalpha_libc_mb,
+ .wc_isalnum = wc_isalnum_libc_mb,
+ .wc_isupper = wc_isupper_libc_mb,
+ .wc_islower = wc_islower_libc_mb,
+ .wc_isgraph = wc_isgraph_libc_mb,
+ .wc_isprint = wc_isprint_libc_mb,
+ .wc_ispunct = wc_ispunct_libc_mb,
+ .wc_isspace = wc_isspace_libc_mb,
+ .wc_isxdigit = wc_isxdigit_libc_mb,
+ .char_is_cased = char_is_cased_libc,
+ .char_tolower = char_tolower_libc,
+ .wc_toupper = toupper_libc_mb,
+ .wc_tolower = tolower_libc_mb,
+ },
-static const struct ctype_methods ctype_methods_libc_utf8 = {
- .strlower = strlower_libc_mb,
- .strtitle = strtitle_libc_mb,
- .strupper = strupper_libc_mb,
- .wc_isdigit = wc_isdigit_libc_mb,
- .wc_isalpha = wc_isalpha_libc_mb,
- .wc_isalnum = wc_isalnum_libc_mb,
- .wc_isupper = wc_isupper_libc_mb,
- .wc_islower = wc_islower_libc_mb,
- .wc_isgraph = wc_isgraph_libc_mb,
- .wc_isprint = wc_isprint_libc_mb,
- .wc_ispunct = wc_ispunct_libc_mb,
- .wc_isspace = wc_isspace_libc_mb,
- .wc_isxdigit = wc_isxdigit_libc_mb,
- .char_is_cased = char_is_cased_libc,
- .char_tolower = char_tolower_libc,
- .wc_toupper = toupper_libc_mb,
- .wc_tolower = tolower_libc_mb,
+ /*
+ * Custom pg_wchar format converted from non-UTF8 multibyte encodings use
+ * multibyte semantics for case mapping, but single-byte semantics for
+ * pattern matching.
+ *
+ * XXX Therefore this gives incorrect results for pattern matching outside
+ * the ASCII range. Could be fixed.
+ */
+ [PG_WCHAR_CUSTOM] = {
+ .strlower = strlower_libc_mb,
+ .strtitle = strtitle_libc_mb,
+ .strupper = strupper_libc_mb,
+ .wc_isdigit = wc_isdigit_libc_sb,
+ .wc_isalpha = wc_isalpha_libc_sb,
+ .wc_isalnum = wc_isalnum_libc_sb,
+ .wc_isupper = wc_isupper_libc_sb,
+ .wc_islower = wc_islower_libc_sb,
+ .wc_isgraph = wc_isgraph_libc_sb,
+ .wc_isprint = wc_isprint_libc_sb,
+ .wc_ispunct = wc_ispunct_libc_sb,
+ .wc_isspace = wc_isspace_libc_sb,
+ .wc_isxdigit = wc_isxdigit_libc_sb,
+ .char_is_cased = char_is_cased_libc,
+ .char_tolower = char_tolower_libc,
+ .wc_toupper = toupper_libc_sb,
+ .wc_tolower = tolower_libc_sb,
+ .max_chr = UCHAR_MAX,
+ },
};
static const struct collate_methods collate_methods_libc = {
@@ -763,14 +773,7 @@ create_pg_locale_libc(Oid collid, MemoryContext context)
result->collate = &collate_methods_libc;
}
if (!result->ctype_is_c)
- {
- if (GetDatabaseEncoding() == PG_UTF8)
- result->ctype = &ctype_methods_libc_utf8;
- else if (pg_database_encoding_max_length() > 1)
- result->ctype = &ctype_methods_libc_other_mb;
- else
- result->ctype = &ctype_methods_libc_sb;
- }
+ result->ctype = &ctype_methods_libc[pg_wchar_encoding_scheme(GetDatabaseEncoding())];
return result;
}
diff --git a/src/common/wchar.c b/src/common/wchar.c
index a4bc29921de..f453587749a 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -2062,50 +2062,60 @@ pg_encoding_set_invalid(int encoding, char *dst)
*-------------------------------------------------------------------
*/
const pg_wchar_tbl pg_wchar_table[] = {
- [PG_SQL_ASCII] = {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
- [PG_EUC_JP] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
- [PG_EUC_CN] = {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2},
- [PG_EUC_KR] = {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
- [PG_EUC_TW] = {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
- [PG_EUC_JIS_2004] = {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
- [PG_UTF8] = {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
- [PG_MULE_INTERNAL] = {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},
- [PG_LATIN1] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_LATIN2] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_LATIN3] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_LATIN4] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_LATIN5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_LATIN6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_LATIN7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_LATIN8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_LATIN9] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_LATIN10] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_WIN1256] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_WIN1258] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_WIN866] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_WIN874] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_KOI8R] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_WIN1251] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_WIN1252] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_ISO_8859_5] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_ISO_8859_6] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_ISO_8859_7] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_ISO_8859_8] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_WIN1250] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_WIN1253] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_WIN1254] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_WIN1255] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_WIN1257] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_KOI8U] = {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
- [PG_SJIS] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
- [PG_BIG5] = {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},
- [PG_GBK] = {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
- [PG_UHC] = {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
- [PG_GB18030] = {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
- [PG_JOHAB] = {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
- [PG_SHIFT_JIS_2004] = {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
+ [PG_SQL_ASCII] = {PG_WCHAR_CHAR, pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifychar, pg_ascii_verifystr, 1},
+ [PG_EUC_JP] = {PG_WCHAR_CUSTOM, pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
+ [PG_EUC_CN] = {PG_WCHAR_CUSTOM, pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifychar, pg_euccn_verifystr, 2},
+ [PG_EUC_KR] = {PG_WCHAR_CUSTOM, pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifychar, pg_euckr_verifystr, 3},
+ [PG_EUC_TW] = {PG_WCHAR_CUSTOM, pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifychar, pg_euctw_verifystr, 4},
+ [PG_EUC_JIS_2004] = {PG_WCHAR_CUSTOM, pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifychar, pg_eucjp_verifystr, 3},
+ [PG_UTF8] = {PG_WCHAR_UTF32, pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifychar, pg_utf8_verifystr, 4},
+ [PG_MULE_INTERNAL] = {PG_WCHAR_CUSTOM, pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifychar, pg_mule_verifystr, 4},
+ [PG_LATIN1] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_LATIN2] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_LATIN3] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_LATIN4] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_LATIN5] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_LATIN6] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_LATIN7] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_LATIN8] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_LATIN9] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_LATIN10] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN1256] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN1258] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN866] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN874] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_KOI8R] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN1251] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN1252] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_ISO_8859_5] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_ISO_8859_6] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_ISO_8859_7] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_ISO_8859_8] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN1250] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN1253] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN1254] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN1255] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_WIN1257] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_KOI8U] = {PG_WCHAR_CHAR, pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifychar, pg_latin1_verifystr, 1},
+ [PG_SJIS] = {PG_WCHAR_NONE, 0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
+ [PG_BIG5] = {PG_WCHAR_NONE, 0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifychar, pg_big5_verifystr, 2},
+ [PG_GBK] = {PG_WCHAR_NONE, 0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifychar, pg_gbk_verifystr, 2},
+ [PG_UHC] = {PG_WCHAR_NONE, 0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifychar, pg_uhc_verifystr, 2},
+ [PG_GB18030] = {PG_WCHAR_NONE, 0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifychar, pg_gb18030_verifystr, 4},
+ [PG_JOHAB] = {PG_WCHAR_NONE, 0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifychar, pg_johab_verifystr, 3},
+ [PG_SHIFT_JIS_2004] = {PG_WCHAR_NONE, 0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifychar, pg_sjis_verifystr, 2},
};
+/*
+ * Returns the encoding scheme used for pg_wchar values in the given
+ * encoding.
+ */
+PgWcharEncodingScheme
+pg_wchar_encoding_scheme(int encoding)
+{
+ return pg_wchar_table[encoding].encoding_scheme;
+}
+
/*
* Returns the byte length of a multibyte character.
*
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 4b4a9974b75..5db00cebcef 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -27,6 +27,55 @@
*/
typedef unsigned int pg_wchar;
+/*
+ * Encoding schemes that pg_wchar might hold.
+ *
+ * Each multi-byte encoding has a corresponding wide encoding scheme,
+ * conceptually like wchar_t in C. Conversions to and from char should be
+ * performed by pg_mb2wchar*() and pg_wchar2mb*() functions. In all encoding
+ * schemes, values 0-127 represent ASCII. For higher values, see below.
+ *
+ * Locale providers make use of the known properties of these encoding schemes
+ * to implement ctype/wctype functionality.
+ */
+typedef enum PgWcharEncodingScheme
+{
+ /*
+ * 8-bit characters in the database encoding, zero-extended to pg_wchar
+ * width.
+ */
+ PG_WCHAR_CHAR,
+
+ /*
+ * 32-bit Unicode code points. PostgreSQL assumes that all libc
+ * implementations use UTF-32 or at least UTF-16 if wchar_t is narrow for
+ * locales that use UTF-8 encoding for char strings, so it has a special
+ * case for this.
+ */
+ PG_WCHAR_UTF32,
+
+ /*
+ * For multi-byte database encodings other than UTF-8, the encoding is
+ * unspecified outside the ASCII range.
+ */
+ PG_WCHAR_CUSTOM,
+
+ /*
+ * This scheme is not currently used by any of the supported encodings,
+ * but is included here for completeness, providing terminology. In a few
+ * places, pg_wchar is used to transport wchar_t in whatever unknown
+ * encoding libc uses for the database encoding. This is second from last
+ * so that lookup arrays don't have to waste an entry.
+ */
+ PG_WCHAR_SYSTEM_WCHAR_T,
+
+ /*
+ * pg_wchar conversion is not available for the database encoding. This
+ * is last so that lookup arrays don't have to waste an entry.
+ */
+ PG_WCHAR_NONE,
+} PgWcharEncodingScheme;
+
/*
* Maximum byte length of multibyte characters in any backend encoding
*/
@@ -391,6 +440,7 @@ typedef int (*mbstr_verifier) (const unsigned char *mbstr, int len);
typedef struct
{
+ PgWcharEncodingScheme encoding_scheme; /* pg_wchar representation */
mb2wchar_with_len_converter mb2wchar_with_len; /* convert a multibyte
* string to a wchar */
wchar2mb_with_len_converter wchar2mb_with_len; /* convert a wchar string
@@ -713,6 +763,7 @@ extern int SetClientEncoding(int encoding);
extern void InitializeClientEncoding(void);
extern int pg_get_client_encoding(void);
extern const char *pg_get_client_encoding_name(void);
+extern PgWcharEncodingScheme pg_wchar_encoding_scheme(int encoding);
extern void SetDatabaseEncoding(int encoding);
extern int GetDatabaseEncoding(void);
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index ac2da4c98cf..d6973751f12 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2264,6 +2264,7 @@ PgStat_WalCounters
PgStat_WalStats
PgXmlErrorContext
PgXmlStrictness
+PgWcharEncodingScheme
Pg_abi_values
Pg_finfo_record
Pg_magic_struct
--
2.50.1 (Apple Git-155)
0003-Fix-corrupted-ctype.h-handling-for-non-ASCII-in-EUC-en.txttext/plain; charset=US-ASCII; name=0003-Fix-corrupted-ctype.h-handling-for-non-ASCII-in-EUC-en.txtDownload
From 304dc61ed765f1a57a9b9f9cf32a6342f0b15e6a Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Wed, 29 Oct 2025 19:25:40 +1300
Subject: [PATCH 3/8] Fix corrupted ctype.h handling for non-ASCII in EUC
encodings.
Previously we treated PG_WCHAR_CUSTOM encodings the same way as
PG_WCHAR_CHAR, by passing the lower 8 bits of pg_wchar to ctype.h
functions. That was OK for 7-bit ASCII, but arbitrary junk for any
higher values. New functions are provided that just return false for
non-ASCII values.
A more correct fix would convert to libc wchar_t format and use
wctype.h, but that isn't done here.
---
src/backend/utils/adt/pg_locale_libc.c | 103 ++++++++++++++++---------
1 file changed, 68 insertions(+), 35 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 1892ed3c5ce..1d6e8be3a82 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -33,6 +33,11 @@
#include <shlwapi.h>
#endif
+#if defined(WIN32)
+#define isxdigit_l _isxdigit_l
+#define iswxdigit_l _iswxdigit_l
+#endif
+
/*
* For the libc provider, to provide as much functionality as possible on a
* variety of platforms without going so far as to implement everything from
@@ -50,20 +55,20 @@
* as the wchar_t representation of Unicode. On some platforms
* wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
*
- * 3. PG_WCHAR_CHAR and PG_WCHAR_CUSTOM encoding schemes:
+ * 3. PG_WCHAR_CUSTOM encoding scheme:
+ *
+ * When working with the EUC_* family of encodings (and technically MULE
+ * internal too, but no libc systems are known to support that encoding), we
+ * convert to wchar_t on the fly and use the <wctype.h> functions, except in
+ * the ASCII range where we use the <ctype.h> functions.
+ *
+ * 4. PG_WCHAR_CHAR encoding scheme:
*
* In all other encodings, we use the <ctype.h> functions for pg_wchar
- * values up to 255, and punt for values above that. This is 100% correct
- * only in single-byte encodings such as LATINn (PG_WCHAR_CHAR). However,
- * non-Unicode multibyte encodings (PG_WCHAR_CUSTOM) are all Far Eastern
- * character sets for which the properties being tested here aren't very
- * relevant for higher code values anyway. The difficulty with using the
- * <wctype.h> functions with non-Unicode multibyte encodings is that we can
- * have no certainty that the platform's wchar_t representation matches what we
- * do in pg_wchar conversions. (MULE is also declared PG_WCHAR_CUSTOM but is
- * not available as a multi-byte encoding in any known libc.)
+ * values up to 255. This is 100% correct since the values originated as char
+ * and were just widened to pg_wchar without change.
*
- * As a special case, in the "default" collation, (2) and (3) force ASCII
+ * As a special case, in the "default" collation, (2), (3) and (4) force ASCII
* letters to follow ASCII upcase/downcase rules, while in a non-default
* collation we just let the library functions do what they will. The case
* where this matters is treatment of I/i in Turkish, and the behavior is
@@ -125,6 +130,30 @@ static size_t strupper_libc_mb(char *dest, size_t destsize,
const char *src, ssize_t srclen,
pg_locale_t locale);
+/*
+ * Generate a function that passes single-byte characters directly to <ctype.h>
+ * functions, but only if they are in the ASCII range. This is suitable for
+ * PG_WCHAR_CUSTOM pg_wchar encoding (used with EUC_* encodings). Values
+ * outside ASCII have an unknown encoding, so we just return false.
+ */
+#define DEFINE_WC_CTYPE_LIBC_ASCII(ctype) \
+static bool \
+wc_is##ctype##_libc_ascii(pg_wchar wc, pg_locale_t locale) \
+{ \
+ return is##ctype##_l((unsigned char) wc, locale->lt); \
+}
+
+DEFINE_WC_CTYPE_LIBC_ASCII(digit);
+DEFINE_WC_CTYPE_LIBC_ASCII(alpha);
+DEFINE_WC_CTYPE_LIBC_ASCII(alnum);
+DEFINE_WC_CTYPE_LIBC_ASCII(upper);
+DEFINE_WC_CTYPE_LIBC_ASCII(lower);
+DEFINE_WC_CTYPE_LIBC_ASCII(graph);
+DEFINE_WC_CTYPE_LIBC_ASCII(print);
+DEFINE_WC_CTYPE_LIBC_ASCII(punct);
+DEFINE_WC_CTYPE_LIBC_ASCII(space);
+DEFINE_WC_CTYPE_LIBC_ASCII(xdigit);
+
static bool
wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
{
@@ -182,11 +211,7 @@ wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale)
static bool
wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
{
-#ifndef WIN32
return isxdigit_l((unsigned char) wc, locale->lt);
-#else
- return _isxdigit_l((unsigned char) wc, locale->lt);
-#endif
}
static bool
@@ -255,11 +280,7 @@ wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
static bool
wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
{
-#ifndef WIN32
return iswxdigit_l((wint_t) wc, locale->lt);
-#else
- return _iswxdigit_l((wint_t) wc, locale->lt);
-#endif
}
static char
@@ -280,6 +301,12 @@ char_is_cased_libc(char ch, pg_locale_t locale)
return isalpha_l((unsigned char) ch, locale->lt);
}
+static pg_wchar
+toupper_libc_ascii(pg_wchar wc, pg_locale_t locale)
+{
+ return wc < 128 ? toupper_l((unsigned char) wc, locale->lt) : wc;
+}
+
static pg_wchar
toupper_libc_sb(pg_wchar wc, pg_locale_t locale)
{
@@ -308,6 +335,12 @@ toupper_libc_mb(pg_wchar wc, pg_locale_t locale)
return wc;
}
+static pg_wchar
+tolower_libc_ascii(pg_wchar wc, pg_locale_t locale)
+{
+ return wc < 128 ? tolower_l((unsigned char) wc, locale->lt) : wc;
+}
+
static pg_wchar
tolower_libc_sb(pg_wchar wc, pg_locale_t locale)
{
@@ -379,30 +412,30 @@ static const struct ctype_methods ctype_methods_libc[] = {
/*
* Custom pg_wchar format converted from non-UTF8 multibyte encodings use
- * multibyte semantics for case mapping, but single-byte semantics for
- * pattern matching.
+ * multibyte semantics for case mapping, but ASCII-only semantics for
+ * pattern matching, since libc doesn't understand custom encoding of
+ * higher values.
*
- * XXX Therefore this gives incorrect results for pattern matching outside
- * the ASCII range. Could be fixed.
+ * XXX We could convert to wchar_t to fix that, at considerable cost.
*/
[PG_WCHAR_CUSTOM] = {
.strlower = strlower_libc_mb,
.strtitle = strtitle_libc_mb,
.strupper = strupper_libc_mb,
- .wc_isdigit = wc_isdigit_libc_sb,
- .wc_isalpha = wc_isalpha_libc_sb,
- .wc_isalnum = wc_isalnum_libc_sb,
- .wc_isupper = wc_isupper_libc_sb,
- .wc_islower = wc_islower_libc_sb,
- .wc_isgraph = wc_isgraph_libc_sb,
- .wc_isprint = wc_isprint_libc_sb,
- .wc_ispunct = wc_ispunct_libc_sb,
- .wc_isspace = wc_isspace_libc_sb,
- .wc_isxdigit = wc_isxdigit_libc_sb,
+ .wc_isdigit = wc_isdigit_libc_ascii,
+ .wc_isalpha = wc_isalpha_libc_ascii,
+ .wc_isalnum = wc_isalnum_libc_ascii,
+ .wc_isupper = wc_isupper_libc_ascii,
+ .wc_islower = wc_islower_libc_ascii,
+ .wc_isgraph = wc_isgraph_libc_ascii,
+ .wc_isprint = wc_isprint_libc_ascii,
+ .wc_ispunct = wc_ispunct_libc_ascii,
+ .wc_isspace = wc_isspace_libc_ascii,
+ .wc_isxdigit = wc_isxdigit_libc_ascii,
.char_is_cased = char_is_cased_libc,
.char_tolower = char_tolower_libc,
- .wc_toupper = toupper_libc_sb,
- .wc_tolower = tolower_libc_sb,
+ .wc_toupper = toupper_libc_ascii,
+ .wc_tolower = tolower_libc_ascii,
.max_chr = UCHAR_MAX,
},
};
--
2.50.1 (Apple Git-155)
0004-Support-wctype.h-classification-for-EUC-encodings.txttext/plain; charset=US-ASCII; name=0004-Support-wctype.h-classification-for-EUC-encodings.txtDownload
From 39ebd5e689a458508b2762b84beb197f7dc6fd92 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Wed, 29 Oct 2025 17:37:03 +1300
Subject: [PATCH 4/8] Support wctype.h classification for EUC encodings.
Instead of giving up on non-ASCII characters, convert pg_wchar values
using the PG_WCHAR_CUSTOM encoding scheme to wchar_t so that we can use
wctype.h functions on any character.
XXX This replaces the _ascii() version from the previous patch, to
experiment with a different approach.
XXX Is this too expensive?
---
src/backend/utils/adt/pg_locale_libc.c | 148 ++++++++++++++++++-------
1 file changed, 109 insertions(+), 39 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index 1d6e8be3a82..e6724880f1b 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -130,29 +130,81 @@ static size_t strupper_libc_mb(char *dest, size_t destsize,
const char *src, ssize_t srclen,
pg_locale_t locale);
+static wint_t
+pg_wchar_to_wchar_t(pg_wchar wc, pg_locale_t locale)
+{
+ wchar_t out[MAX_CONVERSION_GROWTH + 1];
+ char mb[MAX_CONVERSION_GROWTH + 1];
+ size_t mb_len;
+ size_t wchar_t_len;
+
+ /* pg_wchar -> multibyte using PostgreSQL pg_wchar encoding */
+ mb_len = pg_wchar2mb_with_len(&wc, mb, 1);
+
+ /* multibyte -> wchar_t using libc */
+ wchar_t_len = char2wchar(out, lengthof(out), mb, mb_len, locale->lt);
+
+ /* reject surrogates and combining characters */
+ if (wchar_t_len != 1)
+ return WEOF;
+
+ return out[0];
+}
+
+static pg_wchar
+wchar_t_to_pg_wchar(wchar_t wc, pg_locale_t locale)
+{
+ wchar_t in[] = {wc, 0};
+ pg_wchar out[MAX_CONVERSION_GROWTH + 1];
+ char mb[MAX_CONVERSION_GROWTH + 1];
+ size_t mb_len;
+ size_t pg_wchar_len;
+
+ /* wchar_t -> multibyte using libc */
+ mb_len = wchar2char(mb, in, lengthof(mb), locale->lt);
+
+ /* multibyte -> pg_wchar using PostgreSQL pg_wchar encoding */
+ pg_wchar_len = pg_mb2wchar_with_len(mb, out, mb_len);
+
+ /* should be exactly one character */
+ if (pg_wchar_len != 1)
+ elog(ERROR, "unexpected number of output characters: %zu", pg_wchar_len);
+
+ return out[0];
+}
+
/*
- * Generate a function that passes single-byte characters directly to <ctype.h>
- * functions, but only if they are in the ASCII range. This is suitable for
- * PG_WCHAR_CUSTOM pg_wchar encoding (used with EUC_* encodings). Values
- * outside ASCII have an unknown encoding, so we just return false.
+ * Generate a function that handles the ASCII range with <ctype.h>, and
+ * otherwise converts pg_wchar to libc's wchar_t to be able to use <wctype.h>
+ * routines. This extra conversion is only required when using encodings that
+ * declare a PG_WCHAR_CUSTOM encoding scheme (EUC encodings for CJK).
+ *
+ * XXX If wchar.c had a function that could identify pg_wchar values that
+ * definitely won't return true (eg the big kanji/hanzi ranges), then we could
+ * skip the expensive conversion but still give correct answers for other
+ * characters.
*/
-#define DEFINE_WC_CTYPE_LIBC_ASCII(ctype) \
+#define DEFINE_WC_CTYPE_LIBC_CUSTOM(ctype) \
static bool \
-wc_is##ctype##_libc_ascii(pg_wchar wc, pg_locale_t locale) \
+wc_is##ctype##_libc_custom(pg_wchar wc, pg_locale_t locale) \
{ \
- return is##ctype##_l((unsigned char) wc, locale->lt); \
+ wint_t wint; \
+ if (wc < 128) \
+ return is##ctype##_l(wc, locale->lt); \
+ wint = pg_wchar_to_wchar_t(wc, locale); \
+ return wint != WEOF && isw##ctype##_l(wint, locale->lt); \
}
-DEFINE_WC_CTYPE_LIBC_ASCII(digit);
-DEFINE_WC_CTYPE_LIBC_ASCII(alpha);
-DEFINE_WC_CTYPE_LIBC_ASCII(alnum);
-DEFINE_WC_CTYPE_LIBC_ASCII(upper);
-DEFINE_WC_CTYPE_LIBC_ASCII(lower);
-DEFINE_WC_CTYPE_LIBC_ASCII(graph);
-DEFINE_WC_CTYPE_LIBC_ASCII(print);
-DEFINE_WC_CTYPE_LIBC_ASCII(punct);
-DEFINE_WC_CTYPE_LIBC_ASCII(space);
-DEFINE_WC_CTYPE_LIBC_ASCII(xdigit);
+DEFINE_WC_CTYPE_LIBC_CUSTOM(digit);
+DEFINE_WC_CTYPE_LIBC_CUSTOM(alpha);
+DEFINE_WC_CTYPE_LIBC_CUSTOM(alnum);
+DEFINE_WC_CTYPE_LIBC_CUSTOM(upper);
+DEFINE_WC_CTYPE_LIBC_CUSTOM(lower);
+DEFINE_WC_CTYPE_LIBC_CUSTOM(graph);
+DEFINE_WC_CTYPE_LIBC_CUSTOM(print);
+DEFINE_WC_CTYPE_LIBC_CUSTOM(punct);
+DEFINE_WC_CTYPE_LIBC_CUSTOM(space);
+DEFINE_WC_CTYPE_LIBC_CUSTOM(xdigit);
static bool
wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
@@ -302,9 +354,19 @@ char_is_cased_libc(char ch, pg_locale_t locale)
}
static pg_wchar
-toupper_libc_ascii(pg_wchar wc, pg_locale_t locale)
+toupper_libc_custom(pg_wchar wc, pg_locale_t locale)
{
- return wc < 128 ? toupper_l((unsigned char) wc, locale->lt) : wc;
+ wint_t wint;
+
+ /* force C behavior for ASCII characters, per comments above */
+ if (locale->is_default && wc <= (pg_wchar) 127)
+ return pg_ascii_toupper((unsigned char) wc);
+ if (wc <= 127)
+ return towupper_l((wint_t) wc, locale->lt);
+ wint = pg_wchar_to_wchar_t(wc, locale);
+ if ((wint = pg_wchar_to_wchar_t(wc, locale)) != WEOF)
+ return wchar_t_to_pg_wchar(towupper_l(wint, locale->lt), locale);
+ return wc;
}
static pg_wchar
@@ -336,9 +398,19 @@ toupper_libc_mb(pg_wchar wc, pg_locale_t locale)
}
static pg_wchar
-tolower_libc_ascii(pg_wchar wc, pg_locale_t locale)
+tolower_libc_custom(pg_wchar wc, pg_locale_t locale)
{
- return wc < 128 ? tolower_l((unsigned char) wc, locale->lt) : wc;
+ wint_t wint;
+
+ /* force C behavior for ASCII characters, per comments above */
+ if (locale->is_default && wc <= (pg_wchar) 127)
+ return pg_ascii_tolower((unsigned char) wc);
+ if (wc <= 127)
+ return towlower_l((wint_t) wc, locale->lt);
+ wint = pg_wchar_to_wchar_t(wc, locale);
+ if ((wint = pg_wchar_to_wchar_t(wc, locale)) != WEOF)
+ return wchar_t_to_pg_wchar(towlower_l(wint, locale->lt), locale);
+ return wc;
}
static pg_wchar
@@ -412,31 +484,29 @@ static const struct ctype_methods ctype_methods_libc[] = {
/*
* Custom pg_wchar format converted from non-UTF8 multibyte encodings use
- * multibyte semantics for case mapping, but ASCII-only semantics for
- * pattern matching, since libc doesn't understand custom encoding of
- * higher values.
- *
- * XXX We could convert to wchar_t to fix that, at considerable cost.
+ * multibyte semantics for case mapping, and conversions to libc's wchar_t
+ * except for the ASCII range that can be handled directly by ctype
+ * functions.
*/
[PG_WCHAR_CUSTOM] = {
.strlower = strlower_libc_mb,
.strtitle = strtitle_libc_mb,
.strupper = strupper_libc_mb,
- .wc_isdigit = wc_isdigit_libc_ascii,
- .wc_isalpha = wc_isalpha_libc_ascii,
- .wc_isalnum = wc_isalnum_libc_ascii,
- .wc_isupper = wc_isupper_libc_ascii,
- .wc_islower = wc_islower_libc_ascii,
- .wc_isgraph = wc_isgraph_libc_ascii,
- .wc_isprint = wc_isprint_libc_ascii,
- .wc_ispunct = wc_ispunct_libc_ascii,
- .wc_isspace = wc_isspace_libc_ascii,
- .wc_isxdigit = wc_isxdigit_libc_ascii,
+ .wc_isdigit = wc_isdigit_libc_custom,
+ .wc_isalpha = wc_isalpha_libc_custom,
+ .wc_isalnum = wc_isalnum_libc_custom,
+ .wc_isupper = wc_isupper_libc_custom,
+ .wc_islower = wc_islower_libc_custom,
+ .wc_isgraph = wc_isgraph_libc_custom,
+ .wc_isprint = wc_isprint_libc_custom,
+ .wc_ispunct = wc_ispunct_libc_custom,
+ .wc_isspace = wc_isspace_libc_custom,
+ .wc_isxdigit = wc_isxdigit_libc_custom,
.char_is_cased = char_is_cased_libc,
.char_tolower = char_tolower_libc,
- .wc_toupper = toupper_libc_ascii,
- .wc_tolower = tolower_libc_ascii,
- .max_chr = UCHAR_MAX,
+ .wc_toupper = toupper_libc_custom,
+ .wc_tolower = tolower_libc_custom,
+ .max_chr = 127, /* values outside ASCII may be illegal to probe */
},
};
--
2.50.1 (Apple Git-155)
0005-XXX-work-around-regc_pg_locale.c-s-probing-logic.txttext/plain; charset=US-ASCII; name=0005-XXX-work-around-regc_pg_locale.c-s-probing-logic.txtDownload
From 50dc5a57bcd31be893071da62c4ba6b0537695cc Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Thu, 30 Oct 2025 00:52:51 +1300
Subject: [PATCH 5/8] XXX work around regc_pg_locale.c's probing logic
---
src/backend/regex/regc_pg_locale.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c
index e0c892db713..fdc3fac0bbe 100644
--- a/src/backend/regex/regc_pg_locale.c
+++ b/src/backend/regex/regc_pg_locale.c
@@ -352,7 +352,15 @@ regc_ctype_get_cache(regc_wc_probefunc probefunc, int cclasscode)
pg_regex_locale->ctype->max_chr <= MAX_SIMPLE_CHR)
{
max_chr = pg_regex_locale->ctype->max_chr;
- pcc->cv.cclasscode = -1;
+
+ /*
+ * XXX TODO: don't turn off locales just because pg_locale_libc.c
+ * told us it's not cool to probe arbitrary pg_wchar values over
+ * 127! Without this, re-encoding fails at pg_wchar 0x80, which
+ * can't be converted back to mb (the EUC pg_wchar encoding has
+ * holes in it)
+ */
+ //pcc->cv.cclasscode = -1;
}
else
max_chr = (pg_wchar) MAX_SIMPLE_CHR;
--
2.50.1 (Apple Git-155)
0006-Improve-naming-of-libc-collation-functions.txttext/plain; charset=US-ASCII; name=0006-Improve-naming-of-libc-collation-functions.txtDownload
From 553e2d5a2923d582a0f1a3ef72033b40149c08c7 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Wed, 29 Oct 2025 16:01:32 +1300
Subject: [PATCH 6/8] Improve naming of libc collation functions.
The functions that expect pg_wchar to hold a UTF-32-encoded code point
because the encoding scheme is PG_WCHAR_UTF32 had names ending _mb, but
_utf32 makes more sense. The remaining _mb functions really do work
with multibyte input.
It might be tempting to rename the _sb functions to _char to match
PG_WCHAR_CHAR, but since the _sb and _mb functions both work with chars (one
or variable), that would probably just be more confusing.
---
src/backend/utils/adt/pg_locale_libc.c | 48 +++++++++++++-------------
1 file changed, 24 insertions(+), 24 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index e6724880f1b..b33897c683e 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -267,70 +267,70 @@ wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
}
static bool
-wc_isdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
+wc_isdigit_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
iswdigit_l((wint_t) wc, locale->lt);
}
static bool
-wc_isalpha_libc_mb(pg_wchar wc, pg_locale_t locale)
+wc_isalpha_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
iswalpha_l((wint_t) wc, locale->lt);
}
static bool
-wc_isalnum_libc_mb(pg_wchar wc, pg_locale_t locale)
+wc_isalnum_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
iswalnum_l((wint_t) wc, locale->lt);
}
static bool
-wc_isupper_libc_mb(pg_wchar wc, pg_locale_t locale)
+wc_isupper_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
iswupper_l((wint_t) wc, locale->lt);
}
static bool
-wc_islower_libc_mb(pg_wchar wc, pg_locale_t locale)
+wc_islower_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
iswlower_l((wint_t) wc, locale->lt);
}
static bool
-wc_isgraph_libc_mb(pg_wchar wc, pg_locale_t locale)
+wc_isgraph_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
iswgraph_l((wint_t) wc, locale->lt);
}
static bool
-wc_isprint_libc_mb(pg_wchar wc, pg_locale_t locale)
+wc_isprint_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
iswprint_l((wint_t) wc, locale->lt);
}
static bool
-wc_ispunct_libc_mb(pg_wchar wc, pg_locale_t locale)
+wc_ispunct_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
iswpunct_l((wint_t) wc, locale->lt);
}
static bool
-wc_isspace_libc_mb(pg_wchar wc, pg_locale_t locale)
+wc_isspace_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
iswspace_l((wint_t) wc, locale->lt);
}
static bool
-wc_isxdigit_libc_mb(pg_wchar wc, pg_locale_t locale)
+wc_isxdigit_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
return iswxdigit_l((wint_t) wc, locale->lt);
}
@@ -384,7 +384,7 @@ toupper_libc_sb(pg_wchar wc, pg_locale_t locale)
}
static pg_wchar
-toupper_libc_mb(pg_wchar wc, pg_locale_t locale)
+toupper_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
Assert(GetDatabaseEncoding() == PG_UTF8);
@@ -428,7 +428,7 @@ tolower_libc_sb(pg_wchar wc, pg_locale_t locale)
}
static pg_wchar
-tolower_libc_mb(pg_wchar wc, pg_locale_t locale)
+tolower_libc_utf32(pg_wchar wc, pg_locale_t locale)
{
Assert(GetDatabaseEncoding() == PG_UTF8);
@@ -466,20 +466,20 @@ static const struct ctype_methods ctype_methods_libc[] = {
.strlower = strlower_libc_mb,
.strtitle = strtitle_libc_mb,
.strupper = strupper_libc_mb,
- .wc_isdigit = wc_isdigit_libc_mb,
- .wc_isalpha = wc_isalpha_libc_mb,
- .wc_isalnum = wc_isalnum_libc_mb,
- .wc_isupper = wc_isupper_libc_mb,
- .wc_islower = wc_islower_libc_mb,
- .wc_isgraph = wc_isgraph_libc_mb,
- .wc_isprint = wc_isprint_libc_mb,
- .wc_ispunct = wc_ispunct_libc_mb,
- .wc_isspace = wc_isspace_libc_mb,
- .wc_isxdigit = wc_isxdigit_libc_mb,
+ .wc_isdigit = wc_isdigit_libc_utf32,
+ .wc_isalpha = wc_isalpha_libc_utf32,
+ .wc_isalnum = wc_isalnum_libc_utf32,
+ .wc_isupper = wc_isupper_libc_utf32,
+ .wc_islower = wc_islower_libc_utf32,
+ .wc_isgraph = wc_isgraph_libc_utf32,
+ .wc_isprint = wc_isprint_libc_utf32,
+ .wc_ispunct = wc_ispunct_libc_utf32,
+ .wc_isspace = wc_isspace_libc_utf32,
+ .wc_isxdigit = wc_isxdigit_libc_utf32,
.char_is_cased = char_is_cased_libc,
.char_tolower = char_tolower_libc,
- .wc_toupper = toupper_libc_mb,
- .wc_tolower = tolower_libc_mb,
+ .wc_toupper = toupper_libc_utf32,
+ .wc_tolower = tolower_libc_utf32,
},
/*
--
2.50.1 (Apple Git-155)
0007-Use-compact-notation-for-isXXX_l-wrappers.txttext/plain; charset=US-ASCII; name=0007-Use-compact-notation-for-isXXX_l-wrappers.txtDownload
From b2ddd005740f73c6d4a9e2441616926fcea94684 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Wed, 29 Oct 2025 17:54:06 +1300
Subject: [PATCH 7/8] Use compact notation for isXXX_l() wrappers.
Instead of loads of repeating functions for the PG_WCHAR_CHAR and
PG_WCHAR_UTF32 handlers, make a macro to avoid typos, as was already
done for the new PG_WCHAR_CUSTOM handlers.
---
src/backend/utils/adt/pg_locale_libc.c | 166 +++++++------------------
1 file changed, 43 insertions(+), 123 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index b33897c683e..fc758e2607c 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -206,134 +206,54 @@ DEFINE_WC_CTYPE_LIBC_CUSTOM(punct);
DEFINE_WC_CTYPE_LIBC_CUSTOM(space);
DEFINE_WC_CTYPE_LIBC_CUSTOM(xdigit);
-static bool
-wc_isdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- return isdigit_l((unsigned char) wc, locale->lt);
-}
-
-static bool
-wc_isalpha_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- return isalpha_l((unsigned char) wc, locale->lt);
-}
-
-static bool
-wc_isalnum_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- return isalnum_l((unsigned char) wc, locale->lt);
-}
-
-static bool
-wc_isupper_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- return isupper_l((unsigned char) wc, locale->lt);
-}
-
-static bool
-wc_islower_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- return islower_l((unsigned char) wc, locale->lt);
-}
-
-static bool
-wc_isgraph_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- return isgraph_l((unsigned char) wc, locale->lt);
-}
-
-static bool
-wc_isprint_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- return isprint_l((unsigned char) wc, locale->lt);
-}
-
-static bool
-wc_ispunct_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- return ispunct_l((unsigned char) wc, locale->lt);
-}
-
-static bool
-wc_isspace_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- return isspace_l((unsigned char) wc, locale->lt);
-}
-
-static bool
-wc_isxdigit_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- return isxdigit_l((unsigned char) wc, locale->lt);
-}
-
-static bool
-wc_isdigit_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
- iswdigit_l((wint_t) wc, locale->lt);
-}
-
-static bool
-wc_isalpha_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
- iswalpha_l((wint_t) wc, locale->lt);
-}
-
-static bool
-wc_isalnum_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
- iswalnum_l((wint_t) wc, locale->lt);
-}
-
-static bool
-wc_isupper_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
- iswupper_l((wint_t) wc, locale->lt);
-}
-
-static bool
-wc_islower_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
- iswlower_l((wint_t) wc, locale->lt);
-}
-
-static bool
-wc_isgraph_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
- iswgraph_l((wint_t) wc, locale->lt);
-}
-
-static bool
-wc_isprint_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
- iswprint_l((wint_t) wc, locale->lt);
+/*
+ * Generate a function that passes single-byte characters directly to <ctype.h>
+ * functions. This is suitable for PG_WCHAR_CHAR encodings, where pg_wchar
+ * holds a single byte.
+ */
+#define DEFINE_WC_CTYPE_LIBC_SB(ctype) \
+static bool \
+wc_is##ctype##_libc_sb(pg_wchar wc, pg_locale_t locale) \
+{ \
+ return is##ctype##_l((unsigned char) wc, locale->lt); \
}
-static bool
-wc_ispunct_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
- iswpunct_l((wint_t) wc, locale->lt);
-}
+DEFINE_WC_CTYPE_LIBC_SB(digit);
+DEFINE_WC_CTYPE_LIBC_SB(alpha);
+DEFINE_WC_CTYPE_LIBC_SB(alnum);
+DEFINE_WC_CTYPE_LIBC_SB(upper);
+DEFINE_WC_CTYPE_LIBC_SB(lower);
+DEFINE_WC_CTYPE_LIBC_SB(graph);
+DEFINE_WC_CTYPE_LIBC_SB(print);
+DEFINE_WC_CTYPE_LIBC_SB(punct);
+DEFINE_WC_CTYPE_LIBC_SB(space);
+DEFINE_WC_CTYPE_LIBC_SB(xdigit);
-static bool
-wc_isspace_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- return (sizeof(wchar_t) >= 4 || wc <= 0xffff) &&
- iswspace_l((wint_t) wc, locale->lt);
+/*
+ * Generate a function that passes UTF-32 characters directly to <wctype.h>
+ * functions. This is suitable for PG_WCHAR_UTF32 encodings, with the
+ * assumption that any libc locale that uses UTF-8 as its char encoding must
+ * use UTF-32 or UTF-16 for its wchar_t encoding. For the UTF-16 case, just
+ * return false for codepoints outside the BMP.
+ */
+#define DEFINE_WC_CTYPE_LIBC_UTF32(ctype) \
+static bool \
+wc_is##ctype##_libc_utf32(pg_wchar wc, pg_locale_t locale) \
+{ \
+ return (sizeof(wchar_t) >= 4 || wc <= 0xffff) && \
+ isw##ctype##_l((wint_t) wc, locale->lt); \
}
-static bool
-wc_isxdigit_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- return iswxdigit_l((wint_t) wc, locale->lt);
-}
+DEFINE_WC_CTYPE_LIBC_UTF32(digit);
+DEFINE_WC_CTYPE_LIBC_UTF32(alpha);
+DEFINE_WC_CTYPE_LIBC_UTF32(alnum);
+DEFINE_WC_CTYPE_LIBC_UTF32(upper);
+DEFINE_WC_CTYPE_LIBC_UTF32(lower);
+DEFINE_WC_CTYPE_LIBC_UTF32(graph);
+DEFINE_WC_CTYPE_LIBC_UTF32(print);
+DEFINE_WC_CTYPE_LIBC_UTF32(punct);
+DEFINE_WC_CTYPE_LIBC_UTF32(space);
+DEFINE_WC_CTYPE_LIBC_UTF32(xdigit);
static char
char_tolower_libc(unsigned char ch, pg_locale_t locale)
--
2.50.1 (Apple Git-155)
0008-Use-compact-notation-for-toupper-tolower-wrappers.txttext/plain; charset=US-ASCII; name=0008-Use-compact-notation-for-toupper-tolower-wrappers.txtDownload
From c70c67492e6c1a9fffd97037abe49251e408cff8 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Wed, 29 Oct 2025 23:23:42 +1300
Subject: [PATCH 8/8] Use compact notation for toupper/tolower wrappers.
Extend the macro technique used for generating isalpha etc also to
toupper/tolower functions, removing some duplication.
---
src/backend/utils/adt/pg_locale_libc.c | 118 ++++++++-----------------
1 file changed, 39 insertions(+), 79 deletions(-)
diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c
index fc758e2607c..7591fb812ac 100644
--- a/src/backend/utils/adt/pg_locale_libc.c
+++ b/src/backend/utils/adt/pg_locale_libc.c
@@ -273,93 +273,53 @@ char_is_cased_libc(char ch, pg_locale_t locale)
return isalpha_l((unsigned char) ch, locale->lt);
}
-static pg_wchar
-toupper_libc_custom(pg_wchar wc, pg_locale_t locale)
-{
- wint_t wint;
-
- /* force C behavior for ASCII characters, per comments above */
- if (locale->is_default && wc <= (pg_wchar) 127)
- return pg_ascii_toupper((unsigned char) wc);
- if (wc <= 127)
- return towupper_l((wint_t) wc, locale->lt);
- wint = pg_wchar_to_wchar_t(wc, locale);
- if ((wint = pg_wchar_to_wchar_t(wc, locale)) != WEOF)
- return wchar_t_to_pg_wchar(towupper_l(wint, locale->lt), locale);
- return wc;
-}
-
-static pg_wchar
-toupper_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- Assert(GetDatabaseEncoding() != PG_UTF8);
-
- /* force C behavior for ASCII characters, per comments above */
- if (locale->is_default && wc <= (pg_wchar) 127)
- return pg_ascii_toupper((unsigned char) wc);
- if (wc <= (pg_wchar) UCHAR_MAX)
- return toupper_l((unsigned char) wc, locale->lt);
- else
- return wc;
+#define DEFINE_WC_CASE_LIBC_CUSTOM(case) /* emits to<case>_libc_custom() */ \
+static pg_wchar \
+to##case##_libc_custom(pg_wchar wc, pg_locale_t locale) \
+{ \
+ wint_t wint; \
+ if (locale->is_default && wc <= (pg_wchar) 127) /* default locale: ASCII goes through pg_ascii_to<case>() */ \
+ return pg_ascii_to##case((unsigned char) wc); \
+ if (wc <= 127) /* wide-character call, as in the functions this macro replaces */ \
+ return tow##case##_l((wint_t) wc, locale->lt); \
+ wint = pg_wchar_to_wchar_t(wc, locale); \
+ if ((wint = pg_wchar_to_wchar_t(wc, locale)) != WEOF) \
+ return wchar_t_to_pg_wchar(tow##case##_l(wint, locale->lt), locale); /* must be tow*_l: wint can exceed UCHAR_MAX */ \
+ return wc; /* conversion yielded WEOF: input returned unchanged */ \
}
-static pg_wchar
-toupper_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- Assert(GetDatabaseEncoding() == PG_UTF8);
+DEFINE_WC_CASE_LIBC_CUSTOM(upper);
+DEFINE_WC_CASE_LIBC_CUSTOM(lower);
- /* force C behavior for ASCII characters, per comments above */
- if (locale->is_default && wc <= (pg_wchar) 127)
- return pg_ascii_toupper((unsigned char) wc);
- if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
- return towupper_l((wint_t) wc, locale->lt);
- else
- return wc;
-}
-
-static pg_wchar
-tolower_libc_custom(pg_wchar wc, pg_locale_t locale)
-{
- wint_t wint;
-
- /* force C behavior for ASCII characters, per comments above */
- if (locale->is_default && wc <= (pg_wchar) 127)
- return pg_ascii_tolower((unsigned char) wc);
- if (wc <= 127)
- return towlower_l((wint_t) wc, locale->lt);
- wint = pg_wchar_to_wchar_t(wc, locale);
- if ((wint = pg_wchar_to_wchar_t(wc, locale)) != WEOF)
- return wchar_t_to_pg_wchar(towlower_l(wint, locale->lt), locale);
- return wc;
+#define DEFINE_WC_CASE_LIBC_SB(case) /* emits to<case>_libc_sb() */ \
+static pg_wchar \
+to##case##_libc_sb(pg_wchar wc, pg_locale_t locale) \
+{ \
+ if (locale->is_default && wc <= (pg_wchar) 127) /* default locale: ASCII goes through pg_ascii_to<case>() */ \
+ return pg_ascii_to##case((unsigned char) wc); \
+ if (wc <= (pg_wchar) UCHAR_MAX) /* fits in one byte: use the char-based libc call */ \
+ return to##case##_l((unsigned char) wc, locale->lt); \
+ else \
+ return wc; /* outside unsigned char range: returned unchanged */ \
}
-static pg_wchar
-tolower_libc_sb(pg_wchar wc, pg_locale_t locale)
-{
- Assert(GetDatabaseEncoding() != PG_UTF8);
+DEFINE_WC_CASE_LIBC_SB(upper);
+DEFINE_WC_CASE_LIBC_SB(lower);
- /* force C behavior for ASCII characters, per comments above */
- if (locale->is_default && wc <= (pg_wchar) 127)
- return pg_ascii_tolower((unsigned char) wc);
- if (wc <= (pg_wchar) UCHAR_MAX)
- return tolower_l((unsigned char) wc, locale->lt);
- else
- return wc;
+#define DEFINE_WC_CASE_LIBC_UTF32(case) /* emits to<case>_libc_utf32() */ \
+static pg_wchar \
+to##case##_libc_utf32(pg_wchar wc, pg_locale_t locale) \
+{ \
+ if (locale->is_default && wc <= (pg_wchar) 127) /* default locale: ASCII goes through pg_ascii_to<case>() */ \
+ return pg_ascii_to##case((unsigned char) wc); \
+ if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF) /* wchar_t is at least 4 bytes, or value is <= 0xFFFF */ \
+ return tow##case##_l((wint_t) wc, locale->lt); \
+ else \
+ return wc; /* above 0xFFFF with a wchar_t narrower than 4 bytes: returned unchanged */ \
}
-static pg_wchar
-tolower_libc_utf32(pg_wchar wc, pg_locale_t locale)
-{
- Assert(GetDatabaseEncoding() == PG_UTF8);
-
- /* force C behavior for ASCII characters, per comments above */
- if (locale->is_default && wc <= (pg_wchar) 127)
- return pg_ascii_tolower((unsigned char) wc);
- if (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)
- return towlower_l((wint_t) wc, locale->lt);
- else
- return wc;
-}
+DEFINE_WC_CASE_LIBC_UTF32(upper);
+DEFINE_WC_CASE_LIBC_UTF32(lower);
static const struct ctype_methods ctype_methods_libc[] = {
[PG_WCHAR_CHAR] = {
--
2.50.1 (Apple Git-155)
On 28.10.25 22:54, Jeff Davis wrote:
I went back to using the check for __cplusplus, and added a comment
that hopefully clarifies things.
Yes, that looks more helpful now.
On Tue, 2025-10-28 at 14:54 -0700, Jeff Davis wrote:
[21:48:21.794] ../../src/include/port/win32_port.h: At top level:
[21:48:21.794] ../../src/include/port/win32_port.h:254:8: error:
redefinition of ‘struct stat’
[21:48:21.794] 254 | struct stat
/* This should match struct __stat64 */
[21:48:21.794] | ^~~~
[21:48:21.794] In file included from /usr/share/mingw-
w64/include/wchar.h:413,
[21:48:21.794] from /usr/share/mingw-
w64/include/uchar.h:28,
[21:48:21.794] from ../../src/include/c.h:526:
[21:48:21.794] /usr/share/mingw-w64/include/_mingw_stat64.h:40:10:
note: originally defined here
[21:48:21.794] 40 | struct stat {
[21:48:21.794] | ^~~~
It seems to work on the two windows CI instances just fine, but fails
mingw_cross_warning.
Apparently, <uchar.h> somehow includes (some portion of?) <sys/stat.h>
on that platform, which then conflicts with the hackery done in
<win32_port.h> (which expects to include <sys/stat.h> itself after some
special #defines).
The attached patch moves the inclusion of <uchar.h> after "port.h",
which solves the problem. It's a bit out of place, but I added a note
in the comment explaining why. I'll go ahead and commit.
Regards,
Jeff Davis
Attachments:
v5-0001-Use-C11-char16_t-and-char32_t-for-Unicode-code-po.patchtext/x-patch; charset=UTF-8; name=v5-0001-Use-C11-char16_t-and-char32_t-for-Unicode-code-po.patchDownload
From 87baaa9f4d6f6b230e082ea8c5667d11997bbeb1 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jeff@j-davis.com>
Date: Tue, 21 Oct 2025 13:16:47 -0700
Subject: [PATCH v5] Use C11 char16_t and char32_t for Unicode code points.
Reviewed-by: Tatsuo Ishii <ishii@postgresql.org>
Reviewed-by: Thomas Munro <thomas.munro@gmail.com>
Reviewed-by: Peter Eisentraut <peter@eisentraut.org>
Discussion: https://postgr.es/m/bedcc93d06203dfd89815b10f815ca2de8626e85.camel%40j-davis.com
---
configure | 2 +-
configure.ac | 1 +
meson.build | 1 +
src/backend/parser/parser.c | 8 +--
src/backend/parser/scan.l | 8 +--
src/backend/utils/adt/jsonpath_scan.l | 6 +-
src/backend/utils/adt/pg_locale_builtin.c | 44 ++++++++++-----
src/backend/utils/adt/varlena.c | 40 ++++++-------
src/backend/utils/mb/mbutils.c | 4 +-
src/common/saslprep.c | 48 ++++++++--------
src/common/unicode/case_test.c | 23 ++++----
src/common/unicode/category_test.c | 3 +-
.../unicode/generate-norm_test_table.pl | 4 +-
.../unicode/generate-unicode_case_table.pl | 7 +--
.../generate-unicode_category_table.pl | 8 +--
src/common/unicode/norm_test.c | 6 +-
src/common/unicode_case.c | 56 +++++++++----------
src/common/unicode_category.c | 50 ++++++++---------
src/common/unicode_norm.c | 56 +++++++++----------
src/fe_utils/mbprint.c | 10 ++--
src/include/c.h | 23 ++++++++
src/include/common/unicode_case.h | 10 ++--
src/include/common/unicode_case_table.h | 13 ++---
src/include/common/unicode_category.h | 46 ++++++++-------
src/include/common/unicode_category_table.h | 8 +--
src/include/common/unicode_norm.h | 6 +-
src/include/mb/pg_wchar.h | 32 +++++------
src/include/pg_config.h.in | 3 +
src/tools/pgindent/typedefs.list | 2 +
29 files changed, 284 insertions(+), 244 deletions(-)
diff --git a/configure b/configure
index 7ce52173dd8..f7c24c8f576 100755
--- a/configure
+++ b/configure
@@ -13627,7 +13627,7 @@ fi
## Header files
##
-for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h mbarrier.h sys/epoll.h sys/event.h sys/personality.h sys/prctl.h sys/procctl.h sys/signalfd.h sys/ucred.h termios.h ucred.h xlocale.h
+for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h mbarrier.h sys/epoll.h sys/event.h sys/personality.h sys/prctl.h sys/procctl.h sys/signalfd.h sys/ucred.h termios.h uchar.h ucred.h xlocale.h
do :
as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default"
diff --git a/configure.ac b/configure.ac
index 0842fd06259..6c802deaacb 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1513,6 +1513,7 @@ AC_CHECK_HEADERS(m4_normalize([
sys/signalfd.h
sys/ucred.h
termios.h
+ uchar.h
ucred.h
xlocale.h
]))
diff --git a/meson.build b/meson.build
index 1a123ce151a..0f61ff6a700 100644
--- a/meson.build
+++ b/meson.build
@@ -2613,6 +2613,7 @@ header_checks = [
'sys/signalfd.h',
'sys/ucred.h',
'termios.h',
+ 'uchar.h',
'ucred.h',
'xlocale.h',
]
diff --git a/src/backend/parser/parser.c b/src/backend/parser/parser.c
index 33a040506b4..a3679f8e86c 100644
--- a/src/backend/parser/parser.c
+++ b/src/backend/parser/parser.c
@@ -339,7 +339,7 @@ hexval(unsigned char c)
/* is Unicode code point acceptable? */
static void
-check_unicode_value(pg_wchar c)
+check_unicode_value(char32_t c)
{
if (!is_valid_unicode_codepoint(c))
ereport(ERROR,
@@ -376,7 +376,7 @@ str_udeescape(const char *str, char escape,
char *new,
*out;
size_t new_len;
- pg_wchar pair_first = 0;
+ char16_t pair_first = 0;
ScannerCallbackState scbstate;
/*
@@ -420,7 +420,7 @@ str_udeescape(const char *str, char escape,
isxdigit((unsigned char) in[3]) &&
isxdigit((unsigned char) in[4]))
{
- pg_wchar unicode;
+ char32_t unicode;
unicode = (hexval(in[1]) << 12) +
(hexval(in[2]) << 8) +
@@ -457,7 +457,7 @@ str_udeescape(const char *str, char escape,
isxdigit((unsigned char) in[6]) &&
isxdigit((unsigned char) in[7]))
{
- pg_wchar unicode;
+ char32_t unicode;
unicode = (hexval(in[2]) << 20) +
(hexval(in[3]) << 16) +
diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l
index 08990831fe8..a67815339b7 100644
--- a/src/backend/parser/scan.l
+++ b/src/backend/parser/scan.l
@@ -121,7 +121,7 @@ static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
static char *litbufdup(core_yyscan_t yyscanner);
static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner);
static int process_integer_literal(const char *token, YYSTYPE *lval, int base);
-static void addunicode(pg_wchar c, yyscan_t yyscanner);
+static void addunicode(char32_t c, yyscan_t yyscanner);
#define yyerror(msg) scanner_yyerror(msg, yyscanner)
@@ -640,7 +640,7 @@ other .
addlit(yytext, yyleng, yyscanner);
}
<xe>{xeunicode} {
- pg_wchar c = strtoul(yytext + 2, NULL, 16);
+ char32_t c = strtoul(yytext + 2, NULL, 16);
/*
* For consistency with other productions, issue any
@@ -668,7 +668,7 @@ other .
POP_YYLLOC();
}
<xeu>{xeunicode} {
- pg_wchar c = strtoul(yytext + 2, NULL, 16);
+ char32_t c = strtoul(yytext + 2, NULL, 16);
/* Remember start of overall string token ... */
PUSH_YYLLOC();
@@ -1376,7 +1376,7 @@ process_integer_literal(const char *token, YYSTYPE *lval, int base)
}
static void
-addunicode(pg_wchar c, core_yyscan_t yyscanner)
+addunicode(char32_t c, core_yyscan_t yyscanner)
{
ScannerCallbackState scbstate;
char buf[MAX_UNICODE_EQUIVALENT_STRING + 1];
diff --git a/src/backend/utils/adt/jsonpath_scan.l b/src/backend/utils/adt/jsonpath_scan.l
index c7aab83eeb4..8c3a0a9c642 100644
--- a/src/backend/utils/adt/jsonpath_scan.l
+++ b/src/backend/utils/adt/jsonpath_scan.l
@@ -574,7 +574,7 @@ hexval(char c, int *result, struct Node *escontext, yyscan_t yyscanner)
/* Add given unicode character to scanstring */
static bool
-addUnicodeChar(int ch, struct Node *escontext, yyscan_t yyscanner)
+addUnicodeChar(char32_t ch, struct Node *escontext, yyscan_t yyscanner)
{
if (ch == 0)
{
@@ -607,7 +607,7 @@ addUnicodeChar(int ch, struct Node *escontext, yyscan_t yyscanner)
/* Add unicode character, processing any surrogate pairs */
static bool
-addUnicode(int ch, int *hi_surrogate, struct Node *escontext, yyscan_t yyscanner)
+addUnicode(char32_t ch, int *hi_surrogate, struct Node *escontext, yyscan_t yyscanner)
{
if (is_utf16_surrogate_first(ch))
{
@@ -655,7 +655,7 @@ parseUnicode(char *s, int l, struct Node *escontext, yyscan_t yyscanner)
for (i = 2; i < l; i += 2) /* skip '\u' */
{
- int ch = 0;
+ char32_t ch = 0;
int j,
si;
diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c
index 3dc611b50e1..1021e0d129b 100644
--- a/src/backend/utils/adt/pg_locale_builtin.c
+++ b/src/backend/utils/adt/pg_locale_builtin.c
@@ -15,7 +15,6 @@
#include "catalog/pg_collation.h"
#include "common/unicode_case.h"
#include "common/unicode_category.h"
-#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "utils/builtins.h"
#include "utils/pg_locale.h"
@@ -35,6 +34,23 @@ struct WordBoundaryState
bool prev_alnum;
};
+/*
+ * In UTF-8, pg_wchar is guaranteed to be the code point value.
+ */
+static inline char32_t
+to_char32(pg_wchar wc)
+{
+ Assert(GetDatabaseEncoding() == PG_UTF8);
+ return (char32_t) wc;
+}
+
+static inline pg_wchar
+to_pg_wchar(char32_t c32)
+{
+ Assert(GetDatabaseEncoding() == PG_UTF8);
+ return (pg_wchar) c32;
+}
+
/*
* Simple word boundary iterator that draws boundaries each time the result of
* pg_u_isalnum() changes.
@@ -47,7 +63,7 @@ initcap_wbnext(void *state)
while (wbstate->offset < wbstate->len &&
wbstate->str[wbstate->offset] != '\0')
{
- pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
+ char32_t u = utf8_to_unicode((unsigned char *) wbstate->str +
wbstate->offset);
bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
@@ -112,61 +128,61 @@ strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen,
static bool
wc_isdigit_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isdigit(wc, !locale->builtin.casemap_full);
+ return pg_u_isdigit(to_char32(wc), !locale->builtin.casemap_full);
}
static bool
wc_isalpha_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isalpha(wc);
+ return pg_u_isalpha(to_char32(wc));
}
static bool
wc_isalnum_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isalnum(wc, !locale->builtin.casemap_full);
+ return pg_u_isalnum(to_char32(wc), !locale->builtin.casemap_full);
}
static bool
wc_isupper_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isupper(wc);
+ return pg_u_isupper(to_char32(wc));
}
static bool
wc_islower_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_islower(wc);
+ return pg_u_islower(to_char32(wc));
}
static bool
wc_isgraph_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isgraph(wc);
+ return pg_u_isgraph(to_char32(wc));
}
static bool
wc_isprint_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isprint(wc);
+ return pg_u_isprint(to_char32(wc));
}
static bool
wc_ispunct_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_ispunct(wc, !locale->builtin.casemap_full);
+ return pg_u_ispunct(to_char32(wc), !locale->builtin.casemap_full);
}
static bool
wc_isspace_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isspace(wc);
+ return pg_u_isspace(to_char32(wc));
}
static bool
wc_isxdigit_builtin(pg_wchar wc, pg_locale_t locale)
{
- return pg_u_isxdigit(wc, !locale->builtin.casemap_full);
+ return pg_u_isxdigit(to_char32(wc), !locale->builtin.casemap_full);
}
static bool
@@ -179,13 +195,13 @@ char_is_cased_builtin(char ch, pg_locale_t locale)
static pg_wchar
wc_toupper_builtin(pg_wchar wc, pg_locale_t locale)
{
- return unicode_uppercase_simple(wc);
+ return to_pg_wchar(unicode_uppercase_simple(to_char32(wc)));
}
static pg_wchar
wc_tolower_builtin(pg_wchar wc, pg_locale_t locale)
{
- return unicode_lowercase_simple(wc);
+ return to_pg_wchar(unicode_lowercase_simple(to_char32(wc)));
}
static const struct ctype_methods ctype_methods_builtin = {
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 2c398cd9e5c..8d735786e51 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -5419,12 +5419,12 @@ unicode_assigned(PG_FUNCTION_ARGS)
ereport(ERROR,
(errmsg("Unicode categorization can only be performed if server encoding is UTF8")));
- /* convert to pg_wchar */
+ /* convert to char32_t */
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
p = (unsigned char *) VARDATA_ANY(input);
for (int i = 0; i < size; i++)
{
- pg_wchar uchar = utf8_to_unicode(p);
+ char32_t uchar = utf8_to_unicode(p);
int category = unicode_category(uchar);
if (category == PG_U_UNASSIGNED)
@@ -5443,24 +5443,24 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
UnicodeNormalizationForm form;
int size;
- pg_wchar *input_chars;
- pg_wchar *output_chars;
+ char32_t *input_chars;
+ char32_t *output_chars;
unsigned char *p;
text *result;
int i;
form = unicode_norm_form_from_string(formstr);
- /* convert to pg_wchar */
+ /* convert to char32_t */
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
- input_chars = palloc((size + 1) * sizeof(pg_wchar));
+ input_chars = palloc((size + 1) * sizeof(char32_t));
p = (unsigned char *) VARDATA_ANY(input);
for (i = 0; i < size; i++)
{
input_chars[i] = utf8_to_unicode(p);
p += pg_utf_mblen(p);
}
- input_chars[i] = (pg_wchar) '\0';
+ input_chars[i] = (char32_t) '\0';
Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
/* action */
@@ -5468,7 +5468,7 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
/* convert back to UTF-8 string */
size = 0;
- for (pg_wchar *wp = output_chars; *wp; wp++)
+ for (char32_t *wp = output_chars; *wp; wp++)
{
unsigned char buf[4];
@@ -5480,7 +5480,7 @@ unicode_normalize_func(PG_FUNCTION_ARGS)
SET_VARSIZE(result, size + VARHDRSZ);
p = (unsigned char *) VARDATA_ANY(result);
- for (pg_wchar *wp = output_chars; *wp; wp++)
+ for (char32_t *wp = output_chars; *wp; wp++)
{
unicode_to_utf8(*wp, p);
p += pg_utf_mblen(p);
@@ -5509,8 +5509,8 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
char *formstr = text_to_cstring(PG_GETARG_TEXT_PP(1));
UnicodeNormalizationForm form;
int size;
- pg_wchar *input_chars;
- pg_wchar *output_chars;
+ char32_t *input_chars;
+ char32_t *output_chars;
unsigned char *p;
int i;
UnicodeNormalizationQC quickcheck;
@@ -5519,16 +5519,16 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
form = unicode_norm_form_from_string(formstr);
- /* convert to pg_wchar */
+ /* convert to char32_t */
size = pg_mbstrlen_with_len(VARDATA_ANY(input), VARSIZE_ANY_EXHDR(input));
- input_chars = palloc((size + 1) * sizeof(pg_wchar));
+ input_chars = palloc((size + 1) * sizeof(char32_t));
p = (unsigned char *) VARDATA_ANY(input);
for (i = 0; i < size; i++)
{
input_chars[i] = utf8_to_unicode(p);
p += pg_utf_mblen(p);
}
- input_chars[i] = (pg_wchar) '\0';
+ input_chars[i] = (char32_t) '\0';
Assert((char *) p == VARDATA_ANY(input) + VARSIZE_ANY_EXHDR(input));
/* quick check (see UAX #15) */
@@ -5542,11 +5542,11 @@ unicode_is_normalized(PG_FUNCTION_ARGS)
output_chars = unicode_normalize(form, input_chars);
output_size = 0;
- for (pg_wchar *wp = output_chars; *wp; wp++)
+ for (char32_t *wp = output_chars; *wp; wp++)
output_size++;
result = (size == output_size) &&
- (memcmp(input_chars, output_chars, size * sizeof(pg_wchar)) == 0);
+ (memcmp(input_chars, output_chars, size * sizeof(char32_t)) == 0);
PG_RETURN_BOOL(result);
}
@@ -5602,7 +5602,7 @@ unistr(PG_FUNCTION_ARGS)
int len;
StringInfoData str;
text *result;
- pg_wchar pair_first = 0;
+ char16_t pair_first = 0;
char cbuf[MAX_UNICODE_EQUIVALENT_STRING + 1];
instr = VARDATA_ANY(input_text);
@@ -5626,7 +5626,7 @@ unistr(PG_FUNCTION_ARGS)
else if ((len >= 5 && isxdigits_n(instr + 1, 4)) ||
(len >= 6 && instr[1] == 'u' && isxdigits_n(instr + 2, 4)))
{
- pg_wchar unicode;
+ char32_t unicode;
int offset = instr[1] == 'u' ? 2 : 1;
unicode = hexval_n(instr + offset, 4);
@@ -5662,7 +5662,7 @@ unistr(PG_FUNCTION_ARGS)
}
else if (len >= 8 && instr[1] == '+' && isxdigits_n(instr + 2, 6))
{
- pg_wchar unicode;
+ char32_t unicode;
unicode = hexval_n(instr + 2, 6);
@@ -5697,7 +5697,7 @@ unistr(PG_FUNCTION_ARGS)
}
else if (len >= 10 && instr[1] == 'U' && isxdigits_n(instr + 2, 8))
{
- pg_wchar unicode;
+ char32_t unicode;
unicode = hexval_n(instr + 2, 8);
diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c
index 886ecbad871..fb629ed5c8f 100644
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -862,7 +862,7 @@ perform_default_encoding_conversion(const char *src, int len,
* may call this outside any transaction, or in an aborted transaction.
*/
void
-pg_unicode_to_server(pg_wchar c, unsigned char *s)
+pg_unicode_to_server(char32_t c, unsigned char *s)
{
unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
int c_as_utf8_len;
@@ -924,7 +924,7 @@ pg_unicode_to_server(pg_wchar c, unsigned char *s)
* but simply return false on conversion failure.
*/
bool
-pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s)
+pg_unicode_to_server_noerror(char32_t c, unsigned char *s)
{
unsigned char c_as_utf8[MAX_MULTIBYTE_CHAR_LEN + 1];
int c_as_utf8_len;
diff --git a/src/common/saslprep.c b/src/common/saslprep.c
index 97beb47940b..101e8d65a4d 100644
--- a/src/common/saslprep.c
+++ b/src/common/saslprep.c
@@ -47,7 +47,7 @@
/* Prototypes for local functions */
static int codepoint_range_cmp(const void *a, const void *b);
-static bool is_code_in_table(pg_wchar code, const pg_wchar *map, int mapsize);
+static bool is_code_in_table(char32_t code, const char32_t *map, int mapsize);
static int pg_utf8_string_len(const char *source);
/*
@@ -64,7 +64,7 @@ static int pg_utf8_string_len(const char *source);
*
* These are all mapped to the ASCII space character (U+00A0).
*/
-static const pg_wchar non_ascii_space_ranges[] =
+static const char32_t non_ascii_space_ranges[] =
{
0x00A0, 0x00A0,
0x1680, 0x1680,
@@ -79,7 +79,7 @@ static const pg_wchar non_ascii_space_ranges[] =
*
* If any of these appear in the input, they are removed.
*/
-static const pg_wchar commonly_mapped_to_nothing_ranges[] =
+static const char32_t commonly_mapped_to_nothing_ranges[] =
{
0x00AD, 0x00AD,
0x034F, 0x034F,
@@ -114,7 +114,7 @@ static const pg_wchar commonly_mapped_to_nothing_ranges[] =
* tables, so one code might originate from multiple source tables.
* Adjacent ranges have also been merged together, to save space.
*/
-static const pg_wchar prohibited_output_ranges[] =
+static const char32_t prohibited_output_ranges[] =
{
0x0000, 0x001F, /* C.2.1 */
0x007F, 0x00A0, /* C.1.2, C.2.1, C.2.2 */
@@ -155,7 +155,7 @@ static const pg_wchar prohibited_output_ranges[] =
};
/* A.1 Unassigned code points in Unicode 3.2 */
-static const pg_wchar unassigned_codepoint_ranges[] =
+static const char32_t unassigned_codepoint_ranges[] =
{
0x0221, 0x0221,
0x0234, 0x024F,
@@ -556,7 +556,7 @@ static const pg_wchar unassigned_codepoint_ranges[] =
};
/* D.1 Characters with bidirectional property "R" or "AL" */
-static const pg_wchar RandALCat_codepoint_ranges[] =
+static const char32_t RandALCat_codepoint_ranges[] =
{
0x05BE, 0x05BE,
0x05C0, 0x05C0,
@@ -595,7 +595,7 @@ static const pg_wchar RandALCat_codepoint_ranges[] =
};
/* D.2 Characters with bidirectional property "L" */
-static const pg_wchar LCat_codepoint_ranges[] =
+static const char32_t LCat_codepoint_ranges[] =
{
0x0041, 0x005A,
0x0061, 0x007A,
@@ -968,8 +968,8 @@ static const pg_wchar LCat_codepoint_ranges[] =
static int
codepoint_range_cmp(const void *a, const void *b)
{
- const pg_wchar *key = (const pg_wchar *) a;
- const pg_wchar *range = (const pg_wchar *) b;
+ const char32_t *key = (const char32_t *) a;
+ const char32_t *range = (const char32_t *) b;
if (*key < range[0])
return -1; /* less than lower bound */
@@ -980,14 +980,14 @@ codepoint_range_cmp(const void *a, const void *b)
}
static bool
-is_code_in_table(pg_wchar code, const pg_wchar *map, int mapsize)
+is_code_in_table(char32_t code, const char32_t *map, int mapsize)
{
Assert(mapsize % 2 == 0);
if (code < map[0] || code > map[mapsize - 1])
return false;
- if (bsearch(&code, map, mapsize / 2, sizeof(pg_wchar) * 2,
+ if (bsearch(&code, map, mapsize / 2, sizeof(char32_t) * 2,
codepoint_range_cmp))
return true;
else
@@ -1046,8 +1046,8 @@ pg_utf8_string_len(const char *source)
pg_saslprep_rc
pg_saslprep(const char *input, char **output)
{
- pg_wchar *input_chars = NULL;
- pg_wchar *output_chars = NULL;
+ char32_t *input_chars = NULL;
+ char32_t *output_chars = NULL;
int input_size;
char *result;
int result_size;
@@ -1055,7 +1055,7 @@ pg_saslprep(const char *input, char **output)
int i;
bool contains_RandALCat;
unsigned char *p;
- pg_wchar *wp;
+ char32_t *wp;
/* Ensure we return *output as NULL on failure */
*output = NULL;
@@ -1080,10 +1080,10 @@ pg_saslprep(const char *input, char **output)
input_size = pg_utf8_string_len(input);
if (input_size < 0)
return SASLPREP_INVALID_UTF8;
- if (input_size >= MaxAllocSize / sizeof(pg_wchar))
+ if (input_size >= MaxAllocSize / sizeof(char32_t))
goto oom;
- input_chars = ALLOC((input_size + 1) * sizeof(pg_wchar));
+ input_chars = ALLOC((input_size + 1) * sizeof(char32_t));
if (!input_chars)
goto oom;
@@ -1093,7 +1093,7 @@ pg_saslprep(const char *input, char **output)
input_chars[i] = utf8_to_unicode(p);
p += pg_utf_mblen(p);
}
- input_chars[i] = (pg_wchar) '\0';
+ input_chars[i] = (char32_t) '\0';
/*
* The steps below correspond to the steps listed in [RFC3454], Section
@@ -1107,7 +1107,7 @@ pg_saslprep(const char *input, char **output)
count = 0;
for (i = 0; i < input_size; i++)
{
- pg_wchar code = input_chars[i];
+ char32_t code = input_chars[i];
if (IS_CODE_IN_TABLE(code, non_ascii_space_ranges))
input_chars[count++] = 0x0020;
@@ -1118,7 +1118,7 @@ pg_saslprep(const char *input, char **output)
else
input_chars[count++] = code;
}
- input_chars[count] = (pg_wchar) '\0';
+ input_chars[count] = (char32_t) '\0';
input_size = count;
if (input_size == 0)
@@ -1138,7 +1138,7 @@ pg_saslprep(const char *input, char **output)
*/
for (i = 0; i < input_size; i++)
{
- pg_wchar code = input_chars[i];
+ char32_t code = input_chars[i];
if (IS_CODE_IN_TABLE(code, prohibited_output_ranges))
goto prohibited;
@@ -1170,7 +1170,7 @@ pg_saslprep(const char *input, char **output)
contains_RandALCat = false;
for (i = 0; i < input_size; i++)
{
- pg_wchar code = input_chars[i];
+ char32_t code = input_chars[i];
if (IS_CODE_IN_TABLE(code, RandALCat_codepoint_ranges))
{
@@ -1181,12 +1181,12 @@ pg_saslprep(const char *input, char **output)
if (contains_RandALCat)
{
- pg_wchar first = input_chars[0];
- pg_wchar last = input_chars[input_size - 1];
+ char32_t first = input_chars[0];
+ char32_t last = input_chars[input_size - 1];
for (i = 0; i < input_size; i++)
{
- pg_wchar code = input_chars[i];
+ char32_t code = input_chars[i];
if (IS_CODE_IN_TABLE(code, LCat_codepoint_ranges))
goto prohibited;
diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c
index fdfb62e8552..00d4f85e5a5 100644
--- a/src/common/unicode/case_test.c
+++ b/src/common/unicode/case_test.c
@@ -24,6 +24,7 @@
#include "common/unicode_case.h"
#include "common/unicode_category.h"
#include "common/unicode_version.h"
+#include "mb/pg_wchar.h"
/* enough to hold largest source or result string, including NUL */
#define BUFSZ 256
@@ -54,7 +55,7 @@ initcap_wbnext(void *state)
while (wbstate->offset < wbstate->len &&
wbstate->str[wbstate->offset] != '\0')
{
- pg_wchar u = utf8_to_unicode((unsigned char *) wbstate->str +
+ char32_t u = utf8_to_unicode((unsigned char *) wbstate->str +
wbstate->offset);
bool curr_alnum = pg_u_isalnum(u, wbstate->posix);
@@ -77,16 +78,16 @@ initcap_wbnext(void *state)
#ifdef USE_ICU
static void
-icu_test_simple(pg_wchar code)
+icu_test_simple(char32_t code)
{
- pg_wchar lower = unicode_lowercase_simple(code);
- pg_wchar title = unicode_titlecase_simple(code);
- pg_wchar upper = unicode_uppercase_simple(code);
- pg_wchar fold = unicode_casefold_simple(code);
- pg_wchar iculower = u_tolower(code);
- pg_wchar icutitle = u_totitle(code);
- pg_wchar icuupper = u_toupper(code);
- pg_wchar icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
+ char32_t lower = unicode_lowercase_simple(code);
+ char32_t title = unicode_titlecase_simple(code);
+ char32_t upper = unicode_uppercase_simple(code);
+ char32_t fold = unicode_casefold_simple(code);
+ char32_t iculower = u_tolower(code);
+ char32_t icutitle = u_totitle(code);
+ char32_t icuupper = u_toupper(code);
+ char32_t icufold = u_foldCase(code, U_FOLD_CASE_DEFAULT);
if (lower != iculower || title != icutitle || upper != icuupper ||
fold != icufold)
@@ -172,7 +173,7 @@ test_icu(void)
int successful = 0;
int skipped_mismatch = 0;
- for (pg_wchar code = 0; code <= 0x10ffff; code++)
+ for (char32_t code = 0; code <= 0x10ffff; code++)
{
pg_unicode_category category = unicode_category(code);
diff --git a/src/common/unicode/category_test.c b/src/common/unicode/category_test.c
index 5d37ba39196..1e8c1f7905f 100644
--- a/src/common/unicode/category_test.c
+++ b/src/common/unicode/category_test.c
@@ -22,6 +22,7 @@
#include "common/unicode_category.h"
#include "common/unicode_version.h"
+#include "mb/pg_wchar.h"
static int pg_unicode_version = 0;
#ifdef USE_ICU
@@ -59,7 +60,7 @@ icu_test()
int pg_skipped_codepoints = 0;
int icu_skipped_codepoints = 0;
- for (pg_wchar code = 0; code <= 0x10ffff; code++)
+ for (char32_t code = 0; code <= 0x10ffff; code++)
{
uint8_t pg_category = unicode_category(code);
uint8_t icu_category = u_charType(code);
diff --git a/src/common/unicode/generate-norm_test_table.pl b/src/common/unicode/generate-norm_test_table.pl
index 1b401be9409..1a8b908ff33 100644
--- a/src/common/unicode/generate-norm_test_table.pl
+++ b/src/common/unicode/generate-norm_test_table.pl
@@ -47,8 +47,8 @@ print $OUTPUT <<HEADER;
typedef struct
{
int linenum;
- pg_wchar input[50];
- pg_wchar output[4][50];
+ char32_t input[50];
+ char32_t output[4][50];
} pg_unicode_test;
/* test table */
diff --git a/src/common/unicode/generate-unicode_case_table.pl b/src/common/unicode/generate-unicode_case_table.pl
index 5d9ddd62803..f71eb25c94e 100644
--- a/src/common/unicode/generate-unicode_case_table.pl
+++ b/src/common/unicode/generate-unicode_case_table.pl
@@ -270,7 +270,6 @@ print $OT <<"EOS";
*/
#include "common/unicode_case.h"
-#include "mb/pg_wchar.h"
/*
* The maximum number of codepoints that can result from case mapping
@@ -297,7 +296,7 @@ typedef enum
typedef struct
{
int16 conditions;
- pg_wchar map[NCaseKind][MAX_CASE_EXPANSION];
+ char32_t map[NCaseKind][MAX_CASE_EXPANSION];
} pg_special_case;
/*
@@ -430,7 +429,7 @@ foreach my $kind ('lower', 'title', 'upper', 'fold')
* The entry case_map_${kind}[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_$kind\[$index\] =
+static const char32_t case_map_$kind\[$index\] =
{
EOS
@@ -502,7 +501,7 @@ print $OT <<"EOS";
* the offset into the mapping tables.
*/
static inline uint16
-case_index(pg_wchar cp)
+case_index(char32_t cp)
{
/* Fast path for codepoints < $fastpath_limit */
if (cp < $fastpath_limit)
diff --git a/src/common/unicode/generate-unicode_category_table.pl b/src/common/unicode/generate-unicode_category_table.pl
index abab5cd9696..7e094b13720 100644
--- a/src/common/unicode/generate-unicode_category_table.pl
+++ b/src/common/unicode/generate-unicode_category_table.pl
@@ -366,15 +366,15 @@ print $OT <<"EOS";
*/
typedef struct
{
- uint32 first; /* Unicode codepoint */
- uint32 last; /* Unicode codepoint */
+ char32_t first; /* Unicode codepoint */
+ char32_t last; /* Unicode codepoint */
uint8 category; /* General Category */
} pg_category_range;
typedef struct
{
- uint32 first; /* Unicode codepoint */
- uint32 last; /* Unicode codepoint */
+ char32_t first; /* Unicode codepoint */
+ char32_t last; /* Unicode codepoint */
} pg_unicode_range;
typedef struct
diff --git a/src/common/unicode/norm_test.c b/src/common/unicode/norm_test.c
index 25bc59463f2..058817f1719 100644
--- a/src/common/unicode/norm_test.c
+++ b/src/common/unicode/norm_test.c
@@ -20,7 +20,7 @@
#include "norm_test_table.h"
static char *
-print_wchar_str(const pg_wchar *s)
+print_wchar_str(const char32_t *s)
{
#define BUF_DIGITS 50
static char buf[BUF_DIGITS * 11 + 1];
@@ -41,7 +41,7 @@ print_wchar_str(const pg_wchar *s)
}
static int
-pg_wcscmp(const pg_wchar *s1, const pg_wchar *s2)
+pg_wcscmp(const char32_t *s1, const char32_t *s2)
{
for (;;)
{
@@ -65,7 +65,7 @@ main(int argc, char **argv)
{
for (int form = 0; form < 4; form++)
{
- pg_wchar *result;
+ char32_t *result;
result = unicode_normalize(form, test->input);
diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c
index 073faf6a0d5..e5e494db43c 100644
--- a/src/common/unicode_case.c
+++ b/src/common/unicode_case.c
@@ -30,7 +30,7 @@ enum CaseMapResult
/*
* Map for each case kind.
*/
-static const pg_wchar *const casekind_map[NCaseKind] =
+static const char32_t *const casekind_map[NCaseKind] =
{
[CaseLower] = case_map_lower,
[CaseTitle] = case_map_title,
@@ -38,42 +38,42 @@ static const pg_wchar *const casekind_map[NCaseKind] =
[CaseFold] = case_map_fold,
};
-static pg_wchar find_case_map(pg_wchar ucs, const pg_wchar *map);
+static char32_t find_case_map(char32_t ucs, const char32_t *map);
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
void *wbstate);
-static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
+static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full,
const char *src, size_t srclen, size_t srcoff,
- pg_wchar *simple, const pg_wchar **special);
+ char32_t *simple, const char32_t **special);
-pg_wchar
-unicode_lowercase_simple(pg_wchar code)
+char32_t
+unicode_lowercase_simple(char32_t code)
{
- pg_wchar cp = find_case_map(code, case_map_lower);
+ char32_t cp = find_case_map(code, case_map_lower);
return cp != 0 ? cp : code;
}
-pg_wchar
-unicode_titlecase_simple(pg_wchar code)
+char32_t
+unicode_titlecase_simple(char32_t code)
{
- pg_wchar cp = find_case_map(code, case_map_title);
+ char32_t cp = find_case_map(code, case_map_title);
return cp != 0 ? cp : code;
}
-pg_wchar
-unicode_uppercase_simple(pg_wchar code)
+char32_t
+unicode_uppercase_simple(char32_t code)
{
- pg_wchar cp = find_case_map(code, case_map_upper);
+ char32_t cp = find_case_map(code, case_map_upper);
return cp != 0 ? cp : code;
}
-pg_wchar
-unicode_casefold_simple(pg_wchar code)
+char32_t
+unicode_casefold_simple(char32_t code)
{
- pg_wchar cp = find_case_map(code, case_map_fold);
+ char32_t cp = find_case_map(code, case_map_fold);
return cp != 0 ? cp : code;
}
@@ -231,10 +231,10 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
{
- pg_wchar u1 = utf8_to_unicode((unsigned char *) src + srcoff);
+ char32_t u1 = utf8_to_unicode((unsigned char *) src + srcoff);
int u1len = unicode_utf8len(u1);
- pg_wchar simple = 0;
- const pg_wchar *special = NULL;
+ char32_t simple = 0;
+ const char32_t *special = NULL;
enum CaseMapResult casemap_result;
if (str_casekind == CaseTitle)
@@ -265,8 +265,8 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
case CASEMAP_SIMPLE:
{
/* replace with single character */
- pg_wchar u2 = simple;
- pg_wchar u2len = unicode_utf8len(u2);
+ char32_t u2 = simple;
+ size_t u2len = unicode_utf8len(u2);
Assert(special == NULL);
if (result_len + u2len <= dstsize)
@@ -280,7 +280,7 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
Assert(simple == 0);
for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
{
- pg_wchar u2 = special[i];
+ char32_t u2 = special[i];
size_t u2len = unicode_utf8len(u2);
if (result_len + u2len <= dstsize)
@@ -320,7 +320,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
{
- pg_wchar curr = utf8_to_unicode(str + i);
+ char32_t curr = utf8_to_unicode(str + i);
if (pg_u_prop_case_ignorable(curr))
continue;
@@ -344,7 +344,7 @@ check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
{
- pg_wchar curr = utf8_to_unicode(str + i);
+ char32_t curr = utf8_to_unicode(str + i);
if (pg_u_prop_case_ignorable(curr))
continue;
@@ -394,9 +394,9 @@ check_special_conditions(int conditions, const char *str, size_t len,
* character without modification.
*/
static enum CaseMapResult
-casemap(pg_wchar u1, CaseKind casekind, bool full,
+casemap(char32_t u1, CaseKind casekind, bool full,
const char *src, size_t srclen, size_t srcoff,
- pg_wchar *simple, const pg_wchar **special)
+ char32_t *simple, const char32_t **special)
{
uint16 idx;
@@ -434,8 +434,8 @@ casemap(pg_wchar u1, CaseKind casekind, bool full,
* Find entry in simple case map.
* If the entry does not exist, 0 will be returned.
*/
-static pg_wchar
-find_case_map(pg_wchar ucs, const pg_wchar *map)
+static char32_t
+find_case_map(char32_t ucs, const char32_t *map)
{
/* Fast path for codepoints < 0x80 */
if (ucs < 0x80)
diff --git a/src/common/unicode_category.c b/src/common/unicode_category.c
index 4136c4d4f92..aab667a7bb4 100644
--- a/src/common/unicode_category.c
+++ b/src/common/unicode_category.c
@@ -1,7 +1,7 @@
/*-------------------------------------------------------------------------
* unicode_category.c
* Determine general category and character properties of Unicode
- * characters. Encoding must be UTF8, where we assume that the pg_wchar
+ * characters. Encoding must be UTF8, where we assume that the char32_t
* representation is a code point.
*
* Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
@@ -76,13 +76,13 @@
#define PG_U_CHARACTER_TAB 0x09
static bool range_search(const pg_unicode_range *tbl, size_t size,
- pg_wchar code);
+ char32_t code);
/*
* Unicode general category for the given codepoint.
*/
pg_unicode_category
-unicode_category(pg_wchar code)
+unicode_category(char32_t code)
{
int min = 0;
int mid;
@@ -108,7 +108,7 @@ unicode_category(pg_wchar code)
}
bool
-pg_u_prop_alphabetic(pg_wchar code)
+pg_u_prop_alphabetic(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_ALPHABETIC;
@@ -119,7 +119,7 @@ pg_u_prop_alphabetic(pg_wchar code)
}
bool
-pg_u_prop_lowercase(pg_wchar code)
+pg_u_prop_lowercase(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_LOWERCASE;
@@ -130,7 +130,7 @@ pg_u_prop_lowercase(pg_wchar code)
}
bool
-pg_u_prop_uppercase(pg_wchar code)
+pg_u_prop_uppercase(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_UPPERCASE;
@@ -141,7 +141,7 @@ pg_u_prop_uppercase(pg_wchar code)
}
bool
-pg_u_prop_cased(pg_wchar code)
+pg_u_prop_cased(char32_t code)
{
uint32 category_mask;
@@ -156,7 +156,7 @@ pg_u_prop_cased(pg_wchar code)
}
bool
-pg_u_prop_case_ignorable(pg_wchar code)
+pg_u_prop_case_ignorable(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_CASE_IGNORABLE;
@@ -167,7 +167,7 @@ pg_u_prop_case_ignorable(pg_wchar code)
}
bool
-pg_u_prop_white_space(pg_wchar code)
+pg_u_prop_white_space(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_WHITE_SPACE;
@@ -178,7 +178,7 @@ pg_u_prop_white_space(pg_wchar code)
}
bool
-pg_u_prop_hex_digit(pg_wchar code)
+pg_u_prop_hex_digit(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_HEX_DIGIT;
@@ -189,7 +189,7 @@ pg_u_prop_hex_digit(pg_wchar code)
}
bool
-pg_u_prop_join_control(pg_wchar code)
+pg_u_prop_join_control(char32_t code)
{
if (code < 0x80)
return unicode_opt_ascii[code].properties & PG_U_PROP_JOIN_CONTROL;
@@ -208,7 +208,7 @@ pg_u_prop_join_control(pg_wchar code)
*/
bool
-pg_u_isdigit(pg_wchar code, bool posix)
+pg_u_isdigit(char32_t code, bool posix)
{
if (posix)
return ('0' <= code && code <= '9');
@@ -217,19 +217,19 @@ pg_u_isdigit(pg_wchar code, bool posix)
}
bool
-pg_u_isalpha(pg_wchar code)
+pg_u_isalpha(char32_t code)
{
return pg_u_prop_alphabetic(code);
}
bool
-pg_u_isalnum(pg_wchar code, bool posix)
+pg_u_isalnum(char32_t code, bool posix)
{
return pg_u_isalpha(code) || pg_u_isdigit(code, posix);
}
bool
-pg_u_isword(pg_wchar code)
+pg_u_isword(char32_t code)
{
uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
@@ -240,32 +240,32 @@ pg_u_isword(pg_wchar code)
}
bool
-pg_u_isupper(pg_wchar code)
+pg_u_isupper(char32_t code)
{
return pg_u_prop_uppercase(code);
}
bool
-pg_u_islower(pg_wchar code)
+pg_u_islower(char32_t code)
{
return pg_u_prop_lowercase(code);
}
bool
-pg_u_isblank(pg_wchar code)
+pg_u_isblank(char32_t code)
{
return code == PG_U_CHARACTER_TAB ||
unicode_category(code) == PG_U_SPACE_SEPARATOR;
}
bool
-pg_u_iscntrl(pg_wchar code)
+pg_u_iscntrl(char32_t code)
{
return unicode_category(code) == PG_U_CONTROL;
}
bool
-pg_u_isgraph(pg_wchar code)
+pg_u_isgraph(char32_t code)
{
uint32 category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
@@ -276,7 +276,7 @@ pg_u_isgraph(pg_wchar code)
}
bool
-pg_u_isprint(pg_wchar code)
+pg_u_isprint(char32_t code)
{
pg_unicode_category category = unicode_category(code);
@@ -287,7 +287,7 @@ pg_u_isprint(pg_wchar code)
}
bool
-pg_u_ispunct(pg_wchar code, bool posix)
+pg_u_ispunct(char32_t code, bool posix)
{
uint32 category_mask;
@@ -308,13 +308,13 @@ pg_u_ispunct(pg_wchar code, bool posix)
}
bool
-pg_u_isspace(pg_wchar code)
+pg_u_isspace(char32_t code)
{
return pg_u_prop_white_space(code);
}
bool
-pg_u_isxdigit(pg_wchar code, bool posix)
+pg_u_isxdigit(char32_t code, bool posix)
{
if (posix)
return (('0' <= code && code <= '9') ||
@@ -478,7 +478,7 @@ unicode_category_abbrev(pg_unicode_category category)
* given table.
*/
static bool
-range_search(const pg_unicode_range *tbl, size_t size, pg_wchar code)
+range_search(const pg_unicode_range *tbl, size_t size, char32_t code)
{
int min = 0;
int mid;
diff --git a/src/common/unicode_norm.c b/src/common/unicode_norm.c
index 6654b4cbc49..489d99cd5ab 100644
--- a/src/common/unicode_norm.c
+++ b/src/common/unicode_norm.c
@@ -69,7 +69,7 @@ conv_compare(const void *p1, const void *p2)
* lookup, while the frontend version uses a binary search.
*/
static const pg_unicode_decomposition *
-get_code_entry(pg_wchar code)
+get_code_entry(char32_t code)
{
#ifndef FRONTEND
int h;
@@ -109,7 +109,7 @@ get_code_entry(pg_wchar code)
* Get the combining class of the given codepoint.
*/
static uint8
-get_canonical_class(pg_wchar code)
+get_canonical_class(char32_t code)
{
const pg_unicode_decomposition *entry = get_code_entry(code);
@@ -130,15 +130,15 @@ get_canonical_class(pg_wchar code)
* Note: the returned pointer can point to statically allocated buffer, and
* is only valid until next call to this function!
*/
-static const pg_wchar *
+static const char32_t *
get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
{
- static pg_wchar x;
+ static char32_t x;
if (DECOMPOSITION_IS_INLINE(entry))
{
Assert(DECOMPOSITION_SIZE(entry) == 1);
- x = (pg_wchar) entry->dec_index;
+ x = (char32_t) entry->dec_index;
*dec_size = 1;
return &x;
}
@@ -156,7 +156,7 @@ get_code_decomposition(const pg_unicode_decomposition *entry, int *dec_size)
* are, in turn, decomposable.
*/
static int
-get_decomposed_size(pg_wchar code, bool compat)
+get_decomposed_size(char32_t code, bool compat)
{
const pg_unicode_decomposition *entry;
int size = 0;
@@ -318,7 +318,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
* in the array result.
*/
static void
-decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
+decompose_code(char32_t code, bool compat, char32_t **result, int *current)
{
const pg_unicode_decomposition *entry;
int i;
@@ -337,7 +337,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
v,
tindex,
sindex;
- pg_wchar *res = *result;
+ char32_t *res = *result;
sindex = code - SBASE;
l = LBASE + sindex / (VCOUNT * TCOUNT);
@@ -369,7 +369,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
(!compat && DECOMPOSITION_IS_COMPAT(entry)))
{
- pg_wchar *res = *result;
+ char32_t *res = *result;
res[*current] = code;
(*current)++;
@@ -382,7 +382,7 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
decomp = get_code_decomposition(entry, &dec_size);
for (i = 0; i < dec_size; i++)
{
- pg_wchar lcode = (pg_wchar) decomp[i];
+ char32_t lcode = (char32_t) decomp[i];
/* Leave if no more decompositions */
decompose_code(lcode, compat, result, current);
@@ -398,17 +398,17 @@ decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
* malloc. Or NULL if we run out of memory. In backend, the returned
* string is palloc'd instead, and OOM is reported with ereport().
*/
-pg_wchar *
-unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
+char32_t *
+unicode_normalize(UnicodeNormalizationForm form, const char32_t *input)
{
bool compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
bool recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
- pg_wchar *decomp_chars;
- pg_wchar *recomp_chars;
+ char32_t *decomp_chars;
+ char32_t *recomp_chars;
int decomp_size,
current_size;
int count;
- const pg_wchar *p;
+ const char32_t *p;
/* variables for recomposition */
int last_class;
@@ -425,7 +425,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
for (p = input; *p; p++)
decomp_size += get_decomposed_size(*p, compat);
- decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
+ decomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
if (decomp_chars == NULL)
return NULL;
@@ -448,9 +448,9 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
*/
for (count = 1; count < decomp_size; count++)
{
- pg_wchar prev = decomp_chars[count - 1];
- pg_wchar next = decomp_chars[count];
- pg_wchar tmp;
+ char32_t prev = decomp_chars[count - 1];
+ char32_t next = decomp_chars[count];
+ char32_t tmp;
const uint8 prevClass = get_canonical_class(prev);
const uint8 nextClass = get_canonical_class(next);
@@ -487,7 +487,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
* longer than the decomposed one, so make the allocation of the output
* string based on that assumption.
*/
- recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
+ recomp_chars = (char32_t *) ALLOC((decomp_size + 1) * sizeof(char32_t));
if (!recomp_chars)
{
FREE(decomp_chars);
@@ -501,9 +501,9 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
for (count = 1; count < decomp_size; count++)
{
- pg_wchar ch = decomp_chars[count];
+ char32_t ch = decomp_chars[count];
int ch_class = get_canonical_class(ch);
- pg_wchar composite;
+ char32_t composite;
if (last_class < ch_class &&
recompose_code(starter_ch, ch, &composite))
@@ -524,7 +524,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
recomp_chars[target_pos++] = ch;
}
}
- recomp_chars[target_pos] = (pg_wchar) '\0';
+ recomp_chars[target_pos] = (char32_t) '\0';
FREE(decomp_chars);
@@ -540,7 +540,7 @@ unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
#ifndef FRONTEND
static const pg_unicode_normprops *
-qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
+qc_hash_lookup(char32_t ch, const pg_unicode_norminfo *norminfo)
{
int h;
uint32 hashkey;
@@ -571,7 +571,7 @@ qc_hash_lookup(pg_wchar ch, const pg_unicode_norminfo *norminfo)
* Look up the normalization quick check character property
*/
static UnicodeNormalizationQC
-qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
+qc_is_allowed(UnicodeNormalizationForm form, char32_t ch)
{
const pg_unicode_normprops *found = NULL;
@@ -595,7 +595,7 @@ qc_is_allowed(UnicodeNormalizationForm form, pg_wchar ch)
}
UnicodeNormalizationQC
-unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input)
+unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const char32_t *input)
{
uint8 lastCanonicalClass = 0;
UnicodeNormalizationQC result = UNICODE_NORM_QC_YES;
@@ -610,9 +610,9 @@ unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *
if (form == UNICODE_NFD || form == UNICODE_NFKD)
return UNICODE_NORM_QC_MAYBE;
- for (const pg_wchar *p = input; *p; p++)
+ for (const char32_t *p = input; *p; p++)
{
- pg_wchar ch = *p;
+ char32_t ch = *p;
uint8 canonicalClass;
UnicodeNormalizationQC check;
diff --git a/src/fe_utils/mbprint.c b/src/fe_utils/mbprint.c
index eb3eeee9925..abffdbe18a2 100644
--- a/src/fe_utils/mbprint.c
+++ b/src/fe_utils/mbprint.c
@@ -49,20 +49,20 @@ pg_get_utf8_id(void)
*
* No error checks here, c must point to a long-enough string.
*/
-static pg_wchar
+static char32_t
utf8_to_unicode(const unsigned char *c)
{
if ((*c & 0x80) == 0)
- return (pg_wchar) c[0];
+ return (char32_t) c[0];
else if ((*c & 0xe0) == 0xc0)
- return (pg_wchar) (((c[0] & 0x1f) << 6) |
+ return (char32_t) (((c[0] & 0x1f) << 6) |
(c[1] & 0x3f));
else if ((*c & 0xf0) == 0xe0)
- return (pg_wchar) (((c[0] & 0x0f) << 12) |
+ return (char32_t) (((c[0] & 0x0f) << 12) |
((c[1] & 0x3f) << 6) |
(c[2] & 0x3f));
else if ((*c & 0xf8) == 0xf0)
- return (pg_wchar) (((c[0] & 0x07) << 18) |
+ return (char32_t) (((c[0] & 0x07) << 18) |
((c[1] & 0x3f) << 12) |
((c[2] & 0x3f) << 6) |
(c[3] & 0x3f));
diff --git a/src/include/c.h b/src/include/c.h
index f4ec33e9b07..757dfff4782 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -1376,6 +1376,29 @@ typedef intptr_t sigjmp_buf[5];
/* /port compatibility functions */
#include "port.h"
+/*
+ * char16_t and char32_t
+ * Unicode code points.
+ *
+ * uchar.h should always be available in C11, but it's not available on
+ * Mac. However, these types are keywords in C++11, so when using C++, we
+ * can't redefine the types.
+ *
+ * XXX: when uchar.h is available everywhere, we can remove this check and
+ * just include uchar.h unconditionally.
+ *
+ * XXX: this section is out of place because uchar.h needs to be included
+ * after port.h, due to an interaction with win32_port.h in some cases.
+ */
+#ifdef HAVE_UCHAR_H
+#include <uchar.h>
+#else
+#ifndef __cplusplus
+typedef uint16_t char16_t;
+typedef uint32_t char32_t;
+#endif
+#endif
+
/* IWYU pragma: end_exports */
#endif /* C_H */
diff --git a/src/include/common/unicode_case.h b/src/include/common/unicode_case.h
index 41e2c1f4b33..6bcffd349c2 100644
--- a/src/include/common/unicode_case.h
+++ b/src/include/common/unicode_case.h
@@ -14,14 +14,12 @@
#ifndef UNICODE_CASE_H
#define UNICODE_CASE_H
-#include "mb/pg_wchar.h"
-
typedef size_t (*WordBoundaryNext) (void *wbstate);
-pg_wchar unicode_lowercase_simple(pg_wchar code);
-pg_wchar unicode_titlecase_simple(pg_wchar code);
-pg_wchar unicode_uppercase_simple(pg_wchar code);
-pg_wchar unicode_casefold_simple(pg_wchar code);
+char32_t unicode_lowercase_simple(char32_t code);
+char32_t unicode_titlecase_simple(char32_t code);
+char32_t unicode_uppercase_simple(char32_t code);
+char32_t unicode_casefold_simple(char32_t code);
size_t unicode_strlower(char *dst, size_t dstsize, const char *src,
ssize_t srclen, bool full);
size_t unicode_strtitle(char *dst, size_t dstsize, const char *src,
diff --git a/src/include/common/unicode_case_table.h b/src/include/common/unicode_case_table.h
index d5311786582..0a14fb2d97b 100644
--- a/src/include/common/unicode_case_table.h
+++ b/src/include/common/unicode_case_table.h
@@ -18,7 +18,6 @@
*/
#include "common/unicode_case.h"
-#include "mb/pg_wchar.h"
/*
* The maximum number of codepoints that can result from case mapping
@@ -45,7 +44,7 @@ typedef enum
typedef struct
{
int16 conditions;
- pg_wchar map[NCaseKind][MAX_CASE_EXPANSION];
+ char32_t map[NCaseKind][MAX_CASE_EXPANSION];
} pg_special_case;
/*
@@ -166,7 +165,7 @@ static const pg_special_case special_case[106] =
* The entry case_map_lower[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_lower[1704] =
+static const char32_t case_map_lower[1704] =
{
0x000000, /* reserved */
0x000000, /* U+000000 */
@@ -1879,7 +1878,7 @@ static const pg_wchar case_map_lower[1704] =
* The entry case_map_title[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_title[1704] =
+static const char32_t case_map_title[1704] =
{
0x000000, /* reserved */
0x000000, /* U+000000 */
@@ -3592,7 +3591,7 @@ static const pg_wchar case_map_title[1704] =
* The entry case_map_upper[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_upper[1704] =
+static const char32_t case_map_upper[1704] =
{
0x000000, /* reserved */
0x000000, /* U+000000 */
@@ -5305,7 +5304,7 @@ static const pg_wchar case_map_upper[1704] =
* The entry case_map_fold[case_index(codepoint)] is the mapping for the
* given codepoint.
*/
-static const pg_wchar case_map_fold[1704] =
+static const char32_t case_map_fold[1704] =
{
0x000000, /* reserved */
0x000000, /* U+000000 */
@@ -13522,7 +13521,7 @@ static const uint16 case_map[4778] =
* the offset into the mapping tables.
*/
static inline uint16
-case_index(pg_wchar cp)
+case_index(char32_t cp)
{
/* Fast path for codepoints < 0x0588 */
if (cp < 0x0588)
diff --git a/src/include/common/unicode_category.h b/src/include/common/unicode_category.h
index 8fd8b67a416..684143d3c8a 100644
--- a/src/include/common/unicode_category.h
+++ b/src/include/common/unicode_category.h
@@ -14,8 +14,6 @@
#ifndef UNICODE_CATEGORY_H
#define UNICODE_CATEGORY_H
-#include "mb/pg_wchar.h"
-
/*
* Unicode General Category Values
*
@@ -61,31 +59,31 @@ typedef enum pg_unicode_category
PG_U_FINAL_PUNCTUATION = 29 /* Pf */
} pg_unicode_category;
-extern pg_unicode_category unicode_category(pg_wchar code);
+extern pg_unicode_category unicode_category(char32_t code);
extern const char *unicode_category_string(pg_unicode_category category);
extern const char *unicode_category_abbrev(pg_unicode_category category);
-extern bool pg_u_prop_alphabetic(pg_wchar code);
-extern bool pg_u_prop_lowercase(pg_wchar code);
-extern bool pg_u_prop_uppercase(pg_wchar code);
-extern bool pg_u_prop_cased(pg_wchar code);
-extern bool pg_u_prop_case_ignorable(pg_wchar code);
-extern bool pg_u_prop_white_space(pg_wchar code);
-extern bool pg_u_prop_hex_digit(pg_wchar code);
-extern bool pg_u_prop_join_control(pg_wchar code);
+extern bool pg_u_prop_alphabetic(char32_t code);
+extern bool pg_u_prop_lowercase(char32_t code);
+extern bool pg_u_prop_uppercase(char32_t code);
+extern bool pg_u_prop_cased(char32_t code);
+extern bool pg_u_prop_case_ignorable(char32_t code);
+extern bool pg_u_prop_white_space(char32_t code);
+extern bool pg_u_prop_hex_digit(char32_t code);
+extern bool pg_u_prop_join_control(char32_t code);
-extern bool pg_u_isdigit(pg_wchar code, bool posix);
-extern bool pg_u_isalpha(pg_wchar code);
-extern bool pg_u_isalnum(pg_wchar code, bool posix);
-extern bool pg_u_isword(pg_wchar code);
-extern bool pg_u_isupper(pg_wchar code);
-extern bool pg_u_islower(pg_wchar code);
-extern bool pg_u_isblank(pg_wchar code);
-extern bool pg_u_iscntrl(pg_wchar code);
-extern bool pg_u_isgraph(pg_wchar code);
-extern bool pg_u_isprint(pg_wchar code);
-extern bool pg_u_ispunct(pg_wchar code, bool posix);
-extern bool pg_u_isspace(pg_wchar code);
-extern bool pg_u_isxdigit(pg_wchar code, bool posix);
+extern bool pg_u_isdigit(char32_t code, bool posix);
+extern bool pg_u_isalpha(char32_t code);
+extern bool pg_u_isalnum(char32_t code, bool posix);
+extern bool pg_u_isword(char32_t code);
+extern bool pg_u_isupper(char32_t code);
+extern bool pg_u_islower(char32_t code);
+extern bool pg_u_isblank(char32_t code);
+extern bool pg_u_iscntrl(char32_t code);
+extern bool pg_u_isgraph(char32_t code);
+extern bool pg_u_isprint(char32_t code);
+extern bool pg_u_ispunct(char32_t code, bool posix);
+extern bool pg_u_isspace(char32_t code);
+extern bool pg_u_isxdigit(char32_t code, bool posix);
#endif /* UNICODE_CATEGORY_H */
diff --git a/src/include/common/unicode_category_table.h b/src/include/common/unicode_category_table.h
index 95a1c65da7e..466a41b72b0 100644
--- a/src/include/common/unicode_category_table.h
+++ b/src/include/common/unicode_category_table.h
@@ -20,15 +20,15 @@
*/
typedef struct
{
- uint32 first; /* Unicode codepoint */
- uint32 last; /* Unicode codepoint */
+ char32_t first; /* Unicode codepoint */
+ char32_t last; /* Unicode codepoint */
uint8 category; /* General Category */
} pg_category_range;
typedef struct
{
- uint32 first; /* Unicode codepoint */
- uint32 last; /* Unicode codepoint */
+ char32_t first; /* Unicode codepoint */
+ char32_t last; /* Unicode codepoint */
} pg_unicode_range;
typedef struct
diff --git a/src/include/common/unicode_norm.h b/src/include/common/unicode_norm.h
index 5bc3b79e78e..516c192cc4c 100644
--- a/src/include/common/unicode_norm.h
+++ b/src/include/common/unicode_norm.h
@@ -14,8 +14,6 @@
#ifndef UNICODE_NORM_H
#define UNICODE_NORM_H
-#include "mb/pg_wchar.h"
-
typedef enum
{
UNICODE_NFC = 0,
@@ -32,8 +30,8 @@ typedef enum
UNICODE_NORM_QC_MAYBE = -1,
} UnicodeNormalizationQC;
-extern pg_wchar *unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input);
+extern char32_t *unicode_normalize(UnicodeNormalizationForm form, const char32_t *input);
-extern UnicodeNormalizationQC unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const pg_wchar *input);
+extern UnicodeNormalizationQC unicode_is_normalized_quickcheck(UnicodeNormalizationForm form, const char32_t *input);
#endif /* UNICODE_NORM_H */
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 4b4a9974b75..4d84bdc81e4 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -532,25 +532,25 @@ typedef uint32 (*utf_local_conversion_func) (uint32 code);
* Some handy functions for Unicode-specific tests.
*/
static inline bool
-is_valid_unicode_codepoint(pg_wchar c)
+is_valid_unicode_codepoint(char32_t c)
{
return (c > 0 && c <= 0x10FFFF);
}
static inline bool
-is_utf16_surrogate_first(pg_wchar c)
+is_utf16_surrogate_first(char32_t c)
{
return (c >= 0xD800 && c <= 0xDBFF);
}
static inline bool
-is_utf16_surrogate_second(pg_wchar c)
+is_utf16_surrogate_second(char32_t c)
{
return (c >= 0xDC00 && c <= 0xDFFF);
}
-static inline pg_wchar
-surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
+static inline char32_t
+surrogate_pair_to_codepoint(char16_t first, char16_t second)
{
return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
}
@@ -561,20 +561,20 @@ surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
*
* No error checks here, c must point to a long-enough string.
*/
-static inline pg_wchar
+static inline char32_t
utf8_to_unicode(const unsigned char *c)
{
if ((*c & 0x80) == 0)
- return (pg_wchar) c[0];
+ return (char32_t) c[0];
else if ((*c & 0xe0) == 0xc0)
- return (pg_wchar) (((c[0] & 0x1f) << 6) |
+ return (char32_t) (((c[0] & 0x1f) << 6) |
(c[1] & 0x3f));
else if ((*c & 0xf0) == 0xe0)
- return (pg_wchar) (((c[0] & 0x0f) << 12) |
+ return (char32_t) (((c[0] & 0x0f) << 12) |
((c[1] & 0x3f) << 6) |
(c[2] & 0x3f));
else if ((*c & 0xf8) == 0xf0)
- return (pg_wchar) (((c[0] & 0x07) << 18) |
+ return (char32_t) (((c[0] & 0x07) << 18) |
((c[1] & 0x3f) << 12) |
((c[2] & 0x3f) << 6) |
(c[3] & 0x3f));
@@ -588,7 +588,7 @@ utf8_to_unicode(const unsigned char *c)
* unicode_utf8len(c) bytes available.
*/
static inline unsigned char *
-unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
+unicode_to_utf8(char32_t c, unsigned char *utf8string)
{
if (c <= 0x7F)
{
@@ -620,7 +620,7 @@ unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
* Number of bytes needed to represent the given char in UTF8.
*/
static inline int
-unicode_utf8len(pg_wchar c)
+unicode_utf8len(char32_t c)
{
if (c <= 0x7F)
return 1;
@@ -676,8 +676,8 @@ extern int pg_valid_server_encoding(const char *name);
extern bool is_encoding_supported_by_icu(int encoding);
extern const char *get_encoding_name_for_icu(int encoding);
-extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string);
-extern pg_wchar utf8_to_unicode(const unsigned char *c);
+extern unsigned char *unicode_to_utf8(char32_t c, unsigned char *utf8string);
+extern char32_t utf8_to_unicode(const unsigned char *c);
extern bool pg_utf8_islegal(const unsigned char *source, int length);
extern int pg_utf_mblen(const unsigned char *s);
extern int pg_mule_mblen(const unsigned char *s);
@@ -739,8 +739,8 @@ extern char *pg_server_to_client(const char *s, int len);
extern char *pg_any_to_server(const char *s, int len, int encoding);
extern char *pg_server_to_any(const char *s, int len, int encoding);
-extern void pg_unicode_to_server(pg_wchar c, unsigned char *s);
-extern bool pg_unicode_to_server_noerror(pg_wchar c, unsigned char *s);
+extern void pg_unicode_to_server(char32_t c, unsigned char *s);
+extern bool pg_unicode_to_server_noerror(char32_t c, unsigned char *s);
extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc);
extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc);
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 08d7bfbee10..f52f14cc566 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -463,6 +463,9 @@
/* Define to 1 if you have the <termios.h> header file. */
#undef HAVE_TERMIOS_H
+/* Define to 1 if you have the <uchar.h> header file. */
+#undef HAVE_UCHAR_H
+
/* Define to 1 if curl_global_init() is guaranteed to be thread-safe. */
#undef HAVE_THREADSAFE_CURL_GLOBAL_INIT
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index ac2da4c98cf..df88c78fe3a 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -3505,6 +3505,8 @@ cb_cleanup_dir
cb_options
cb_tablespace
cb_tablespace_mapping
+char16_t
+char32_t
check_agg_arguments_context
check_function_callback
check_network_data
--
2.43.0
On Thu, 2025-10-30 at 04:25 +1300, Thomas Munro wrote:
> Here are some sketch-quality patches to try out some of these ideas,
> for discussion. I gave them .txt endings so as not to hijack your
> thread's CI.
I like the direction this is going. I will commit the char32_t work
anyway, so afterward feel free to hijack the thread (there's a lot of
good information here so continuing here might be more productive than
starting a new thread).
Regarding 0002, IIUC, for PG_WCHAR_UTF32, surrogates are forbidden, but
the comment about UTF-16 is a bit vague. I think we should add some
asserts to make it clear.
The basic communication mechanism between the modules is the database
encoding: it determines PgWcharEncodingScheme in both wchar.c and
pg_locale_libc.c. That seems reasonable to me, and doesn't interfere
with the other providers.
I'm still not quite sure how this fits with ICU in a single-byte
encoding, but it doesn't seem worse than what we do currently.
Also, tangentially, I'm a bit anxious to do a permanent
setlocale(LC_CTYPE, "C"), and we are very close. If these two threads
are successful, I believe we can do it:
/messages/by-id/90f176c5b85b9da26a3265b2630ece3552068566.camel@j-davis.com
/messages/by-id/d9657a6e51aa20702447bb2386b32fea6218670f.camel@j-davis.com
That would be a big simplification because it would isolate libc ctype
behavior to pg_locale_libc.c. That would make me feel generally more
comfortable with additional work in this area.
Regards,
Jeff Davis