diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c
index e164866..1a2a24d 100644
--- a/src/backend/utils/mb/wchar.c
+++ b/src/backend/utils/mb/wchar.c
@@ -453,6 +453,34 @@ unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
 	return utf8string;
 }
 
+/*
+ * Decode the single UTF-8 sequence at c (1-4 bytes, chosen from the lead
+ * byte) into a code point; one-character version of pg_utf2wchar_with_len.
+ * No validation: c must hold every byte the lead byte calls for, and
+ * an unrecognized lead byte yields the invalid code point 0xffffffff.
+ */
+pg_wchar
+utf8_to_unicode(const unsigned char *c)
+{
+	if ((*c & 0x80) == 0)
+		return (pg_wchar) c[0];
+	else if ((*c & 0xe0) == 0xc0)
+		return (pg_wchar) (((c[0] & 0x1f) << 6) |
+						   (c[1] & 0x3f));
+	else if ((*c & 0xf0) == 0xe0)
+		return (pg_wchar) (((c[0] & 0x0f) << 12) |
+						   ((c[1] & 0x3f) << 6) |
+						   (c[2] & 0x3f));
+	else if ((*c & 0xf8) == 0xf0)
+		return (pg_wchar) (((c[0] & 0x07) << 18) |
+						   ((c[1] & 0x3f) << 12) |
+						   ((c[2] & 0x3f) << 6) |
+						   (c[3] & 0x3f));
+	else
+		/* no lead-byte pattern matched: return a deliberately invalid code */
+		return 0xffffffff;
+}
+
 
 /*
  * Return the byte length of a UTF8 character pointed to by s
@@ -462,8 +490,8 @@ unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
  * We return "1" for any leading byte that is either flat-out illegal or
  * indicates a length larger than we support.
  *
- * pg_utf2wchar_with_len(), utf2ucs(), pg_utf8_islegal(), and perhaps
- * other places would need to be fixed to change this.
+ * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(),
+ * and perhaps other places would need to be fixed to change this.
  */
 int
 pg_utf_mblen(const unsigned char *s)
@@ -632,36 +660,10 @@ ucs_wcwidth(pg_wchar ucs)
 		  (ucs >= 0x20000 && ucs <= 0x2ffff)));
 }
 
-static pg_wchar
-utf2ucs(const unsigned char *c)
-{
-	/*
-	 * one char version of pg_utf2wchar_with_len. no control here, c must
-	 * point to a large enough string
-	 */
-	if ((*c & 0x80) == 0)
-		return (pg_wchar) c[0];
-	else if ((*c & 0xe0) == 0xc0)
-		return (pg_wchar) (((c[0] & 0x1f) << 6) |
-						   (c[1] & 0x3f));
-	else if ((*c & 0xf0) == 0xe0)
-		return (pg_wchar) (((c[0] & 0x0f) << 12) |
-						   ((c[1] & 0x3f) << 6) |
-						   (c[2] & 0x3f));
-	else if ((*c & 0xf8) == 0xf0)
-		return (pg_wchar) (((c[0] & 0x07) << 18) |
-						   ((c[1] & 0x3f) << 12) |
-						   ((c[2] & 0x3f) << 6) |
-						   (c[3] & 0x3f));
-	else
-		/* that is an invalid code on purpose */
-		return 0xffffffff;
-}
-
 static int
 pg_utf_dsplen(const unsigned char *s)
 {
-	return ucs_wcwidth(utf2ucs(s));
+	return ucs_wcwidth(utf8_to_unicode(s));
 }
 
 /*
diff --git a/src/bin/psql/mbprint.c b/src/bin/psql/mbprint.c
index b1263ea..4c8b1d8 100644
--- a/src/bin/psql/mbprint.c
+++ b/src/bin/psql/mbprint.c
@@ -63,7 +63,7 @@ utf2ucs(const unsigned char *c)
 						   ((c[1] & 0x3f) << 6) |
 						   (c[2] & 0x3f));
 	}
-	else if ((*c & 0xf0) == 0xf0)
+	else if ((*c & 0xf8) == 0xf0)
 	{
 		return (pg_wchar) (((c[0] & 0x07) << 18) |
 						   ((c[1] & 0x3f) << 12) |
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index 817f9aa..ea753a6 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -412,6 +412,7 @@ extern int pg_valid_client_encoding(const char *name);
 extern int pg_valid_server_encoding(const char *name);
 
 extern unsigned char *unicode_to_utf8(pg_wchar c, unsigned char *utf8string);
+extern pg_wchar utf8_to_unicode(const unsigned char *utf8char);
 extern int pg_utf_mblen(const unsigned char *);
 extern unsigned char *pg_do_encoding_conversion(unsigned char *src, int len,
 						  int src_encoding,