Patch for bug #12845 (GB18030 encoding)
Hi,
Can someone look at this patch? It should fix bug #12845.
The current tests for conversions are very minimal. I expanded them a
bit for this bug.
I think the binary search in the .map files should be removed but I
leave that for another patch.
Attachments:
0001-Have-GB18030-handle-more-than-2-byte-Unicode-code-po.patch (application/octet-stream)
From 9564aa018c7eae9d66589b2be4303f8aede94f77 Mon Sep 17 00:00:00 2001
From: Arjen Nienhuis <a.g.nienhuis@gmail.com>
Date: Sun, 3 May 2015 22:28:26 +0200
Subject: [PATCH] Have GB18030 handle more than 2-byte Unicode code points
BUG #12845: The GB18030 encoding doesn't support Unicode characters over 0xFFFF
SELECT convert_to(chr(128512), 'GB18030');
expected result:
convert_to
------------
\x9439fc36
(1 row)
---
.../utf8_and_gb18030/utf8_and_gb18030.c | 279 ++++++++++++++++++++-
src/backend/utils/mb/wchar.c | 2 +-
src/include/mb/pg_wchar.h | 1 +
src/test/regress/expected/conversion.out | 16 +-
src/test/regress/sql/conversion.sql | 4 +-
5 files changed, 287 insertions(+), 15 deletions(-)
diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c
index 4427fea..c645831 100644
--- a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c
+++ b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c
@@ -25,6 +25,16 @@ PG_FUNCTION_INFO_V1(utf8_to_gb18030);
extern Datum gb18030_to_utf8(PG_FUNCTION_ARGS);
extern Datum utf8_to_gb18030(PG_FUNCTION_ARGS);
+static uint32 utf8_to_gb18030_hi(uint32 utf8);
+static uint32 gb18030_to_utf8_hi(uint32 gb);
+static int compare1(const void *p1, const void *p2);
+static int compare2(const void *p1, const void *p2);
+
+/* All Unicode codepoints over U+FFFF are mapped to one range in the GB18030 encoding */
+static const uint32 UTF32_FIRST = 0x10000;
+static const uint32 GB18030_FIRST = 0x90308130;
+static const uint32 GB18030_LAST = 0xe3329a35;
+
/* ----------
* conv_proc(
* INTEGER, -- source encoding id
@@ -41,11 +51,84 @@ gb18030_to_utf8(PG_FUNCTION_ARGS)
unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
int len = PG_GETARG_INT32(4);
+ unsigned int iiso;
+ unsigned int outf;
+ int l;
+ pg_local_to_utf *p;
CHECK_ENCODING_CONVERSION_ARGS(PG_GB18030, PG_UTF8);
- LocalToUtf(src, dest, LUmapGB18030, NULL,
- sizeof(LUmapGB18030) / sizeof(pg_local_to_utf), 0, PG_GB18030, len);
+ for (; len > 0; len -= l)
+ {
+ /* "break" cases all represent errors */
+ if (*src == '\0')
+ break;
+
+ if (!IS_HIGHBIT_SET(*src))
+ {
+ /* ASCII case is easy */
+ *dest++ = *src++;
+ l = 1;
+ continue;
+ }
+
+ l = pg_gb18030_verifier(src, len);
+ if (l < 0)
+ break;
+
+ if (l == 2)
+ {
+ iiso = *src++ << 8;
+ iiso |= *src++;
+ }
+ else if (l == 4)
+ {
+ iiso = *src++ << 24;
+ iiso |= *src++ << 16;
+ iiso |= *src++ << 8;
+ iiso |= *src++;
+
+ if (iiso >= GB18030_FIRST && iiso <= GB18030_LAST)
+ {
+ outf = gb18030_to_utf8_hi(iiso);
+ *dest++ = outf >> 24;
+ *dest++ = (outf & 0x00ff0000) >> 16;
+ *dest++ = (outf & 0x0000ff00) >> 8;
+ *dest++ = outf & 0x000000ff;
+ continue;
+ }
+ }
+ else
+ {
+ elog(ERROR, "unsupported character length %d", l);
+ iiso = 0; /* keep compiler quiet */
+ }
+
+ p = bsearch(&iiso, LUmapGB18030, sizeof(LUmapGB18030) / sizeof(pg_local_to_utf),
+ sizeof(pg_local_to_utf), compare2);
+
+ if (p == NULL)
+ {
+ report_untranslatable_char(PG_GB18030, PG_UTF8,
+ (const char *) (src - l), len);
+ }
+ else
+ {
+ if (p->utf & 0xff000000)
+ *dest++ = p->utf >> 24;
+ if (p->utf & 0x00ff0000)
+ *dest++ = (p->utf & 0x00ff0000) >> 16;
+ if (p->utf & 0x0000ff00)
+ *dest++ = (p->utf & 0x0000ff00) >> 8;
+ if (p->utf & 0x000000ff)
+ *dest++ = p->utf & 0x000000ff;
+ }
+ }
+
+ if (len > 0)
+ report_invalid_encoding(PG_GB18030, (const char *) src, len);
+
+ *dest = '\0';
PG_RETURN_VOID();
}
@@ -56,11 +139,199 @@ utf8_to_gb18030(PG_FUNCTION_ARGS)
unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
int len = PG_GETARG_INT32(4);
+ uint32 iutf;
+ uint32 code;
+ pg_utf_to_local *p;
+ int l;
CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_GB18030);
- UtfToLocal(src, dest, ULmapGB18030, NULL,
- sizeof(ULmapGB18030) / sizeof(pg_utf_to_local), 0, PG_GB18030, len);
+ for (; len > 0; len -= l)
+ {
+ /* "break" cases all represent errors */
+ if (*src == '\0')
+ break;
+
+ l = pg_utf_mblen(src);
+
+ if (len < l)
+ break;
+
+ if (!pg_utf8_islegal(src, l))
+ break;
+
+ if (l == 1)
+ {
+ /* ASCII case is easy */
+ *dest++ = *src++;
+ continue;
+ }
+ else if (l == 2)
+ {
+ iutf = *src++ << 8;
+ iutf |= *src++;
+ }
+ else if (l == 3)
+ {
+ iutf = *src++ << 16;
+ iutf |= *src++ << 8;
+ iutf |= *src++;
+ }
+ else if (l == 4)
+ {
+ iutf = *src++ << 24;
+ iutf |= *src++ << 16;
+ iutf |= *src++ << 8;
+ iutf |= *src++;
+ /* 4 byte codes all map to the linear range */
+ code = utf8_to_gb18030_hi(iutf);
+ *dest++ = code >> 24;
+ *dest++ = (code & 0x00ff0000) >> 16;
+ *dest++ = (code & 0x0000ff00) >> 8;
+ *dest++ = code & 0x000000ff;
+ continue;
+ }
+ else
+ {
+ elog(ERROR, "unsupported character length %d", l);
+ iutf = 0; /* keep compiler quiet */
+ }
+
+ p = bsearch(&iutf, ULmapGB18030, sizeof(ULmapGB18030) / sizeof(pg_utf_to_local),
+ sizeof(pg_utf_to_local), compare1);
+ if (p == NULL)
+ report_untranslatable_char(PG_UTF8, PG_GB18030,
+ (const char *) (src - l), len);
+ code = p->code;
+ /* GB18030 is always 1, 2 or 4 bytes. 1 byte is handled above */
+ if (code & 0xffff0000)
+ {
+ *dest++ = code >> 24;
+ *dest++ = (code & 0x00ff0000) >> 16;
+ }
+ *dest++ = (code & 0x0000ff00) >> 8;
+ *dest++ = code & 0x000000ff;
+ }
+
+ if (len > 0)
+ report_invalid_encoding(PG_UTF8, (const char *) src, len);
+
+ *dest = '\0';
PG_RETURN_VOID();
}
+
+/*
+ * comparison routine for bsearch()
+ * this routine is intended for UTF8 -> local code
+ */
+static int
+compare1(const void *p1, const void *p2)
+{
+ uint32 v1,
+ v2;
+
+ v1 = *(const uint32 *) p1;
+ v2 = ((const pg_utf_to_local *) p2)->utf;
+ return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
+}
+
+/*
+ * comparison routine for bsearch()
+ * this routine is intended for local code -> UTF8
+ */
+static int
+compare2(const void *p1, const void *p2)
+{
+ uint32 v1,
+ v2;
+
+ v1 = *(const uint32 *) p1;
+ v2 = ((const pg_local_to_utf *) p2)->code;
+ return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
+}
+
+/*
+ * Convert UTF32 to UTF-8
+ * Works only for >= U+10000
+ */
+static uint32
+utf32_to_utf8_hi(uint32 utf32) {
+ uint32 b1 = (utf32 >> 18) | 0xF0;
+ uint32 b2 = ((utf32 >> 12) & 0x3F) | 0x80;
+ uint32 b3 = ((utf32 >> 6) & 0x3F) | 0x80;
+ uint32 b4 = (utf32 & 0x3F) | 0x80;
+ return (b1 << 24) | (b2 << 16) | (b3 << 8) | (b4 << 0);
+}
+
+/*
+ * Convert UTF-8 to UTF32
+ * Works only for >= U+10000
+ */
+static uint32
+utf8_to_utf32_hi(uint32 utf8) {
+ /* assert(utf8 > 0xffffff); */
+ uint32 b1 = (utf8 & 0x07000000) >> 6;
+ uint32 b2 = (utf8 & 0x003f0000) >> 4;
+ uint32 b3 = (utf8 & 0x00003f00) >> 2;
+ uint32 b4 = (utf8 & 0x0000003f) >> 0;
+ return b1 | b2 | b3 | b4;
+}
+
+static uint32
+gb_linear(uint32 gb) {
+ uint32 b0 = (gb & 0xff000000) >> 24;
+ uint32 b1 = (gb & 0x00ff0000) >> 16;
+ uint32 b2 = (gb & 0x0000ff00) >> 8;
+ uint32 b3 = (gb & 0x000000ff) >> 0;
+ return b0 * 12600 + b1 * 1260 + b2 * 10 + b3;
+}
+
+static uint32
+gb_unlinear(uint32 lin) {
+ uint32 zlin = lin - gb_linear(0x81308130);
+ uint32 r3 = 0x30 + zlin % 10;
+ uint32 r2 = 0x81 + (zlin / 10) % 126;
+ uint32 r1 = 0x30 + (zlin / 1260) % 10;
+ uint32 r0 = 0x81 + zlin / 12600;
+ return (r0 << 24) | (r1 << 16) | (r2 << 8) | (r3 << 0);
+}
+
+/*
+ * Convert GB18030 to UTF32
+ * Works only for >= U+10000
+ */
+static uint32
+gb_to_utf32_hi(uint32 gb)
+{
+ return UTF32_FIRST + (gb_linear(gb) - gb_linear(GB18030_FIRST));
+}
+
+/*
+ * Convert UTF32 to GB18030
+ * Works only for >= U+10000
+ */
+static uint32
+utf32_to_gb18030_hi(uint32 utf32) {
+ return gb_unlinear(gb_linear(GB18030_FIRST) + utf32 - UTF32_FIRST);
+}
+
+/*
+ * Convert UTF-8 to GB18030
+ * Works only for >= U+10000
+ */
+static uint32
+utf8_to_gb18030_hi(uint32 utf8) {
+ uint32 utf32 = utf8_to_utf32_hi(utf8);
+ return utf32_to_gb18030_hi(utf32);
+}
+
+/*
+ * Convert GB18030 to UTF-8
+ * Works only for >= U+10000
+ */
+static uint32
+gb18030_to_utf8_hi(uint32 gb) {
+ uint32 utf32 = gb_to_utf32_hi(gb);
+ return utf32_to_utf8_hi(utf32);
+}
diff --git a/src/backend/utils/mb/wchar.c b/src/backend/utils/mb/wchar.c
index 0cc753e..f19a19c 100644
--- a/src/backend/utils/mb/wchar.c
+++ b/src/backend/utils/mb/wchar.c
@@ -1400,7 +1400,7 @@ pg_uhc_verifier(const unsigned char *s, int len)
return mbl;
}
-static int
+int
pg_gb18030_verifier(const unsigned char *s, int len)
{
int l,
diff --git a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h
index f7222fc..ce757b9 100644
--- a/src/include/mb/pg_wchar.h
+++ b/src/include/mb/pg_wchar.h
@@ -531,6 +531,7 @@ extern void mic2latin_with_table(const unsigned char *mic, unsigned char *p,
int len, int lc, int encoding,
const unsigned char *tab);
+extern int pg_gb18030_verifier(const unsigned char *s, int len);
extern bool pg_utf8_islegal(const unsigned char *source, int length);
#ifdef WIN32
diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
index 82eca26..13f1cf3 100644
--- a/src/test/regress/expected/conversion.out
+++ b/src/test/regress/expected/conversion.out
@@ -523,17 +523,17 @@ SELECT CONVERT('foo', 'UTF8', 'EUC_TW');
(1 row)
-- GB18030 --> UTF8
-SELECT CONVERT('foo', 'GB18030', 'UTF8');
- convert
----------
- foo
+SELECT CONVERT('Postgres \247\343\247\335\247\340\247\337 \2249\3138 \317\363 \250\246le\2010\2747phant', 'GB18030', 'UTF8');
+ convert
+-------------------------------------------------------------------------------------------------
+ Postgres \321\201\320\273\320\276\320\275 \360\237\220\230 \350\261\241 \303\251le\314\201phant
(1 row)
-- UTF8 --> GB18030
-SELECT CONVERT('foo', 'UTF8', 'GB18030');
- convert
----------
- foo
+SELECT CONVERT('Postgres \321\201\320\273\320\276\320\275 \360\237\220\230 \350\261\241 \303\251le\314\201phant', 'UTF-8', 'GB18030');
+ convert
+-----------------------------------------------------------------------------------------
+ Postgres \247\343\247\335\247\340\247\337 \2249\3138 \317\363 \250\246le\2010\2747phant
(1 row)
-- GBK --> UTF8
diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
index be194ee..e27f06f 100644
--- a/src/test/regress/sql/conversion.sql
+++ b/src/test/regress/sql/conversion.sql
@@ -171,9 +171,9 @@ SELECT CONVERT('foo', 'EUC_TW', 'UTF8');
-- UTF8 --> EUC_TW
SELECT CONVERT('foo', 'UTF8', 'EUC_TW');
-- GB18030 --> UTF8
-SELECT CONVERT('foo', 'GB18030', 'UTF8');
+SELECT CONVERT('Postgres \247\343\247\335\247\340\247\337 \2249\3138 \317\363 \250\246le\2010\2747phant', 'GB18030', 'UTF8');
-- UTF8 --> GB18030
-SELECT CONVERT('foo', 'UTF8', 'GB18030');
+SELECT CONVERT('Postgres \321\201\320\273\320\276\320\275 \360\237\220\230 \350\261\241 \303\251le\314\201phant', 'UTF-8', 'GB18030');
-- GBK --> UTF8
SELECT CONVERT('foo', 'GBK', 'UTF8');
-- UTF8 --> GBK
--
2.1.0
On Tue, May 5, 2015 at 9:04 AM, Arjen Nienhuis <a.g.nienhuis@gmail.com> wrote:
Can someone look at this patch. It should fix bug #12845.
The current tests for conversions are very minimal. I expanded them a
bit for this bug. I think the binary search in the .map files should be removed but I
leave that for another patch.
Please add this patch to
https://commitfest.postgresql.org/action/commitfest_view/open so we
don't forget about it.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
Robert Haas wrote:
On Tue, May 5, 2015 at 9:04 AM, Arjen Nienhuis <a.g.nienhuis@gmail.com> wrote:
Can someone look at this patch. It should fix bug #12845.
The current tests for conversions are very minimal. I expanded them a
bit for this bug. I think the binary search in the .map files should be
removed but I leave that for another patch.

Please add this patch to
https://commitfest.postgresql.org/action/commitfest_view/open so we
don't forget about it.
If we think this is a bug fix, we should add it to the open items list,
https://wiki.postgresql.org/wiki/Open_Items
--
Álvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
On Wed, May 6, 2015 at 10:55 AM, Alvaro Herrera
<alvherre@2ndquadrant.com> wrote:
Robert Haas wrote:
On Tue, May 5, 2015 at 9:04 AM, Arjen Nienhuis <a.g.nienhuis@gmail.com> wrote:
Can someone look at this patch. It should fix bug #12845.
The current tests for conversions are very minimal. I expanded them a
bit for this bug. I think the binary search in the .map files should be
removed but I leave that for another patch.

Please add this patch to
https://commitfest.postgresql.org/action/commitfest_view/open so we
don't forget about it.

If we think this is a bug fix, we should add it to the open items list,
https://wiki.postgresql.org/wiki/Open_Items
It's a behavior change, so I don't think we would consider a back-patch.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
Robert Haas wrote:
On Wed, May 6, 2015 at 10:55 AM, Alvaro Herrera
<alvherre@2ndquadrant.com> wrote:

Robert Haas wrote:
On Tue, May 5, 2015 at 9:04 AM, Arjen Nienhuis <a.g.nienhuis@gmail.com> wrote:
Can someone look at this patch. It should fix bug #12845.
The current tests for conversions are very minimal. I expanded them a
bit for this bug. I think the binary search in the .map files should be
removed but I leave that for another patch.

Please add this patch to
https://commitfest.postgresql.org/action/commitfest_view/open so we
don't forget about it.

If we think this is a bug fix, we should add it to the open items list,
https://wiki.postgresql.org/wiki/Open_Items

It's a behavior change, so I don't think we would consider a back-patch.

Maybe not, but at the very least we should consider getting it fixed in
9.5 rather than waiting a full development cycle. Same as in
/messages/by-id/20150428131549.GA25925@momjian.us
I'm not saying we MUST include it in 9.5, but we should at least
consider it. If we simply stash it in the open CF we guarantee that it
will linger there for a year.
--
Álvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
On Wed, May 6, 2015 at 11:13 AM, Alvaro Herrera
<alvherre@2ndquadrant.com> wrote:
It's a behavior change, so I don't think we would consider a back-patch.
Maybe not, but at the very least we should consider getting it fixed in
9.5 rather than waiting a full development cycle. Same as in
/messages/by-id/20150428131549.GA25925@momjian.us
I'm not saying we MUST include it in 9.5, but we should at least
consider it. If we simply stash it in the open CF we guarantee that it
will linger there for a year.
Sure, if somebody has the time to put into it now, I'm fine with that.
I'm afraid it won't be me, though: even if I had the time, I don't
know enough about encodings.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
Robert Haas <robertmhaas@gmail.com> writes:
On Wed, May 6, 2015 at 11:13 AM, Alvaro Herrera
<alvherre@2ndquadrant.com> wrote:
Maybe not, but at the very least we should consider getting it fixed in
9.5 rather than waiting a full development cycle. Same as in
/messages/by-id/20150428131549.GA25925@momjian.us
I'm not saying we MUST include it in 9.5, but we should at least
consider it. If we simply stash it in the open CF we guarantee that it
will linger there for a year.
Sure, if somebody has the time to put into it now, I'm fine with that.
I'm afraid it won't be me, though: even if I had the time, I don't
know enough about encodings.
I concur that we should at least consider this patch for 9.5. I've
added it to
https://wiki.postgresql.org/wiki/PostgreSQL_9.5_Open_Items
I'm willing to look at it myself, whenever my non-copious spare time
permits; but that won't be in the immediate future.
regards, tom lane
I wrote:
Robert Haas <robertmhaas@gmail.com> writes:
On Wed, May 6, 2015 at 11:13 AM, Alvaro Herrera
<alvherre@2ndquadrant.com> wrote:
Maybe not, but at the very least we should consider getting it fixed in
9.5 rather than waiting a full development cycle. Same as in
/messages/by-id/20150428131549.GA25925@momjian.us
I'm not saying we MUST include it in 9.5, but we should at least
consider it. If we simply stash it in the open CF we guarantee that it
will linger there for a year.
Sure, if somebody has the time to put into it now, I'm fine with that.
I'm afraid it won't be me, though: even if I had the time, I don't
know enough about encodings.
I concur that we should at least consider this patch for 9.5. I've
added it to
https://wiki.postgresql.org/wiki/PostgreSQL_9.5_Open_Items
I looked at this patch a bit, and read up on GB18030 (thank you
wikipedia). I concur we have a problem to fix. I do not like the way
this patch went about it though, ie copying-and-pasting LocalToUtf and
UtfToLocal and their supporting routines into utf8_and_gb18030.c.
Aside from being duplicative, this means the improved mapping capability
isn't available to use with anything except GB18030. (I do not know
whether there are any linear mapping ranges in other encodings, but
seeing that the Unicode crowd went to the trouble of defining a notation
for it in http://www.unicode.org/reports/tr22/, I'm betting there are.)
What I think would be a better solution, if slightly more invasive,
is to extend LocalToUtf and UtfToLocal to add a callback function
argument for a function of signature "uint32 translate(uint32)".
This function, if provided, would be called after failing to find a
mapping in the mapping table(s), and it could implement any translation
that would be better handled by code than as a boatload of mapping-table
entries. If it returns zero then it doesn't know a translation either,
so throw error as before.
An alternative definition that could be proposed would be to call the
function before consulting the mapping tables, not after, on the grounds
that the function can probably exit cheaply if the input's not in a range
that it cares about. However, consulting the mapping table first wins
if you have ranges that mostly work but contain a few exceptions: put
the exceptions in the mapping table and then the function need not worry
about handling them.
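(For illustration, a rough sketch of what such a table-first lookup with a
fallback callback could look like; the type and function names here are
invented for the example and reuse pg_utf_to_local and compare1 from the
patch above, not necessarily the actual API:)

	/* hypothetical callback type: returns 0 if no algorithmic mapping exists */
	typedef uint32 (*utf_local_conversion_func) (uint32 code);

	/* sketch: consult the mapping table first, then the algorithmic fallback */
	static uint32
	convert_code(uint32 code, const pg_utf_to_local *map, int mapsize,
				 utf_local_conversion_func conv_func)
	{
		const pg_utf_to_local *p;

		p = bsearch(&code, map, mapsize,
					sizeof(pg_utf_to_local), compare1);
		if (p != NULL)
			return p->code;		/* exception listed in the table */
		if (conv_func != NULL)
			return (*conv_func) (code);	/* e.g. the GB18030 linear ranges */
		return 0;				/* caller reports the character as untranslatable */
	}
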
Another alternative approach would be to try to define linear mapping
ranges in a tabular fashion, for more consistency with what's there now.
But that probably wouldn't work terribly well because the bytewise
character representations used in this logic have to be converted into
code points before you can do any sort of linear mapping. We could
hard-wire that conversion for UTF8, but the conversion in the other code
space would be encoding-specific. So we might as well just treat the
whole linear mapping behavior as a black box function for each encoding.
I'm also discounting the possibility that someone would want an
algorithmic mapping for cases involving "combined" codes (ie pairs of
UTF8 characters). Of the encodings we support, only EUC_JIS_2004 and
SHIFT_JIS_2004 need such cases at all, and those have only a handful of
cases; so it doesn't seem popular enough to justify the extra complexity.
I also notice that pg_gb18030_verifier isn't even close to strict enough;
it basically relies on pg_gb18030_mblen which contains no checks
whatsoever on the third and fourth bytes. So that needs to be fixed.
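(For reference, a stricter check of the four-byte case could look roughly
like this; a sketch only, with the byte ranges taken from the GB18030
definition of four-byte sequences:)

	/* sketch: validate one GB18030 4-byte sequence */
	static bool
	gb18030_4byte_valid(const unsigned char *s)
	{
		return (s[0] >= 0x81 && s[0] <= 0xfe &&
				s[1] >= 0x30 && s[1] <= 0x39 &&
				s[2] >= 0x81 && s[2] <= 0xfe &&
				s[3] >= 0x30 && s[3] <= 0x39);
	}
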
The verification tightening would definitely not be something to
back-patch, and I'm inclined to think that the additional mapping
capability shouldn't be either, in view of the facts that (a) we've
had few if any field complaints yet, and (b) changing the signatures
of LocalToUtf/UtfToLocal might possibly break third-party code.
So I'm seeing this as a HEAD-only patch, but I do want to try to
squeeze it into 9.5 rather than wait another year.
Barring objections, I'll go make this happen.
regards, tom lane
On Thu, May 14, 2015 at 11:04 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:
I wrote:
Robert Haas <robertmhaas@gmail.com> writes:
On Wed, May 6, 2015 at 11:13 AM, Alvaro Herrera
<alvherre@2ndquadrant.com> wrote:
Maybe not, but at the very least we should consider getting it fixed in
9.5 rather than waiting a full development cycle. Same as in
/messages/by-id/20150428131549.GA25925@momjian.us
I'm not saying we MUST include it in 9.5, but we should at least
consider it. If we simply stash it in the open CF we guarantee that it
will linger there for a year.

Sure, if somebody has the time to put into it now, I'm fine with that.
I'm afraid it won't be me, though: even if I had the time, I don't
know enough about encodings.

I concur that we should at least consider this patch for 9.5. I've
added it to
https://wiki.postgresql.org/wiki/PostgreSQL_9.5_Open_Items

I looked at this patch a bit, and read up on GB18030 (thank you
wikipedia). I concur we have a problem to fix. I do not like the way
this patch went about it though, ie copying-and-pasting LocalToUtf and
UtfToLocal and their supporting routines into utf8_and_gb18030.c.
Aside from being duplicative, this means the improved mapping capability
isn't available to use with anything except GB18030. (I do not know
whether there are any linear mapping ranges in other encodings, but
seeing that the Unicode crowd went to the trouble of defining a notation
for it in http://www.unicode.org/reports/tr22/, I'm betting there are.)

What I think would be a better solution, if slightly more invasive,
is to extend LocalToUtf and UtfToLocal to add a callback function
argument for a function of signature "uint32 translate(uint32)".
This function, if provided, would be called after failing to find a
mapping in the mapping table(s), and it could implement any translation
that would be better handled by code than as a boatload of mapping-table
entries. If it returns zero then it doesn't know a translation either,
so throw error as before.

An alternative definition that could be proposed would be to call the
function before consulting the mapping tables, not after, on the grounds
that the function can probably exit cheaply if the input's not in a range
that it cares about. However, consulting the mapping table first wins
if you have ranges that mostly work but contain a few exceptions: put
the exceptions in the mapping table and then the function need not worry
about handling them.

Another alternative approach would be to try to define linear mapping
ranges in a tabular fashion, for more consistency with what's there now.
But that probably wouldn't work terribly well because the bytewise
character representations used in this logic have to be converted into
code points before you can do any sort of linear mapping. We could
hard-wire that conversion for UTF8, but the conversion in the other code
space would be encoding-specific. So we might as well just treat the
whole linear mapping behavior as a black box function for each encoding.

I'm also discounting the possibility that someone would want an
algorithmic mapping for cases involving "combined" codes (ie pairs of
UTF8 characters). Of the encodings we support, only EUC_JIS_2004 and
SHIFT_JIS_2004 need such cases at all, and those have only a handful of
cases; so it doesn't seem popular enough to justify the extra complexity.

I also notice that pg_gb18030_verifier isn't even close to strict enough;
it basically relies on pg_gb18030_mblen which contains no checks
whatsoever on the third and fourth bytes. So that needs to be fixed.

The verification tightening would definitely not be something to
back-patch, and I'm inclined to think that the additional mapping
capability shouldn't be either, in view of the facts that (a) we've
had few if any field complaints yet, and (b) changing the signatures
of LocalToUtf/UtfToLocal might possibly break third-party code.
So I'm seeing this as a HEAD-only patch, but I do want to try to
squeeze it into 9.5 rather than wait another year.

Barring objections, I'll go make this happen.

GB18030 is a special case, because it's a full mapping of all unicode
characters, and most of it is algorithmically defined. This makes
UtfToLocal a bad choice to implement it. UtfToLocal assumes a sparse
array with only the defined characters. It uses binary search to find
a character. The 2 tables it uses now are huge (the .so file is 1MB).
Adding the rest of the valid characters to this scheme is possible,
but would make the problem worse.
I think fixing UtfToLocal only for the new characters is not optimal.
I think the best solution is to get rid of UtfToLocal for GB18030. Use
a specialized algorithm:
- For characters > U+FFFF use the algorithm from my patch
- For characters <= U+FFFF use special mapping tables to map from/to
UTF32. Those tables would be smaller, and the code would be faster (I
assume).
For example (256 KB):
unsigned int utf32_to_gb18030[65536] = {
/* 0x0 */ 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
/* 0x8 */ 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf,
--
/* 0xdb08 */ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
--
/* 0xfff0 */ 0x8431a334, 0x8431a335, 0x8431a336, 0x8431a337,
0x8431a338, 0x8431a339, 0x8431a430, 0x8431a431,
/* 0xfff8 */ 0x8431a432, 0x8431a433, 0x8431a434, 0x8431a435,
0x8431a436, 0x8431a437, 0x8431a438, 0x8431a439
};
Instead of (500KB):
static pg_utf_to_local ULmapGB18030[ 63360 ] = {
{0xc280, 0x81308130},
{0xc281, 0x81308131},
--
{0xefbfbe, 0x8431a438},
{0xefbfbf, 0x8431a439}
};
See the attachment for a python script to generate those mappings.
Gr. Arjen
Arjen Nienhuis <a.g.nienhuis@gmail.com> writes:
GB18030 is a special case, because it's a full mapping of all unicode
characters, and most of it is algorithmically defined.
True.
This makes UtfToLocal a bad choice to implement it.
I disagree with that conclusion. There are still 30000+ characters
that need to be translated via lookup table, so we still need either
UtfToLocal or a clone of it; and as I said previously, I'm not on board
with cloning it.
I think the best solution is to get rid of UtfToLocal for GB18030. Use
a specialized algorithm:
- For characters > U+FFFF use the algorithm from my patch
- For characters <= U+FFFF use special mapping tables to map from/to
UTF32. Those tables would be smaller, and the code would be faster (I
assume).
I looked at what wikipedia claims is the authoritative conversion table:
http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml
According to that, about half of the characters below U+FFFF can be
processed via linear conversions, so I think we ought to save table
space by doing that. However, the remaining stuff that has to be
processed by lookup still contains a pretty substantial number of
characters that map to 4-byte GB18030 characters, so I don't think
we can get any table size savings by adopting a bespoke table format.
We might as well use UtfToLocal. (Worth noting in this connection
is that we haven't seen fit to sweat about UtfToLocal's use of 4-byte
table entries for other encodings, even though most of the others
are not concerned with characters outside the BMP.)
regards, tom lane
On Fri, May 15, 2015 at 4:10 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:
Arjen Nienhuis <a.g.nienhuis@gmail.com> writes:
GB18030 is a special case, because it's a full mapping of all unicode
characters, and most of it is algorithmically defined.

True.

This makes UtfToLocal a bad choice to implement it.
I disagree with that conclusion. There are still 30000+ characters
that need to be translated via lookup table, so we still need either
UtfToLocal or a clone of it; and as I said previously, I'm not on board
with cloning it.

I think the best solution is to get rid of UtfToLocal for GB18030. Use
a specialized algorithm:
- For characters > U+FFFF use the algorithm from my patch
- For characters <= U+FFFF use special mapping tables to map from/to
UTF32. Those tables would be smaller, and the code would be faster (I
assume).

I looked at what wikipedia claims is the authoritative conversion table:
http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml
According to that, about half of the characters below U+FFFF can be
processed via linear conversions, so I think we ought to save table
space by doing that. However, the remaining stuff that has to be
processed by lookup still contains a pretty substantial number of
characters that map to 4-byte GB18030 characters, so I don't think
we can get any table size savings by adopting a bespoke table format.
We might as well use UtfToLocal. (Worth noting in this connection
is that we haven't seen fit to sweat about UtfToLocal's use of 4-byte
table entries for other encodings, even though most of the others
are not concerned with characters outside the BMP.)
It's not about 4 vs 2 bytes, it's about using 8 bytes vs 4. UtfToLocal
uses a sparse array:
map = {{0, x}, {1, y}, {2, z}, ...}
v.s.
map = {x, y, z, ...}
That's fine when not every code point is used, but it's different for
GB18030 where almost all code points are used. Using a plain array
saves space and saves a binary search.
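(To make the difference concrete, a rough sketch of the two lookup styles;
the dense table name below is invented for the example, while ULmapGB18030
and compare1 are the names from the patch above:)

	/* sparse form: ~63360 {utf, code} pairs, 8 bytes each, binary search */
	static uint32
	lookup_sparse(uint32 utf)
	{
		const pg_utf_to_local *p;

		p = bsearch(&utf, ULmapGB18030,
					sizeof(ULmapGB18030) / sizeof(pg_utf_to_local),
					sizeof(pg_utf_to_local), compare1);
		return p ? p->code : 0;
	}

	/* dense form: one 4-byte GB18030 code per BMP code point, direct index */
	extern const uint32 utf32_to_gb18030[65536];	/* hypothetical generated table */

	static uint32
	lookup_dense(uint32 utf32)
	{
		return (utf32 <= 0xffff) ? utf32_to_gb18030[utf32] : 0;
	}
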
Gr. Arjen
Arjen Nienhuis <a.g.nienhuis@gmail.com> writes:
On Fri, May 15, 2015 at 4:10 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:
According to that, about half of the characters below U+FFFF can be
processed via linear conversions, so I think we ought to save table
space by doing that. However, the remaining stuff that has to be
processed by lookup still contains a pretty substantial number of
characters that map to 4-byte GB18030 characters, so I don't think
we can get any table size savings by adopting a bespoke table format.
We might as well use UtfToLocal. (Worth noting in this connection
is that we haven't seen fit to sweat about UtfToLocal's use of 4-byte
table entries for other encodings, even though most of the others
are not concerned with characters outside the BMP.)
It's not about 4 vs 2 bytes, it's about using 8 bytes vs 4. UtfToLocal
uses a sparse array:
map = {{0, x}, {1, y}, {2, z}, ...}
v.s.
map = {x, y, z, ...}
That's fine when not every code point is used, but it's different for
GB18030 where almost all code points are used. Using a plain array
saves space and saves a binary search.
Well, it doesn't save any space: if we get rid of the additional linear
ranges in the lookup table, what remains is 30733 entries requiring about
256K, same as (or a bit less than) what you suggest.
The point about possibly being able to do this with a simple lookup table
instead of binary search is valid, but I still say it's a mistake to
suppose that we should consider that only for GB18030. With the reduced
table size, the GB18030 conversion tables are not all that far out of line
with the other Far Eastern conversions:
$ size utf8*.so | sort -n
text data bss dec hex filename
1880 512 16 2408 968 utf8_and_ascii.so
2394 528 16 2938 b7a utf8_and_iso8859_1.so
6674 512 16 7202 1c22 utf8_and_cyrillic.so
24318 904 16 25238 6296 utf8_and_win.so
28750 968 16 29734 7426 utf8_and_iso8859.so
121110 512 16 121638 1db26 utf8_and_euc_cn.so
123458 512 16 123986 1e452 utf8_and_sjis.so
133606 512 16 134134 20bf6 utf8_and_euc_kr.so
185014 512 16 185542 2d4c6 utf8_and_sjis2004.so
185522 512 16 186050 2d6c2 utf8_and_euc2004.so
212950 512 16 213478 341e6 utf8_and_euc_jp.so
221394 512 16 221922 362e2 utf8_and_big5.so
274772 512 16 275300 43364 utf8_and_johab.so
277776 512 16 278304 43f20 utf8_and_uhc.so
332262 512 16 332790 513f6 utf8_and_euc_tw.so
350640 512 16 351168 55bc0 utf8_and_gbk.so
496680 512 16 497208 79638 utf8_and_gb18030.so
If we were to get excited about reducing the conversion time for GB18030,
it would clearly make sense to use similar infrastructure for GBK, and
perhaps the EUC encodings too.
However, I'm not that excited about changing it. We have not heard field
complaints about these converters being too slow. What's more, there
doesn't seem to be any practical way to apply the same idea to the other
conversion direction, which means if you do feel there's a speed problem
this would only halfway fix it.
So my feeling is that the most practical and maintainable answer is to
keep GB18030 using code that is mostly shared with the other encodings.
I've committed a fix that does it that way for 9.5. If you want to
pursue the idea of a faster conversion using direct lookup tables,
I think that would be 9.6 material at this point.
regards, tom lane
On Fri, May 15, 2015 at 3:18 PM, Tom Lane <tgl@sss.pgh.pa.us> wrote:
However, I'm not that excited about changing it. We have not heard field
complaints about these converters being too slow. What's more, there
doesn't seem to be any practical way to apply the same idea to the other
conversion direction, which means if you do feel there's a speed problem
this would only halfway fix it.
Half a loaf is better than none.
--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
That's fine when not every code point is used, but it's different for
GB18030 where almost all code points are used. Using a plain array
saves space and saves a binary search.

Well, it doesn't save any space: if we get rid of the additional linear
ranges in the lookup table, what remains is 30733 entries requiring about
256K, same as (or a bit less than) what you suggest.
We could do both. What about something like this:
static unsigned int utf32_to_gb18030_from_0x0001[1105] = {
/* 0x0 */ 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
...
static unsigned int utf32_to_gb18030_from_0x2010[1587] = {
/* 0x0 */ 0xa95c, 0x8136a532, 0x8136a533, 0xa843, 0xa1aa, 0xa844,
0xa1ac, 0x8136a534,
...
static unsigned int utf32_to_gb18030_from_0x2E81[28965] = {
/* 0x0 */ 0xfe50, 0x8138fd39, 0x8138fe30, 0xfe54, 0x8138fe31,
0x8138fe32, 0x8138fe33, 0xfe57,
...
static unsigned int utf32_to_gb18030_from_0xE000[2149] = {
/* 0x0 */ 0xaaa1, 0xaaa2, 0xaaa3, 0xaaa4, 0xaaa5, 0xaaa6, 0xaaa7, 0xaaa8,
...
static unsigned int utf32_to_gb18030_from_0xF92C[254] = {
/* 0x0 */ 0xfd9c, 0x84308535, 0x84308536, 0x84308537, 0x84308538,
0x84308539, 0x84308630, 0x84308631,
...
static unsigned int utf32_to_gb18030_from_0xFE30[464] = {
/* 0x0 */ 0xa955, 0xa6f2, 0x84318538, 0xa6f4, 0xa6f5, 0xa6e0, 0xa6e1, 0xa6f0,
...
static uint32
conv_utf8_to_18030(uint32 code)
{
uint32 ucs = utf8word_to_unicode(code);
#define conv_lin(minunicode, maxunicode, mincode) \
if (ucs >= minunicode && ucs <= maxunicode) \
return gb_unlinear(ucs - minunicode + gb_linear(mincode))
#define conv_array(minunicode, maxunicode) \
if (ucs >= minunicode && ucs <= maxunicode) \
return utf32_to_gb18030_from_##minunicode[ucs - minunicode];
conv_array(0x0001, 0x0452);
conv_lin(0x0452, 0x200F, 0x8130D330);
conv_array(0x2010, 0x2643);
conv_lin(0x2643, 0x2E80, 0x8137A839);
conv_array(0x2E81, 0x9FA6);
conv_lin(0x9FA6, 0xD7FF, 0x82358F33);
conv_array(0xE000, 0xE865);
conv_lin(0xE865, 0xF92B, 0x8336D030);
conv_array(0xF92C, 0xFA2A);
conv_lin(0xFA2A, 0xFE2F, 0x84309C38);
conv_array(0xFE30, 0x10000);
conv_lin(0x10000, 0x10FFFF, 0x90308130);
/* No mapping exists */
return 0;
}
The point about possibly being able to do this with a simple lookup table
instead of binary search is valid, but I still say it's a mistake to
suppose that we should consider that only for GB18030. With the reduced
table size, the GB18030 conversion tables are not all that far out of line
with the other Far Eastern conversions:

$ size utf8*.so | sort -n
text data bss dec hex filename
1880 512 16 2408 968 utf8_and_ascii.so
2394 528 16 2938 b7a utf8_and_iso8859_1.so
6674 512 16 7202 1c22 utf8_and_cyrillic.so
24318 904 16 25238 6296 utf8_and_win.so
28750 968 16 29734 7426 utf8_and_iso8859.so
121110 512 16 121638 1db26 utf8_and_euc_cn.so
123458 512 16 123986 1e452 utf8_and_sjis.so
133606 512 16 134134 20bf6 utf8_and_euc_kr.so
185014 512 16 185542 2d4c6 utf8_and_sjis2004.so
185522 512 16 186050 2d6c2 utf8_and_euc2004.so
212950 512 16 213478 341e6 utf8_and_euc_jp.so
221394 512 16 221922 362e2 utf8_and_big5.so
274772 512 16 275300 43364 utf8_and_johab.so
277776 512 16 278304 43f20 utf8_and_uhc.so
332262 512 16 332790 513f6 utf8_and_euc_tw.so
350640 512 16 351168 55bc0 utf8_and_gbk.so
496680 512 16 497208 79638 utf8_and_gb18030.so

If we were to get excited about reducing the conversion time for GB18030,
it would clearly make sense to use similar infrastructure for GBK, and
perhaps the EUC encodings too.
I'll check them as well. If they have linear ranges it should work.
However, I'm not that excited about changing it. We have not heard field
complaints about these converters being too slow. What's more, there
doesn't seem to be any practical way to apply the same idea to the other
conversion direction, which means if you do feel there's a speed problem
this would only halfway fix it.
It does work if you linearize it first. That's why we need to convert
to utf32 first as well. That's a form of linearization.
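(Roughly, the reverse direction could reuse gb_linear() from the patch above
to turn a four-byte GB18030 code into a dense array index; the table name
below is invented for the example:)

	/* sketch: GB18030 (4-byte, BMP range) -> Unicode without a binary search */
	extern const uint32 gb18030_bmp_to_utf32[];	/* hypothetical generated table */

	static uint32
	lookup_gb_dense(uint32 gb)
	{
		/* index 0 corresponds to the first 4-byte code, 0x81308130 */
		return gb18030_bmp_to_utf32[gb_linear(gb) - gb_linear(0x81308130)];
	}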