EUC_JP and SJIS conversion improvement
The character-code conversion from EUC_JP to SJIS is performed in
two stages. The first stage is conversion from EUC_JP to MIC.
The second stage is conversion from MIC to SJIS. (Conversion from SJIS to
EUC_JP works similarly.)
This is not very efficient, because it requires allocating a
buffer for MIC and performing the conversion calculation twice.
The attached patch enables direct conversion between EUC_JP and
SJIS. Additionally, it reduces the number of calls to
pg_mic_mblen.
The effect of the patch, as I measured it, is as follows:
o The test data was created by 'pgbench -i'.
o Test SQL:
set client_encoding to 'SJIS';
select * from accounts;
o Test results: Linux(CPU: Pentium III, Compiler option: -O2)
- original: 2.920s
- patched : 2.278s
regards,
---
Atsushi Ogawa
Attachments:
euc_jp_and_sjis.patch (application/octet-stream; name=euc_jp_and_sjis.patch) [Download]
*** ./src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c.orig Mon Apr 11 14:52:35 2005
--- ./src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c Mon Apr 11 15:03:45 2005
***************
*** 58,63 ****
--- 58,65 ----
static void mic2sjis(unsigned char *mic, unsigned char *p, int len);
static void euc_jp2mic(unsigned char *euc, unsigned char *p, int len);
static void mic2euc_jp(unsigned char *mic, unsigned char *p, int len);
+ static void euc_jp2sjis(unsigned char *mic, unsigned char *p, int len);
+ static void sjis2euc_jp(unsigned char *mic, unsigned char *p, int len);
Datum
euc_jp_to_sjis(PG_FUNCTION_ARGS)
***************
*** 65,80 ****
unsigned char *src = PG_GETARG_CSTRING(2);
unsigned char *dest = PG_GETARG_CSTRING(3);
int len = PG_GETARG_INT32(4);
- unsigned char *buf;
Assert(PG_GETARG_INT32(0) == PG_EUC_JP);
Assert(PG_GETARG_INT32(1) == PG_SJIS);
Assert(len >= 0);
! buf = palloc(len * ENCODING_GROWTH_RATE);
! euc_jp2mic(src, buf, len);
! mic2sjis(buf, dest, strlen(buf));
! pfree(buf);
PG_RETURN_VOID();
}
--- 67,78 ----
unsigned char *src = PG_GETARG_CSTRING(2);
unsigned char *dest = PG_GETARG_CSTRING(3);
int len = PG_GETARG_INT32(4);
Assert(PG_GETARG_INT32(0) == PG_EUC_JP);
Assert(PG_GETARG_INT32(1) == PG_SJIS);
Assert(len >= 0);
! euc_jp2sjis(src, dest, len);
PG_RETURN_VOID();
}
***************
*** 85,100 ****
unsigned char *src = PG_GETARG_CSTRING(2);
unsigned char *dest = PG_GETARG_CSTRING(3);
int len = PG_GETARG_INT32(4);
- unsigned char *buf;
Assert(PG_GETARG_INT32(0) == PG_SJIS);
Assert(PG_GETARG_INT32(1) == PG_EUC_JP);
Assert(len >= 0);
! buf = palloc(len * ENCODING_GROWTH_RATE);
! sjis2mic(src, buf, len);
! mic2euc_jp(buf, dest, strlen(buf));
! pfree(buf);
PG_RETURN_VOID();
}
--- 83,94 ----
unsigned char *src = PG_GETARG_CSTRING(2);
unsigned char *dest = PG_GETARG_CSTRING(3);
int len = PG_GETARG_INT32(4);
Assert(PG_GETARG_INT32(0) == PG_SJIS);
Assert(PG_GETARG_INT32(1) == PG_EUC_JP);
Assert(len >= 0);
! sjis2euc_jp(src, dest, len);
PG_RETURN_VOID();
}
***************
*** 454,456 ****
--- 448,646 ----
}
*p = '\0';
}
+
+ /*
+ * EUC_JP -> SJIS
+ */
+ static void
+ euc_jp2sjis(unsigned char *euc, unsigned char *p, int len)
+ {
+ int c1,
+ c2,
+ k;
+ unsigned char *euc_end = euc + len;
+
+ while (euc_end >= euc && (c1 = *euc++))
+ {
+ if(c1 < 0x80)
+ {
+ /* should be ASCII */
+ *p++ = c1;
+ }
+ else if (c1 == SS2)
+ {
+ /* hankaku kana? */
+ *p++ = *euc++;
+ }
+ else if (c1 == SS3)
+ {
+ /* JIS X0212 kanji? */
+ c1 = *euc++;
+ c2 = *euc++;
+ k = c1 << 8 | c2;
+ if (k >= 0xf5a1)
+ {
+ /* UDC2 */
+ c1 -= 0x54;
+ *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1) + 0x74;
+ *p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2);
+ }
+ else
+ {
+ int i, k2;
+
+ /* IBM kanji */
+ for (i = 0;; i++)
+ {
+ k2 = ibmkanji[i].euc & 0xffff;
+ if (k2 == 0xffff)
+ {
+ *p++ = PGSJISALTCODE >> 8;
+ *p++ = PGSJISALTCODE & 0xff;
+ break;
+ }
+ if (k2 == k)
+ {
+ k = ibmkanji[i].sjis;
+ *p++ = k >> 8;
+ *p++ = k & 0xff;
+ break;
+ }
+ }
+ }
+ }
+ else
+ {
+ /* JIS X0208 kanji? */
+ c2 = *euc++;
+ k = (c1 << 8) | (c2 & 0xff);
+ if (k >= 0xf5a1)
+ {
+ /* UDC1 */
+ c1 -= 0x54;
+ *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1) + 0x6f;
+ }
+ else
+ *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1);
+ *p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2);
+ }
+ }
+ *p = '\0';
+ }
+
+ /*
+ * SJIS ---> EUC_JP
+ */
+ static void
+ sjis2euc_jp(unsigned char *sjis, unsigned char *p, int len)
+ {
+ int c1,
+ c2,
+ i,
+ k,
+ k2;
+ unsigned char *sjis_end = sjis + len;
+
+ while (sjis_end >= sjis && (c1 = *sjis++))
+ {
+ if(c1 < 0x80)
+ {
+ /* should be ASCII */
+ *p++ = c1;
+ }
+ else if (c1 >= 0xa1 && c1 <= 0xdf)
+ {
+ /* JIS X0201 (1 byte kana) */
+ *p++ = SS2;
+ *p++ = c1;
+ }
+ else
+ {
+ /*
+ * JIS X0208, X0212, user defined extended characters
+ */
+ c2 = *sjis++;
+ k = (c1 << 8) + c2;
+ if (k >= 0xed40 && k < 0xf040)
+ {
+ /* NEC selection IBM kanji */
+ for (i = 0;; i++)
+ {
+ k2 = ibmkanji[i].nec;
+ if (k2 == 0xffff)
+ break;
+ if (k2 == k)
+ {
+ k = ibmkanji[i].sjis;
+ c1 = (k >> 8) & 0xff;
+ c2 = k & 0xff;
+ }
+ }
+ }
+
+ if (k < 0xeb3f)
+ {
+ /* JIS X0208 */
+ *p++ = ((c1 & 0x3f) << 1) + 0x9f + (c2 > 0x9e);
+ *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
+ }
+ else if ((k >= 0xeb40 && k < 0xf040) || (k >= 0xfc4c && k <= 0xfcfc))
+ {
+ /* NEC selection IBM kanji - Other undecided justice */
+ *p++ = PGEUCALTCODE >> 8;
+ *p++ = PGEUCALTCODE & 0xff;
+ }
+ else if (k >= 0xf040 && k < 0xf540)
+ {
+ /*
+ * UDC1 mapping to X0208 85 ku - 94 ku JIS code 0x7521 -
+ * 0x7e7e EUC 0xf5a1 - 0xfefe
+ */
+ c1 -= 0x6f;
+ *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
+ *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
+ }
+ else if (k >= 0xf540 && k < 0xfa40)
+ {
+ /*
+ * UDC2 mapping to X0212 85 ku - 94 ku JIS code 0x7521 -
+ * 0x7e7e EUC 0x8ff5a1 - 0x8ffefe
+ */
+ *p++ = SS3;
+ c1 -= 0x74;
+ *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
+ *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
+ }
+ else if (k >= 0xfa40)
+ {
+ /*
+ * mapping IBM kanji to X0208 and X0212
+ *
+ */
+ for (i = 0;; i++)
+ {
+ k2 = ibmkanji[i].sjis;
+ if (k2 == 0xffff)
+ break;
+ if (k2 == k)
+ {
+ k = ibmkanji[i].euc;
+ if (k >= 0x8f0000)
+ {
+ *p++ = SS3;
+ *p++ = 0x80 | ((k & 0xff00) >> 8);
+ *p++ = 0x80 | (k & 0xff);
+ }
+ else
+ {
+ *p++ = 0x80 | (k >> 8);
+ *p++ = 0x80 | (k & 0xff);
+ }
+ }
+ }
+ }
+ }
+ }
+ *p = '\0';
+ }
+
The character-code conversion from EUC_JP to SJIS is executed by
converting two stages. The first stage is conversion from EUC_JP to MIC.
The next stage is conversion from MIC to SJIS. (Conversion from SJIS to
EUC_JP is also similar.)
It is not so efficient, because it is necessary to allocate the
buffer for MIC, and to execute the calculation for conversion twice.
In the attached patch, it enables the direct conversion of EUC_JP and
SJIS. Additionally, there is an improvement that reduce the call of
pg_mic_mblen.
The effect of the patch that I measured is as follows:
o The Data for test was created by 'pgbench -i'.
o Test SQL:
set client_encoding to 'SJIS';
select * from accounts;
o Test results: Linux(CPU: Pentium III, Compiler option: -O2)
- original: 2.920s
- patched : 2.278s
regards,
---
Atsushi Ogawa
I have tested Atsushi's patches with PostgreSQL 8.0.3 on my notebook PC
running Linux 2.4 and got the following results (database encoding is
EUC_JP):
1) without patches
$ time psql -c 'set client_encoding to 'SJIS';select * from accounts;' test >/dev/null
real 0m4.926s
user 0m1.680s
sys 0m0.090s
2) with patches
$ time psql -c 'set client_encoding to 'SJIS';select * from accounts;' test >/dev/null
real 0m3.816s
user 0m1.560s
sys 0m0.070s
3) no encoding conversions
$ time psql -c 'set client_encoding to 'EUC_JP';select * from accounts;' test >/dev/null
real 0m3.220s
user 0m1.760s
sys 0m0.070s
With the patches, the encoding conversion overhead decreased from 52% to
18%. This is a huge improvement! I will commit this to current if there
are no objections.
--
Tatsuo Ishii