Index: src/backend/utils/mb/conv.c =================================================================== RCS file: /projects/cvsroot/pgsql/src/backend/utils/mb/conv.c,v retrieving revision 1.52 diff -c -r1.52 conv.c *** src/backend/utils/mb/conv.c 7 Mar 2005 04:30:52 -0000 1.52 --- src/backend/utils/mb/conv.c 5 Jun 2005 04:40:53 -0000 *************** *** 361,372 **** iutf = *utf++ << 8; iutf |= *utf++; } ! else { iutf = *utf++ << 16; iutf |= *utf++ << 8; iutf |= *utf++; } p = bsearch(&iutf, map, size, sizeof(pg_utf_to_local), compare1); if (p == NULL) --- 361,379 ---- iutf = *utf++ << 8; iutf |= *utf++; } ! else if (l == 3) { iutf = *utf++ << 16; iutf |= *utf++ << 8; iutf |= *utf++; } + else if (l == 4) + { + iutf = *utf++ << 24; + iutf |= *utf++ << 16; + iutf |= *utf++ << 8; + iutf |= *utf++; + } p = bsearch(&iutf, map, size, sizeof(pg_utf_to_local), compare1); if (p == NULL) Index: src/backend/utils/mb/wchar.c =================================================================== RCS file: /projects/cvsroot/pgsql/src/backend/utils/mb/wchar.c,v retrieving revision 1.43 diff -c -r1.43 wchar.c *** src/backend/utils/mb/wchar.c 14 Mar 2005 18:31:20 -0000 1.43 --- src/backend/utils/mb/wchar.c 5 Jun 2005 04:40:54 -0000 *************** *** 406,413 **** len = 1; else if ((*s & 0xe0) == 0xc0) len = 2; ! else if ((*s & 0xe0) == 0xe0) ! len = 3; return (len); } --- 406,419 ---- len = 1; else if ((*s & 0xe0) == 0xc0) len = 2; ! else if ((*s & 0xf0) == 0xe0) ! len = 3; ! else if ((*s & 0xf8) == 0xf0) ! len = 4; ! else if ((*s & 0xfc) == 0xf8) ! len = 5; ! else if ((*s & 0xfe) == 0xfc) ! len = 6; return (len); } *************** *** 721,727 **** {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */ {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */ {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3}, /* 5; PG_JOHAB */ ! 
{pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 3}, /* 6; PG_UTF8 */
  {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */
  {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */
  {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */
--- 727,733 ----
  {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3}, /* 3; PG_EUC_KR */
  {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3}, /* 4; PG_EUC_TW */
  {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3}, /* 5; PG_JOHAB */
! {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 4}, /* 6; PG_UTF8 */
  {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */
  {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 8; PG_LATIN1 */
  {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1}, /* 9; PG_LATIN2 */
***************
*** 800,805 ****
--- 806,845 ----
  
  #ifndef FRONTEND
  
+ /*
+  * pg_utf8_islegal --- check one UTF-8 character for legality.
+  *
+  * Returns true if the "length" bytes at "source" form a legal UTF-8
+  * sequence, false otherwise.  Only lengths 1 to 4 are accepted.  The
+  * lead byte is not checked against "length"; the caller must pass the
+  * length implied by the lead byte.
+  */
+ bool pg_utf8_islegal(const unsigned char *source, int length) {
+ 	unsigned char a;
+ 	const unsigned char *srcptr = source+length;
+ 	switch (length) {
+ 	default: return false;
+ 		/* Everything else falls through when "true"... */
+ 	case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+ 	case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+ 	case 2: if ((a = (*--srcptr)) > 0xBF) return false;
+ 		/* a is now the second byte; its legal range depends on the lead byte */
+ 		switch (*source) {
+ 			/* no fall-through in this inner switch */
+ 		case 0xE0: if (a < 0xA0) return false; break;	/* overlong 3-byte form */
+ 		case 0xED: if (a < 0x80 || a > 0x9F) return false; break;	/* surrogates */
+ 		case 0xF0: if (a < 0x90) return false; break;	/* overlong 4-byte form */
+ 		case 0xF4: if (a < 0x80 || a > 0x8F) return false; break;	/* above U+10FFFF */
+ 		default: if (a < 0x80) return false;
+ 		}
+ 
+ 	case 1: if (*source >= 0x80 && *source < 0xC2) return false;	/* stray continuation byte or overlong 2-byte lead */
+ 	}
+ 	if (*source > 0xF4) return false;
+ 	return true;
+ }
+ 
+ 
  /*
   * Verify mbstr to make sure that it has a valid character sequence.
* mbstr is not necessarily NULL terminated; length of mbstr is *************** *** 823,873 **** while (len > 0 && *mbstr) { - /* special UTF8 check */ - if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0) - { - if (noError) - return false; - ereport(ERROR, - (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("Unicode characters greater than or equal to 0x10000 are not supported"))); - } - l = pg_mblen(mbstr); ! ! for (i = 1; i < l; i++) ! { ! /* ! * we expect that every multibyte char consists of bytes ! * having the 8th bit set ! */ ! if (i >= len || (mbstr[i] & 0x80) == 0) { ! char buf[8 * 2 + 1]; ! char *p = buf; ! int j, jlimit; ! if (noError) ! return false; ! jlimit = Min(l, len); ! jlimit = Min(jlimit, 8); /* prevent buffer overrun */ ! for (j = 0; j < jlimit; j++) ! p += sprintf(p, "%02x", mbstr[j]); ! ereport(ERROR, ! (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("invalid byte sequence for encoding \"%s\": 0x%s", ! GetDatabaseEncodingName(), buf))); } } - len -= l; mbstr += l; } - return true; } --- 854,900 ---- while (len > 0 && *mbstr) { l = pg_mblen(mbstr); ! ! /* special UTF-8 check */ ! if (encoding == PG_UTF8) { ! if(!pg_utf8_islegal(mbstr,l)) { ! if (noError) return false; ! ereport(ERROR,(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),errmsg("Invalid UNICODE byte sequence detected near byte %c",*mbstr))); ! } ! } else { ! for (i = 1; i < l; i++) { ! /* ! * we expect that every multibyte char consists of bytes ! * having the 8th bit set ! */ ! if (i >= len || (mbstr[i] & 0x80) == 0) ! { ! char buf[8 * 2 + 1]; ! char *p = buf; ! int j, jlimit; ! if (noError) ! return false; ! jlimit = Min(l, len); ! jlimit = Min(jlimit, 8); /* prevent buffer overrun */ ! for (j = 0; j < jlimit; j++) ! p += sprintf(p, "%02x", mbstr[j]); ! ereport(ERROR, ! (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), ! errmsg("invalid byte sequence for encoding \"%s\": 0x%s", ! GetDatabaseEncodingName(), buf))); ! 
} } } len -= l; mbstr += l; } return true; } Index: src/include/mb/pg_wchar.h =================================================================== RCS file: /projects/cvsroot/pgsql/src/include/mb/pg_wchar.h,v retrieving revision 1.58 diff -c -r1.58 pg_wchar.h *** src/include/mb/pg_wchar.h 14 Mar 2005 18:31:24 -0000 1.58 --- src/include/mb/pg_wchar.h 5 Jun 2005 04:41:08 -0000 *************** *** 340,343 **** --- 340,345 ---- extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab); extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab); + extern bool pg_utf8_islegal(const unsigned char *source, int length); + #endif /* PG_WCHAR_H */