diff -ruN postgresql-7.5-old/src/backend/utils/mb/wchar.c postgresql-7.5/src/backend/utils/mb/wchar.c
--- postgresql-7.5-old/src/backend/utils/mb/wchar.c	2004-08-06 22:44:26.000000000 +1000
+++ postgresql-7.5/src/backend/utils/mb/wchar.c	2004-08-07 01:19:39.000000000 +1000
@@ -801,6 +801,77 @@
 
 #ifndef FRONTEND
 
+/* --------------------------------------------------------------------- */
+
+/*
+ * Index into the table below with the first byte of a UTF-8 sequence to
+ * get the number of trailing bytes that are supposed to follow it.
+ *
+ * Continuation bytes (0x80..0xBF) map to 0 here just like ASCII does;
+ * isLegalUTF8() rejects them as lead bytes on its own.
+ */
+
+static const char trailingBytesForUTF8[256] = {
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
+};
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Utility routine to tell whether a sequence of bytes is legal UTF-8.
+ *
+ * source points at the first byte of one encoded character.  Returns true
+ * if that byte starts a well-formed sequence of one to four bytes, false
+ * otherwise.  Overlong forms, values above U+10FFFF and the surrogate
+ * range U+D800..U+DFFF are all rejected.
+ *
+ * Trailing bytes are checked front to back and the scan stops at the first
+ * bad one.  A NUL can never be a trailing byte, so on a NUL-terminated
+ * string nothing past the terminator is read.  No length is passed in, so
+ * a caller holding a counted, unterminated buffer must make sure the whole
+ * sequence fits before calling.
+ */
+
+static unsigned char isLegalUTF8(const unsigned char *source) {
+	unsigned char lead = *source;
+	int length = trailingBytesForUTF8[lead]+1;
+	unsigned char lo = 0x80;
+	unsigned char hi = 0xBF;
+	int i;
+
+	/* 5- and 6-byte forms are not legal UTF-8 */
+	if (length > 4) return false;
+	/* stray continuation byte, or overlong two-byte lead (0xC0, 0xC1) */
+	if (lead >= 0x80 && lead < 0xC2) return false;
+	/* a lead byte above 0xF4 would encode a value beyond U+10FFFF */
+	if (lead > 0xF4) return false;
+
+	if (length == 1) return true;
+
+	/* a few lead bytes narrow the legal range of the second byte */
+	switch (lead) {
+		case 0xE0: lo = 0xA0; break;	/* overlong three-byte forms */
+		case 0xED: hi = 0x9F; break;	/* surrogates U+D800..U+DFFF */
+		case 0xF0: lo = 0x90; break;	/* overlong four-byte forms */
+		case 0xF4: hi = 0x8F; break;	/* values above U+10FFFF */
+		default: break;
+	}
+	if (source[1] < lo || source[1] > hi) return false;
+
+	/* any further trailing bytes are plain continuation bytes */
+	for (i = 2; i < length; i++) {
+		if (source[i] < 0x80 || source[i] > 0xBF) return false;
+	}
+	return true;
+}
+
 /*
  * Verify mbstr to make sure that it has a valid character sequence.
  * mbstr is not necessarily NULL terminated; length of mbstr is
@@ -825,14 +896,23 @@
 	while (len > 0 && *mbstr)
 	{
 		/* special UTF-8 check */
-		if (encoding == PG_UTF8 && (*mbstr & 0xf8) == 0xf0)
+		if (encoding == PG_UTF8 && !isLegalUTF8(mbstr))
 		{
 			if (noError)
 				return false;
 			ereport(ERROR,
 					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-					 errmsg("Unicode characters greater than or equal to 0x10000 are not supported")));
+					 errmsg("Invalid UNICODE byte sequence detected")));
 		}
+		/*
+		 * NOTE(review): this returns as soon as the first character has
+		 * passed the check, so the rest of a UTF-8 string is never
+		 * examined.  The loop presumably ought to step past the sequence
+		 * and carry on instead; confirm against the remainder of the loop
+		 * body, which this hunk does not show.
+		 */
+		if (encoding == PG_UTF8)
+			return true;
 
 		l = pg_mblen(mbstr);
 