// Sanity test for the UTF-8-specific character incrementer.
//
//   -v  display a status line for source code points that are not valid UTF-8.
//   -m  also display a status line when the result of the new incrementer
//       matches the result of the generic incrementer.
//   All status lines are shown when both -v and -m are specified.
//
// `utftest | grep FAILED' shows the remaining glitches of the new
// incrementer. (4 lines)
//
// CAUTION: this program produces a very large number of lines.
//
//   `utftest' yields 17375 lines. These lines are the cases saved by the
//   new function plus the remaining glitches.
//   `utftest -m' yields 1112064 lines.
//
// Sample status lines:
//   src char   src utf8  dst utf8  dest char  result status
//   000d7bf => ed9ebf => ed9f80 (000d7c0) successed - Don't match to generic inc(000d7bf)
//   000d7ff => ed9fbf => ed9fbf (000d7ff) FAILED - Match to generic inc
//   000d800 => eda080 Source not valid utf8
//
// "successed"/"FAILED" in the result status shows the return value of the
// character increment function. The description that follows it says whether
// the result of the new incrementer was or wasn't identical to the result of
// the generic incrementer.
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Prototypes.  pg_utf8_islegal() and pg_utf_mblen() are declared here as well
 * because main() calls them before their definitions.
 */
bool pg_utf8_islegal(const unsigned char *source, int length);
int pg_utf_mblen(const unsigned char *s);
static bool pg_utf8_increment(unsigned char *mbstr, int length);
static bool pg_generic_charinc(unsigned char *charptr, int len);
void uni2utf8(unsigned int unicode, unsigned char *utf8buf);
unsigned int utf8tounicode(const unsigned char *utf8buf);
int scatf(char *buf, const char *format, ...);

/*
 * Append the `len` bytes at `bytes` to the NUL-terminated string `out` as
 * two-digit lowercase hex, then append one pad string for every byte short
 * of four.
 */
static void
append_padded_hex(char *out, const unsigned char *bytes, int len)
{
    int j;

    for (j = 0; j < len; j++)
        scatf(out, "%02x", (unsigned int) bytes[j]);
    for (; j < 4; j++)
        scatf(out, " ");
}

/*
 * Define UTFTEST_NO_MAIN to compile only the helper functions, e.g. when this
 * file is pulled into a unit test that supplies its own main().
 */
#ifndef UTFTEST_NO_MAIN
/*
 * Walk every code point that uni2utf8() can encode, run both incrementers on
 * its UTF-8 form and print one status line per interesting case.
 *
 *   -v  also print code points whose encoding is not legal UTF-8
 *   -m  also print cases where both incrementers succeed and agree
 */
int
main(int argc, char **argv)
{
    unsigned char buf[4];
    unsigned char buf2[4];
    char outbuf[1024];
    unsigned int cp;
    int dispinvalid = 0;
    int dispmatch = 0;
    int argi;

    for (argi = 1; argi < argc; argi++) {
        if (strcmp(argv[argi], "-v") == 0)
            dispinvalid = 1;
        if (strcmp(argv[argi], "-m") == 0)
            dispmatch = 1;
    }

    /* 0x110000 is the first value uni2utf8() rejects, so stop just before it. */
    for (cp = 0; cp < 0x110000; cp++) {
        bool prechk, successed, gensuccess, match;
        int len;

        /*
         * Clear both buffers so that bytes beyond the encoded length hold a
         * defined value; utf8tounicode() may look at them when the generic
         * incrementer changes the lead byte of a short sequence.
         */
        memset(buf, 0, sizeof buf);
        memset(buf2, 0, sizeof buf2);
        uni2utf8(cp, buf);
        uni2utf8(cp, buf2);

        outbuf[0] = '\0';
        scatf(outbuf, "%07x => ", cp);

        len = pg_utf_mblen(buf);
        append_padded_hex(outbuf, buf, len);

        prechk = pg_utf8_islegal(buf, len);
        if (!prechk) {
            scatf(outbuf, "Source not valid utf8");
            if (dispinvalid)
                puts(outbuf);
            continue;
        }

        /* New incrementer works on buf, the generic one on buf2. */
        successed = pg_utf8_increment(buf, len);
        scatf(outbuf, " => ");
        append_padded_hex(outbuf, buf, len);

        gensuccess = pg_generic_charinc(buf2, len);
        match = (memcmp(buf, buf2, (size_t) len) == 0);

        if (!gensuccess || !match || dispmatch) {
            scatf(outbuf, "(%07x) %s - %s", utf8tounicode(buf),
                  (successed ? "successed" : "FAILED"),
                  (match ? "Match to generic inc" : "Don't match to generic inc"));
            if (!match)
                scatf(outbuf, "(%07x)", utf8tounicode(buf2));
            puts(outbuf);
        }
    }

    return 0;
}
#endif /* UTFTEST_NO_MAIN */

/*
 * Return true if the `length` bytes at `source` pass the UTF-8 range checks
 * below: continuation bytes must be 0x80..0xBF, the second byte is further
 * restricted after lead bytes 0xE0, 0xED, 0xF0 and 0xF4, and the lead byte
 * must not be in 0x80..0xC1 or above 0xF4.  Lengths other than 1..4 are
 * rejected.  The caller supplies `length`; it is not cross-checked against
 * the lead byte.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
    unsigned char a;

    switch (length) {
    default:
        /* reject lengths 5 and 6 for now */
        return false;
    case 4:
        a = source[3];
        if (a < 0x80 || a > 0xBF)
            return false;
        /* FALL THRU */
    case 3:
        a = source[2];
        if (a < 0x80 || a > 0xBF)
            return false;
        /* FALL THRU */
    case 2:
        a = source[1];
        switch (*source) {
        case 0xE0:
            if (a < 0xA0 || a > 0xBF)
                return false;
            break;
        case 0xED:
            if (a < 0x80 || a > 0x9F)
                return false;
            break;
        case 0xF0:
            if (a < 0x90 || a > 0xBF)
                return false;
            break;
        case 0xF4:
            if (a < 0x80 || a > 0x8F)
                return false;
            break;
        default:
            if (a < 0x80 || a > 0xBF)
                return false;
            break;
        }
        /* FALL THRU */
    case 1:
        a = *source;
        if (a >= 0x80 && a < 0xC2)
            return false;
        if (a > 0xF4)
            return false;
        break;
    }
    return true;
}

/*
 * Return the sequence length (1..4) implied by the lead byte at `s`.
 * A lead byte that matches none of the patterns yields 1.
 */
int
pg_utf_mblen(const unsigned char *s)
{
    int len;

    if ((*s & 0x80) == 0)
        len = 1;
    else if ((*s & 0xe0) == 0xc0)
        len = 2;
    else if ((*s & 0xf0) == 0xe0)
        len = 3;
    else if ((*s & 0xf8) == 0xf0)
        len = 4;
#ifdef NOT_USED
    else if ((*s & 0xfc) == 0xf8)
        len = 5;
    else if ((*s & 0xfe) == 0xfc)
        len = 6;
#endif
    else
        len = 1;
    return len;
}

/*
 * UTF-8-aware incrementer (the function under test).
 *
 * Tries to advance the `length`-byte character at `charptr` in place.
 * Returns true when the result passes pg_utf8_islegal(); otherwise the
 * original bytes are restored and false is returned.  Lengths outside 1..4
 * are rejected without touching the buffer.
 */
static bool
pg_utf8_increment(unsigned char *charptr, int length)
{
    unsigned char a;
    unsigned char bak[4];
    bool success;

    /* Validate before copying: bak[] only has room for four bytes. */
    if (length < 1 || length > 4)
        return false;

    memcpy(bak, charptr, (size_t) length);

    switch (length) {
    default:
        /* reject lengths 5 and 6 for now */
        return false;
    case 4:
        a = charptr[3];
        if (a < 0xBF) {
            charptr[3]++;
            break;
        }
        charptr[3] = 0x80;
        /* FALL THRU */
    case 3:
        a = charptr[2];
        if (a < 0xBF) {
            charptr[2]++;
            break;
        }
        charptr[2] = 0x80;
        /* FALL THRU */
    case 2:
        a = charptr[1];
        /*
         * NOTE(review): the 0xed disjunct is subsumed by `a < 0xBF`, so this
         * test is effectively just `a < 0xBF`.  For ed 9f bf (U+D7FF) the
         * second byte is therefore bumped to 0xa0, the final legality check
         * rejects it, and the function reports failure.  Left as is because
         * this is the behaviour the harness exists to display.
         */
        if ((*charptr == 0xed && a < 0x9F) || a < 0xBF) {
            charptr[1]++;
            break;
        }
        charptr[1] = 0x80;
        /* FALL THRU */
    case 1:
        a = *charptr;
        if (a == 0x7F || a == 0xDF || a == 0xEF || a == 0xF7) {
            memcpy(charptr, bak, (size_t) length);
            return false;
        }
        charptr[0]++;
        break;
    }

    /* Check the result with pg_utf8_islegal as the last resort. */
    success = pg_utf8_islegal(charptr, length);
    if (!success)
        memcpy(charptr, bak, (size_t) length);

    return success;
}

/*
 * Encode `unicode` into `utf8buf` using 1..4 bytes; the buffer must have room
 * for four.  No validity check is made below 0x110000, so surrogate values
 * are encoded as well.  Values of 0x110000 and above terminate the program
 * with exit status 1.
 */
void
uni2utf8(unsigned int unicode, unsigned char *utf8buf)
{
    int i, len;

    if (unicode < 0x80) {
        len = 1;
        utf8buf[0] = 0;
    } else if (unicode < 0x800) {
        len = 2;
        utf8buf[0] = 0xc0;
    } else if (unicode < 0x10000) {
        len = 3;
        utf8buf[0] = 0xe0;
    } else if (unicode < 0x110000) {
        len = 4;
        utf8buf[0] = 0xf0;
    } else {
        fprintf(stderr, "Unicode out of range: %x\n", unicode);
        exit(1);
    }

    /* Fill continuation bytes from the tail, six payload bits each. */
    for (i = len - 1; i > 0; i--) {
        utf8buf[i] = (unsigned char) (0x80 | (unicode & 0x3f));
        unicode >>= 6;
    }
    /* Whatever is left belongs in the lead byte. */
    utf8buf[0] |= (unsigned char) unicode;
}

/*
 * Decode the sequence at `utf8buf`, taking its length from the lead byte, and
 * return the resulting value.  Continuation bytes are not validated.  Returns
 * 0xfffffff when the lead byte is 0x80..0xBF or 0xF8 and above.
 *
 * The arithmetic is done in unsigned int so that an out-of-range continuation
 * byte wraps instead of left-shifting a negative value.
 */
unsigned int
utf8tounicode(const unsigned char *utf8buf)
{
    unsigned int lead = utf8buf[0];

    if (lead < 0x80)
        return lead;
    if (lead < 0xc0)
        return 0xfffffff;
    if (lead < 0xe0)
        return ((lead - 0xc0u) << 6)
            + ((unsigned int) utf8buf[1] - 0x80u);
    if (lead < 0xf0)
        return ((lead - 0xe0u) << 12)
            + (((unsigned int) utf8buf[1] - 0x80u) << 6)
            + ((unsigned int) utf8buf[2] - 0x80u);
    if (lead < 0xf8)
        return ((lead - 0xf0u) << 18)
            + (((unsigned int) utf8buf[1] - 0x80u) << 12)
            + (((unsigned int) utf8buf[2] - 0x80u) << 6)
            + ((unsigned int) utf8buf[3] - 0x80u);
    return 0xfffffff;
}

/*
 * Generic incrementer used as the reference: keep bumping the last byte of
 * the `len`-byte character until pg_utf8_islegal() accepts it.  Returns true
 * on the first accepted value; if the byte reaches 255 without one, the
 * original last byte is restored and false is returned.
 */
static bool
pg_generic_charinc(unsigned char *charptr, int len)
{
    unsigned char *lastchar = charptr + len - 1;
    unsigned char savelastchar = *lastchar;

    while (*lastchar < (unsigned char) 255) {
        (*lastchar)++;
        if (!pg_utf8_islegal(charptr, len))    /* modified. */
            continue;
        return true;
    }

    *lastchar = savelastchar;
    return false;
}

/*
 * printf-style append: format the arguments onto the end of the
 * NUL-terminated string in `buf` and return vsprintf()'s result.
 * No bounds checking is done; the caller must provide a large enough buffer.
 */
int
scatf(char *buf, const char *format, ...)
{
    va_list args;
    int ret;

    va_start(args, format);
    ret = vsprintf(buf + strlen(buf), format, args);
    va_end(args);

    return ret;
}