// sanity test for utf specific character incrementer.
//
// -v displays status for invalid source code.
// -m also displays status lines for cases where the result of the new
//    incrementer matches that of the generic incrementer.
// All status lines are shown when both -v and -m are specified.
//
// `utftest | grep FAILED' shows the remaining glitches of the new
//    incrementer (4 lines).
//
// CAUTION: this program produces a very large number of lines.
//
// `utftest' yields 17375 lines. These are the cases rescued by the new
//           function plus the remaining glitches.
// `utftest -m' yields 1112064 lines.
//
//
// Sample of status lines:
// src char  src utf8   dst utf8  dest char result status
// 000d7bf => ed9ebf   => ed9f80  (000d7c0) successed - Don't match to generic inc(000d7bf)
// 000d7ff => ed9fbf   => ed9fbf  (000d7ff) FAILED - Match to generic inc
// 000d800 => eda080  Source not valid utf8
//
// successed/FAILED in the result status shows the return value of the
// character increment function. The description that follows says
// whether the result of the new incrementer was or wasn't identical to
// that of the generic incrementer.

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Minimal boolean support for this stand-alone test program.
 * true/false are compile-time constants rather than mutable variables.
 */
typedef int bool;
enum { false = 0, true = 1 };

/* Incrementer under test and the generic incrementer it is compared to. */
static bool pg_utf8_increment(unsigned char *mbstr, int length);
static bool pg_generic_charinc(unsigned char *charptr, int len);

/*
 * UTF-8 helpers defined after main().  They are declared here so that
 * main() calls them through a visible prototype instead of an implicit
 * declaration.
 */
bool pg_utf8_islegal(const unsigned char *source, int length);
int pg_utf_mblen(const unsigned char *s);

/* Conversion between a code point and its UTF-8 byte sequence. */
void uni2utf8(unsigned int unicode, unsigned char *utf8buf);
unsigned int utf8tounicode(unsigned char *utf8buf);

/* printf-style append to the NUL-terminated string already in buf. */
int scatf(char* buf, char* format, ...);

/*
 * Test driver.  For every code point from 0 through 0x10FFFF, encode it
 * with uni2utf8(), run both pg_utf8_increment() and pg_generic_charinc()
 * on the encoding, and print a status line describing the outcome.
 *
 * Options:
 *   -v  also print a line for code points whose encoding is rejected by
 *       pg_utf8_islegal().
 *   -m  print a line for every valid code point, including those where
 *       both incrementers produce the same bytes.
 *
 * Without -m, a line is printed only when the generic incrementer failed
 * or the two results differ.
 */
int main(int argc, char** argv) {
  /* Zero-filled so bytes beyond the encoded length are never indeterminate. */
  unsigned char buf[4] = {0}, buf2[4] = {0};
  char outbuf[1024];
  unsigned int i;
  int argi;
  int dispinvalid = 0;
  int dispmatch = 0;

  for (argi = 1 ; argi < argc ; argi++) {
	if (strcmp(argv[argi], "-v") == 0) dispinvalid = 1;
	if (strcmp(argv[argi], "-m") == 0) dispmatch = 1;
  }

  /*
   * uni2utf8() terminates the program for values of 0x110000 and above,
   * so the scan stops at the last encodable code point, 0x10FFFF.
   */
  for(i = 0 ; i < 0x110000 ; i++) {
	bool prechk, successed, gensuccess, match;

	/* buf is handed to the new incrementer, buf2 to the generic one. */
	uni2utf8(i, buf);
	uni2utf8(i, buf2);
	*outbuf = 0;

	scatf(outbuf, "%07x => ", i);

	int len = pg_utf_mblen(buf);

	int j = 0;

	/* Source bytes in hex, padded to the width of a 4-byte sequence. */
	while (j < len)
	  scatf(outbuf, "%02x", buf[j++]);

	while (j < 4) {
	  scatf(outbuf, "  ");
	  j++;
	}

	prechk = pg_utf8_islegal(buf, len);
	if (! prechk) {
	  scatf(outbuf, "Source not valid utf8");

	  if (dispinvalid)
		puts(outbuf);
	  continue;
	}

	successed = pg_utf8_increment(buf, len);
	scatf(outbuf, " => ");

	/* Bytes after the new incrementer ran, padded the same way. */
	j = 0;
	while (j < len)
	  scatf(outbuf, "%02x", buf[j++]);

	while (j < 4) {
	  scatf(outbuf, "  ");
	  j++;
	}

	gensuccess = pg_generic_charinc(buf2, len);

	match = (memcmp(buf, buf2, len) == 0);

	if (!gensuccess || !match || dispmatch) {
	  scatf(outbuf,
			"(%07x) %s - %s",
			utf8tounicode(buf),
			(successed ? "successed" : "FAILED"),
			(match ? "Match to generic inc" : "Don't match to generic inc"));
	  if (!match) {
		/* Show what the generic incrementer produced instead. */
		scatf(outbuf, "(%07x)", utf8tounicode(buf2));
	  }
	  puts(outbuf);
	}
  }

  return 0;
}

/*
 * Report whether the `length` bytes at `source` pass this file's UTF-8
 * validity rules for a single character.
 *
 * Lengths outside 1..4 are always rejected.  Third and fourth bytes must
 * lie in 0x80..0xBF.  The second byte's allowed range depends on the lead
 * byte (narrowed for 0xE0, 0xED, 0xF0 and 0xF4).  Finally the lead byte
 * itself must not be in 0x80..0xC1 nor above 0xF4.
 */
bool
pg_utf8_islegal(const unsigned char *source, int length)
{
	unsigned char lead;
	int			idx;

	/* reject lengths 5 and 6 (and anything else out of range) for now */
	if (length < 1 || length > 4)
		return false;

	lead = source[0];

	/* bytes after the second one are plain continuation bytes */
	for (idx = length - 1; idx >= 2; idx--)
	{
		if (source[idx] < 0x80 || source[idx] > 0xBF)
			return false;
	}

	if (length >= 2)
	{
		unsigned char lowest = 0x80;
		unsigned char highest = 0xBF;

		/* some lead bytes narrow the range allowed for the second byte */
		if (lead == 0xE0)
			lowest = 0xA0;
		else if (lead == 0xED)
			highest = 0x9F;
		else if (lead == 0xF0)
			lowest = 0x90;
		else if (lead == 0xF4)
			highest = 0x8F;

		if (source[1] < lowest || source[1] > highest)
			return false;
	}

	/* lead byte check applies to every length */
	if (lead >= 0x80 && lead < 0xC2)
		return false;
	if (lead > 0xF4)
		return false;

	return true;
}

/*
 * Return the byte length implied by the first byte at `s`, judged only
 * from that byte's high-order bit pattern.
 *
 * Bytes that match none of the recognised lead patterns yield 1.
 * Lengths 5 and 6 are recognised only when NOT_USED is defined.
 */
int
pg_utf_mblen(const unsigned char *s)
{
	const unsigned char lead = *s;

	if (lead < 0x80)			/* 0xxxxxxx */
		return 1;
	if ((lead >> 5) == 0x06)	/* 110xxxxx */
		return 2;
	if ((lead >> 4) == 0x0e)	/* 1110xxxx */
		return 3;
	if ((lead >> 3) == 0x1e)	/* 11110xxx */
		return 4;
#ifdef NOT_USED
	if ((lead >> 2) == 0x3e)	/* 111110xx */
		return 5;
	if ((lead >> 1) == 0x7e)	/* 1111110x */
		return 6;
#endif
	/* anything else is counted as a single byte */
	return 1;
}


/*
 * Increment the single UTF-8 character at charptr (length bytes) in place
 * to the next character of the same byte length.
 *
 * Returns true on success.  Returns false, with the buffer restored to its
 * original contents, when length is not in 1..4, when the character is the
 * largest of its byte length, or when the incremented bytes are rejected
 * by pg_utf8_islegal().
 *
 * The input is expected to be a valid character; main() only calls this
 * after pg_utf8_islegal() has accepted the source bytes.
 */
static bool pg_utf8_increment(unsigned char *charptr, int length)
{
	unsigned char a;
	unsigned char limit;
	unsigned char bak[4];
	bool success;

	/*
	 * Reject lengths 5 and 6 (and nonsensical ones) before saving the
	 * original bytes: bak[] only has room for 4.
	 */
	if (length < 1 || length > 4)
		return false;

	memcpy(bak, charptr, length);
	switch (length)
	{
		default:
			/* not reachable; lengths were validated above */
			return false;
		case 4:
			a = charptr[3];
			if (a < 0xBF)
			{
				charptr[3]++;
				break;
			}
			/* carry: wrap this byte to the lowest continuation value */
			charptr[3] = 0x80;
			/* FALL THRU */
		case 3:
			a = charptr[2];
			if (a < 0xBF)
			{
				charptr[2]++;
				break;
			}
			charptr[2] = 0x80;
			/* FALL THRU */
		case 2:
			a = charptr[1];

			/*
			 * The highest legal second byte depends on the lead byte:
			 * 0xED stops at 0x9F (the surrogate range follows) and 0xF4
			 * stops at 0x8F (U+10FFFF is the last code point).  At the
			 * limit the carry must go into the lead byte, e.g.
			 * ED 9F BF (U+D7FF) becomes EE 80 80 (U+E000).
			 */
			switch (*charptr)
			{
				case 0xED:
					limit = 0x9F;
					break;
				case 0xF4:
					limit = 0x8F;
					break;
				default:
					limit = 0xBF;
					break;
			}
			if (a < limit)
			{
				charptr[1]++;
				break;
			}
			charptr[1] = 0x80;
			/* FALL THRU */
		case 1:
			a = *charptr;

			/*
			 * 0x7F, 0xDF and 0xEF are the last lead bytes of the 1-, 2-
			 * and 3-byte forms, and no lead byte above 0xF4 is legal, so
			 * there is nothing of the same length to step to.
			 */
			if (a == 0x7F || a == 0xDF || a == 0xEF || a >= 0xF4)
			{
				memcpy(charptr, bak, length);
				return false;
			}
			charptr[0]++;
			break;
	}

	/* Check the result with pg_utf8_islegal as the last resort. */
	success = pg_utf8_islegal(charptr, length);
	if (!success)
		memcpy(charptr, bak, length);

	return success;
}

/*
 * Encode the code point `unicode` as a 1- to 4-byte UTF-8 style sequence
 * in utf8buf.  Only the bytes of the encoded length are written; the
 * caller must supply room for up to 4 bytes.
 *
 * No check is made for the surrogate range, so values such as 0xD800 are
 * encoded like any other 3-byte value.
 *
 * Values of 0x110000 and above cannot be encoded: a diagnostic is written
 * to stderr (keeping it out of the status lines on stdout) and the program
 * exits with a failure status.
 */
void uni2utf8(unsigned int unicode, unsigned char *utf8buf) {
  int i, len;

  /* Choose the length and the fixed high bits of the lead byte. */
  if (unicode < 0x80) {
    len = 1;
    *utf8buf = 0;
  } else if (unicode < 0x800) {
    len = 2;
    *utf8buf = 0xc0;
  } else if (unicode < 0x10000) {
    len = 3;
    *utf8buf = 0xe0;
  } else if (unicode < 0x110000) {
    len = 4;
    *utf8buf = 0xf0;
  } else {
    fprintf(stderr, "Unicode out of range: %x\n", unicode);
    exit(EXIT_FAILURE);
  }

  /* Fill continuation bytes from the end, six payload bits at a time. */
  for(i = len - 1 ; i > 0 ; i--) {
    utf8buf[i] = (0x80 | (unicode & 0x3f));
    unicode >>= 6;
  }
  /* Whatever bits remain belong in the lead byte. */
  *utf8buf |= unicode;
}

/*
 * Decode the sequence starting at utf8buf into a code point value.
 *
 * The lead byte alone selects how many bytes are read (1 to 4); the
 * following bytes are not validated.  A lead byte in 0x80..0xBF or at
 * 0xF8 and above yields the marker value 0xfffffff.
 */
unsigned int utf8tounicode(unsigned char *utf8buf) {
  const unsigned int lead = utf8buf[0];
  int value;

  if (lead < 0x80)
	return lead;

  /* Stray continuation byte, or a lead byte for more than 4 bytes. */
  if (lead < 0xc0 || lead >= 0xf8)
	return 0xfffffff;

  if (lead < 0xe0) {
	/* two bytes */
	value = (utf8buf[0] - 0xc0) << 6;
	value += utf8buf[1] - 0x80;
  } else if (lead < 0xf0) {
	/* three bytes */
	value = (utf8buf[0] - 0xe0) << 12;
	value += (utf8buf[1] - 0x80) << 6;
	value += utf8buf[2] - 0x80;
  } else {
	/* four bytes */
	value = (utf8buf[0] - 0xf0) << 18;
	value += (utf8buf[1] - 0x80) << 12;
	value += (utf8buf[2] - 0x80) << 6;
	value += utf8buf[3] - 0x80;
  }

  return value;
}


/*
 * Generic incrementer used as the reference in this test: repeatedly add
 * one to the last byte of the len-byte character at charptr until
 * pg_utf8_islegal() accepts the whole sequence.
 *
 * Returns true as soon as an accepted sequence is found.  Returns false,
 * with the last byte restored, when the last byte reaches 255 without
 * producing one, or when len is not positive.  Only the last byte is ever
 * modified.
 */
static bool pg_generic_charinc(unsigned char *charptr, int len)
{
       unsigned char *lastchar;
       unsigned char savelastchar;

       /* Without at least one byte there is no last byte to step. */
       if (len <= 0)
               return false;

       lastchar = charptr + len - 1;
       savelastchar = *lastchar;

       while (*lastchar < (unsigned char) 255)
       {
               (*lastchar)++;
               /* validity check modified for this test: uses pg_utf8_islegal */
               if (!pg_utf8_islegal(charptr, len))
                       continue;
               return true;
       }

       *lastchar = savelastchar;
       return false;
}

/*
 * Append printf-style formatted text to the NUL-terminated string already
 * stored in buf.  Returns whatever vsprintf() returns for the appended
 * part.  The write is not bounded; the caller must provide enough room.
 */
int scatf(char* buf, char* format, ...) {
  char *tail = buf;
  va_list ap;
  int written;

  /* Locate the current terminator; new text starts there. */
  while (*tail != '\0')
    tail++;

  va_start(ap, format);
  written = vsprintf(tail, format, ap);
  va_end(ap);

  return written;
}