
#include <ctype.h>
#include <langinfo.h>
#include <locale.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <wctype.h>

/*
 * Cached locale object for "C.UTF-8".  Starts out NULL and is created by
 * unicode_to_wchar_l() the first time a conversion is requested.
 */
locale_t c_utf8_locale = NULL;

/*
 * Encode the code point C as UTF-8 into UTF8STRING.
 *
 * Writes one to four bytes depending on the magnitude of C and nothing else:
 * no terminating NUL is stored, so a caller that wants a C string has to
 * supply a zero-filled buffer.  Values above 0xFFFF always take the
 * four-byte form, with only the low 21 bits of C being encoded.
 */
void
unicode_to_utf8(unsigned long int c, unsigned char *utf8string)
{
    /* lead-byte tag and payload mask, indexed by number of trailing bytes */
    static const unsigned char lead_tag[] = { 0x00, 0xC0, 0xE0, 0xF0 };
    static const unsigned char lead_mask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
    int trailing = 3;

    if (c <= 0x7F)
        trailing = 0;
    else if (c <= 0x7FF)
        trailing = 1;
    else if (c <= 0xFFFF)
        trailing = 2;

    /* the lead byte carries the most significant payload bits */
    utf8string[0] = lead_tag[trailing]
        | ((c >> (6 * trailing)) & lead_mask[trailing]);

    /* every continuation byte carries the next six bits, high to low */
    for (int pos = 1; pos <= trailing; pos++)
        utf8string[pos] = 0x80 | ((c >> (6 * (trailing - pos))) & 0x3F);
}

/*
 * mbstowcs() evaluated under an explicit locale object.
 *
 * Installs LOC with uselocale(), converts the multibyte string SRC into at
 * most N wide characters stored in DEST, then reinstalls the locale that was
 * active before the call.
 *
 * Returns whatever mbstowcs() returned, i.e. (size_t) -1 for an invalid
 * multibyte sequence.  (size_t) -1 is also returned, without attempting the
 * conversion, when LOC cannot be installed, so that the string is never
 * silently converted under the wrong locale.
 */
size_t
mbstowcs_l(wchar_t *dest, const char *src, size_t n, locale_t loc)
{
  locale_t save_locale = uselocale(loc);
  size_t result;

  /* uselocale() signals failure by returning (locale_t) 0 */
  if (save_locale == (locale_t) 0)
    return (size_t) -1;

  result = mbstowcs(dest, src, n);
  uselocale(save_locale);
  return result;
}

/*
 * Convert a Unicode code point into the wchar_t that mbstowcs() yields for
 * its UTF-8 encoding under the "C.UTF-8" locale.
 *
 * codepoint: the code point to convert; 0 is returned as 0 without going
 *            through the conversion.
 * locale:    not consulted.  The conversion always runs under the C.UTF-8
 *            locale object cached in c_utf8_locale, which is created here on
 *            first use.
 *
 * Returns the first wide character of the converted string.  If the C.UTF-8
 * locale is unavailable or the conversion fails, a diagnostic is printed and
 * the process exits with status 1.
 */
wchar_t
unicode_to_wchar_l(int codepoint, locale_t locale)
{
  /* zero-filled so that the encoded bytes are always NUL-terminated */
  unsigned char utf8_string[5] = {0};
  wchar_t wc[3] = {0};
  size_t n;

  (void) locale;

  if (codepoint == 0x000000)
    return 0x00;

  if (!c_utf8_locale)
    {
      c_utf8_locale = newlocale(LC_CTYPE_MASK, "C.UTF-8", NULL);
      if (!c_utf8_locale)
        {
          printf("locale \"C.UTF-8\" not available\n");
          exit(1);
        }
    }

  unicode_to_utf8(codepoint, utf8_string);

  /* dry run with a NULL destination: validates the bytes, stores nothing */
  n = mbstowcs_l(NULL, (const char *) utf8_string, 0, c_utf8_locale);

  /* the real conversion; its result is checked as well */
  if (n != (size_t) -1)
    n = mbstowcs_l(wc, (const char *) utf8_string, 2, c_utf8_locale);

  /* failure is reported as (size_t) -1, so compare against that type */
  if (n == (size_t) -1)
    {
      printf("could not convert 0x%06x to wchar_t: invalid byte sequence\n",
             (unsigned int) codepoint);
      printf("utf8_string: 0x%02x 0x%02x 0x%02x 0x%02x 0x%02x\n",
             utf8_string[0], utf8_string[1], utf8_string[2], utf8_string[3], utf8_string[4]);
      exit(1);
    }

  return wc[0];
}

int main(int argc, char *argv[])
{
  char *base_locale_string, *test_locale_string;
  locale_t base_locale, test_locale;

  if (argc < 2)
    {
      printf("Usage: ctype_test BASE_LOCALE TEST_LOCALE\n");
      exit(1);
    }

  base_locale_string = argv[1];
  test_locale_string = argv[2];

  base_locale = newlocale(LC_CTYPE_MASK, base_locale_string, NULL);
  if (!base_locale)
    {
      printf("base locale \"%s\" not available\n", base_locale_string);
      exit(1);
    }

  test_locale = newlocale(LC_CTYPE_MASK, test_locale_string, NULL);
  if (!test_locale)
    {
      printf("test locale \"%s\" not available\n", test_locale_string);
      exit(1);
    }

  /* test plain char variants */
  for (int i = -1; i <= 0x7f; i++)
    {
      bool base_isalpha = isalpha_l(i, base_locale);
      bool base_islower = islower_l(i, base_locale);
      bool base_isupper = isupper_l(i, base_locale);
      bool base_isdigit = isdigit_l(i, base_locale);
      bool base_isxdigit = isxdigit_l(i, base_locale);
      bool base_isalnum = isalnum_l(i, base_locale);
      bool base_isspace = isspace_l(i, base_locale);
      bool base_ispunct = ispunct_l(i, base_locale);
      bool base_isblank = isblank_l(i, base_locale);
      bool base_isprint = isprint_l(i, base_locale);
      bool base_isgraph = isgraph_l(i, base_locale);
      bool base_iscntrl = iscntrl_l(i, base_locale);

      bool test_isalpha = isalpha_l(i, test_locale);
      bool test_islower = islower_l(i, test_locale);
      bool test_isupper = isupper_l(i, test_locale);
      bool test_isdigit = isdigit_l(i, test_locale);
      bool test_isxdigit = isxdigit_l(i, test_locale);
      bool test_isalnum = isalnum_l(i, test_locale);
      bool test_isspace = isspace_l(i, test_locale);
      bool test_ispunct = ispunct_l(i, test_locale);
      bool test_isblank = isblank_l(i, test_locale);
      bool test_isprint = isprint_l(i, test_locale);
      bool test_isgraph = isgraph_l(i, test_locale);
      bool test_iscntrl = iscntrl_l(i, test_locale);

      if (base_isalpha != test_isalpha ||
	  base_islower != test_islower ||
	  base_isupper != test_isupper ||
	  base_isdigit != test_isdigit ||
	  base_isxdigit != test_isxdigit ||
	  base_isalnum != test_isalnum ||
	  base_isspace != test_isspace ||
	  base_ispunct != test_ispunct ||
	  base_isblank != test_isblank ||
	  base_isprint != test_isprint ||
	  base_isgraph != test_isgraph ||
	  base_iscntrl != test_iscntrl)
	{
	  printf("FAILURE (%s,%s) for plain char 0x%02x\n",
		 base_locale_string, test_locale_string, i);
	  exit(1);
	}
    }
  
  /* test wide variants */
  for (wchar_t unicode = 0; unicode < 0x10ffff; unicode++)
    {
      /* skip surrogates */
      if (unicode >= 0xd800 && unicode < 0xE000)
	continue;

      wchar_t wc1 = unicode_to_wchar_l(unicode, base_locale);
      wchar_t wc2 = unicode_to_wchar_l(unicode, test_locale);

      bool base_isalpha = iswalpha_l(wc1, base_locale);
      bool base_islower = iswlower_l(wc1, base_locale);
      bool base_isupper = iswupper_l(wc1, base_locale);
      bool base_isdigit = iswdigit_l(wc1, base_locale);
      bool base_isxdigit = iswxdigit_l(wc1, base_locale);
      bool base_isalnum = iswalnum_l(wc1, base_locale);
      bool base_isspace = iswspace_l(wc1, base_locale);
      bool base_ispunct = iswpunct_l(wc1, base_locale);
      bool base_isblank = iswblank_l(wc1, base_locale);
      bool base_isprint = iswprint_l(wc1, base_locale);
      bool base_isgraph = iswgraph_l(wc1, base_locale);
      bool base_iscntrl = iswcntrl_l(wc1, base_locale);

      bool test_isalpha = iswalpha_l(wc2, test_locale);
      bool test_islower = iswlower_l(wc2, test_locale);
      bool test_isupper = iswupper_l(wc2, test_locale);
      bool test_isdigit = iswdigit_l(wc2, test_locale);
      bool test_isxdigit = iswxdigit_l(wc2, test_locale);
      bool test_isalnum = iswalnum_l(wc2, test_locale);
      bool test_isspace = iswspace_l(wc2, test_locale);
      bool test_ispunct = iswpunct_l(wc2, test_locale);
      bool test_isblank = iswblank_l(wc2, test_locale);
      bool test_isprint = iswprint_l(wc2, test_locale);
      bool test_isgraph = iswgraph_l(wc2, test_locale);
      bool test_iscntrl = iswcntrl_l(wc2, test_locale);

      /* just a sanity check that works on some platforms, but not guaranteed */
      if (unicode != wc1 || unicode != wc2)
	printf("codepoint 0x%06x: 0x%06x 0x%06x\n", unicode, wc1, wc2);

      if (base_isalpha != test_isalpha ||
	  base_islower != test_islower ||
	  base_isupper != test_isupper ||
	  base_isdigit != test_isdigit ||
	  base_isxdigit != test_isxdigit ||
	  base_isalnum != test_isalnum ||
	  base_isspace != test_isspace ||
	  base_ispunct != test_ispunct ||
	  base_isblank != test_isblank ||
	  base_isprint != test_isprint ||
	  base_isgraph != test_isgraph ||
	  base_iscntrl != test_iscntrl)
	{
	  printf("FAILURE (%s,%s) for unicode char 0x%06x\n",
		 base_locale_string, test_locale_string, unicode);
	  exit(1);
	}
    }
  
  freelocale(base_locale);
  freelocale(test_locale);
}
