Home | History | Annotate | Download | only in tests
      1 #undef G_DISABLE_ASSERT
      2 #undef G_LOG_DOMAIN
      3 
      4 #include <stdarg.h>
      5 #include <stdio.h>
      6 #include <stdlib.h>
      7 #include <string.h>
      8 #include <glib.h>
      9 
/* Process exit code returned by main(); fail() ORs 1 into it whenever a
 * check does not hold, so it stays 0 only if every test record passes. */
static gint exit_status = 0;
     11 
     12 static void
     13 croak (char *format, ...)
     14 {
     15   va_list va;
     16 
     17   va_start (va, format);
     18   vfprintf (stderr, format, va);
     19   va_end (va);
     20 
     21   exit (1);
     22 }
     23 
     24 static void
     25 fail (char *format, ...)
     26 {
     27   va_list va;
     28 
     29   va_start (va, format);
     30   vfprintf (stderr, format, va);
     31   va_end (va);
     32 
     33   exit_status |= 1;
     34 }
     35 
/* Expected classification of one test string. main() parses it from the
 * status line of each record in utf8.txt; process() uses it to decide
 * which checks to run. */
typedef enum
{
  VALID,      /* g_utf8_validate must accept it; UCS-4 and UTF-16 round trips are checked */
  INCOMPLETE, /* validation must fail; conversion must report partial input */
  NOTUNICODE, /* validation must fail, but the UCS-4 round trip is still checked */
  OVERLONG,   /* validation must fail; no conversions are attempted */
  MALFORMED   /* validation must fail; no conversions are attempted */
} Status;
     44 
     45 static gboolean
     46 ucs4_equal (gunichar *a, gunichar *b)
     47 {
     48   while (*a && *b && (*a == *b))
     49     {
     50       a++;
     51       b++;
     52     }
     53 
     54   return (*a == *b);
     55 }
     56 
     57 static gboolean
     58 utf16_equal (gunichar2 *a, gunichar2 *b)
     59 {
     60   while (*a && *b && (*a == *b))
     61     {
     62       a++;
     63       b++;
     64     }
     65 
     66   return (*a == *b);
     67 }
     68 
     69 static gint
     70 utf16_count (gunichar2 *a)
     71 {
     72   gint result = 0;
     73 
     74   while (a[result])
     75     result++;
     76 
     77   return result;
     78 }
     79 
     80 static void
     81 process (gint      line,
     82 	 gchar    *utf8,
     83 	 Status    status,
     84 	 gunichar *ucs4,
     85 	 gint      ucs4_len)
     86 {
     87   const gchar *end;
     88   gboolean is_valid = g_utf8_validate (utf8, -1, &end);
     89   GError *error = NULL;
     90   glong items_read, items_written;
     91 
     92   switch (status)
     93     {
     94     case VALID:
     95       if (!is_valid)
     96 	{
     97 	  fail ("line %d: valid but g_utf8_validate returned FALSE\n", line);
     98 	  return;
     99 	}
    100       break;
    101     case NOTUNICODE:
    102     case INCOMPLETE:
    103     case OVERLONG:
    104     case MALFORMED:
    105       if (is_valid)
    106 	{
    107 	  fail ("line %d: invalid but g_utf8_validate returned TRUE\n", line);
    108 	  return;
    109 	}
    110       break;
    111     }
    112 
    113   if (status == INCOMPLETE)
    114     {
    115       gunichar *ucs4_result;
    116 
    117       ucs4_result = g_utf8_to_ucs4 (utf8, -1, NULL, NULL, &error);
    118 
    119       if (!error || !g_error_matches (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT))
    120 	{
    121 	  fail ("line %d: incomplete input not properly detected\n", line);
    122 	  return;
    123 	}
    124       g_clear_error (&error);
    125 
    126       ucs4_result = g_utf8_to_ucs4 (utf8, -1, &items_read, NULL, &error);
    127 
    128       if (!ucs4_result || items_read == strlen (utf8))
    129 	{
    130 	  fail ("line %d: incomplete input not properly detected\n", line);
    131 	  return;
    132 	}
    133 
    134       g_free (ucs4_result);
    135     }
    136 
    137   if (status == VALID || status == NOTUNICODE)
    138     {
    139       gunichar *ucs4_result;
    140       gchar *utf8_result;
    141 
    142       ucs4_result = g_utf8_to_ucs4 (utf8, -1, &items_read, &items_written, &error);
    143       if (!ucs4_result)
    144 	{
    145 	  fail ("line %d: conversion to ucs4 failed: %s\n", line, error->message);
    146 	  return;
    147 	}
    148 
    149       if (!ucs4_equal (ucs4_result, ucs4) ||
    150 	  items_read != strlen (utf8) ||
    151 	  items_written != ucs4_len)
    152 	{
    153 	  fail ("line %d: results of conversion to ucs4 do not match expected.\n", line);
    154 	  return;
    155 	}
    156 
    157       g_free (ucs4_result);
    158 
    159       ucs4_result = g_utf8_to_ucs4_fast (utf8, -1, &items_written);
    160 
    161       if (!ucs4_equal (ucs4_result, ucs4) ||
    162 	  items_written != ucs4_len)
    163 	{
    164 	  fail ("line %d: results of conversion to ucs4 do not match expected.\n", line);
    165 	  return;
    166 	}
    167 
    168       utf8_result = g_ucs4_to_utf8 (ucs4_result, -1, &items_read, &items_written, &error);
    169       if (!utf8_result)
    170 	{
    171 	  fail ("line %d: conversion back to utf8 failed: %s", line, error->message);
    172 	  return;
    173 	}
    174 
    175       if (strcmp (utf8_result, utf8) != 0 ||
    176 	  items_read != ucs4_len ||
    177 	  items_written != strlen (utf8))
    178 	{
    179 	  fail ("line %d: conversion back to utf8 did not match original\n", line);
    180 	  return;
    181 	}
    182 
    183       g_free (utf8_result);
    184       g_free (ucs4_result);
    185     }
    186 
    187   if (status == VALID)
    188     {
    189       gunichar2 *utf16_expected_tmp;
    190       gunichar2 *utf16_expected;
    191       gunichar2 *utf16_from_utf8;
    192       gunichar2 *utf16_from_ucs4;
    193       gunichar *ucs4_result;
    194       gsize bytes_written;
    195       gint n_chars;
    196       gchar *utf8_result;
    197 
    198 #if G_BYTE_ORDER == G_LITTLE_ENDIAN
    199 #define TARGET "UTF-16LE"
    200 #else
    201 #define TARGET "UTF-16"
    202 #endif
    203 
    204       if (!(utf16_expected_tmp = (gunichar2 *)g_convert (utf8, -1, TARGET, "UTF-8",
    205 							 NULL, &bytes_written, NULL)))
    206 	{
    207 	  fail ("line %d: could not convert to UTF-16 via g_convert\n", line);
    208 	  return;
    209 	}
    210 
    211       /* zero-terminate and remove BOM
    212        */
    213       n_chars = bytes_written / 2;
    214       if (utf16_expected_tmp[0] == 0xfeff) /* BOM */
    215 	{
    216 	  n_chars--;
    217 	  utf16_expected = g_new (gunichar2, n_chars + 1);
    218 	  memcpy (utf16_expected, utf16_expected_tmp + 1, sizeof(gunichar2) * n_chars);
    219 	}
    220       else if (utf16_expected_tmp[0] == 0xfffe) /* ANTI-BOM */
    221 	{
    222 	  fail ("line %d: conversion via iconv to \"UTF-16\" is not native-endian\n", line);
    223 	  return;
    224 	}
    225       else
    226 	{
    227 	  utf16_expected = g_new (gunichar2, n_chars + 1);
    228 	  memcpy (utf16_expected, utf16_expected_tmp, sizeof(gunichar2) * n_chars);
    229 	}
    230 
    231       utf16_expected[n_chars] = '\0';
    232 
    233       if (!(utf16_from_utf8 = g_utf8_to_utf16 (utf8, -1, &items_read, &items_written, &error)))
    234 	{
    235 	  fail ("line %d: conversion to ucs16 failed: %s\n", line, error->message);
    236 	  return;
    237 	}
    238 
    239       if (items_read != strlen (utf8) ||
    240 	  utf16_count (utf16_from_utf8) != items_written)
    241 	{
    242 	  fail ("line %d: length error in conversion to ucs16\n", line);
    243 	  return;
    244 	}
    245 
    246       if (!(utf16_from_ucs4 = g_ucs4_to_utf16 (ucs4, -1, &items_read, &items_written, &error)))
    247 	{
    248 	  fail ("line %d: conversion to ucs16 failed: %s\n", line, error->message);
    249 	  return;
    250 	}
    251 
    252       if (items_read != ucs4_len ||
    253 	  utf16_count (utf16_from_ucs4) != items_written)
    254 	{
    255 	  fail ("line %d: length error in conversion to ucs16\n", line);
    256 	  return;
    257 	}
    258 
    259       if (!utf16_equal (utf16_from_utf8, utf16_expected) ||
    260 	  !utf16_equal (utf16_from_ucs4, utf16_expected))
    261 	{
    262 	  fail ("line %d: results of conversion to ucs16 do not match\n", line);
    263 	  return;
    264 	}
    265 
    266       if (!(utf8_result = g_utf16_to_utf8 (utf16_from_utf8, -1, &items_read, &items_written, &error)))
    267 	{
    268 	  fail ("line %d: conversion back to utf8 failed: %s\n", line, error->message);
    269 	  return;
    270 	}
    271 
    272       if (items_read != utf16_count (utf16_from_utf8) ||
    273 	  items_written != strlen (utf8))
    274 	{
    275 	  fail ("line %d: length error in conversion from ucs16 to utf8\n", line);
    276 	  return;
    277 	}
    278 
    279       if (!(ucs4_result = g_utf16_to_ucs4 (utf16_from_ucs4, -1, &items_read, &items_written, &error)))
    280 	{
    281 	  fail ("line %d: conversion back to utf8/ucs4 failed\n", line);
    282 	  return;
    283 	}
    284 
    285       if (items_read != utf16_count (utf16_from_utf8) ||
    286 	  items_written != ucs4_len)
    287 	{
    288 	  fail ("line %d: length error in conversion from ucs16 to ucs4\n", line);
    289 	  return;
    290 	}
    291 
    292       if (strcmp (utf8, utf8_result) != 0 ||
    293 	  !ucs4_equal (ucs4, ucs4_result))
    294 	{
    295 	  fail ("line %d: conversion back to utf8/ucs4 did not match original\n", line);
    296 	  return;
    297 	}
    298 
    299       g_free (utf16_expected_tmp);
    300       g_free (utf16_expected);
    301       g_free (utf16_from_utf8);
    302       g_free (utf16_from_ucs4);
    303       g_free (utf8_result);
    304       g_free (ucs4_result);
    305     }
    306 }
    307 
    308 int
    309 main (int argc, char **argv)
    310 {
    311   gchar *srcdir = getenv ("srcdir");
    312   gchar *testfile;
    313   gchar *contents;
    314   GError *error = NULL;
    315   gchar *p, *end;
    316   char *tmp;
    317   gint state = 0;
    318   gint line = 1;
    319   gint start_line = 0;		/* Quiet GCC */
    320   gchar *utf8 = NULL;		/* Quiet GCC */
    321   GArray *ucs4;
    322   Status status = VALID;	/* Quiet GCC */
    323 
    324   if (!srcdir)
    325     srcdir = ".";
    326 
    327   testfile = g_strconcat (srcdir, G_DIR_SEPARATOR_S "utf8.txt", NULL);
    328 
    329   g_file_get_contents (testfile, &contents, NULL, &error);
    330   if (error)
    331     croak ("Cannot open utf8.txt: %s", error->message);
    332 
    333   ucs4 = g_array_new (TRUE, FALSE, sizeof(gunichar));
    334 
    335   p = contents;
    336 
    337   /* Loop over lines */
    338   while (*p)
    339     {
    340       while (*p && (*p == ' ' || *p == '\t'))
    341 	p++;
    342 
    343       end = p;
    344       while (*end && (*end != '\r' && *end != '\n'))
    345 	end++;
    346 
    347       if (!*p || *p == '#' || *p == '\r' || *p == '\n')
    348 	goto next_line;
    349 
    350       tmp = g_strstrip (g_strndup (p, end - p));
    351 
    352       switch (state)
    353 	{
    354 	case 0:
    355 	  /* UTF-8 string */
    356 	  start_line = line;
    357 	  utf8 = tmp;
    358 	  tmp = NULL;
    359 	  break;
    360 
    361 	case 1:
    362 	  /* Status */
    363 	  if (!strcmp (tmp, "VALID"))
    364 	    status = VALID;
    365 	  else if (!strcmp (tmp, "INCOMPLETE"))
    366 	    status = INCOMPLETE;
    367 	  else if (!strcmp (tmp, "NOTUNICODE"))
    368 	    status = NOTUNICODE;
    369 	  else if (!strcmp (tmp, "OVERLONG"))
    370 	    status = OVERLONG;
    371 	  else if (!strcmp (tmp, "MALFORMED"))
    372 	    status = MALFORMED;
    373 	  else
    374 	    croak ("Invalid status on line %d\n", line);
    375 
    376 	  if (status != VALID && status != NOTUNICODE)
    377 	    state++;		/* No UCS-4 data */
    378 
    379 	  break;
    380 
    381 	case 2:
    382 	  /* UCS-4 version */
    383 
    384 	  p = strtok (tmp, " \t");
    385 	  while (p)
    386 	    {
    387 	      gchar *endptr;
    388 
    389 	      gunichar ch = strtoul (p, &endptr, 16);
    390 	      if (*endptr != '\0')
    391 		croak ("Invalid UCS-4 character on line %d\n", line);
    392 
    393 	      g_array_append_val (ucs4, ch);
    394 
    395 	      p = strtok (NULL, " \t");
    396 	    }
    397 
    398 	  break;
    399 	}
    400 
    401       g_free (tmp);
    402       state = (state + 1) % 3;
    403 
    404       if (state == 0)
    405 	{
    406 	  process (start_line, utf8, status, (gunichar *)ucs4->data, ucs4->len);
    407 	  g_array_set_size (ucs4, 0);
    408 	  g_free (utf8);
    409 	}
    410 
    411     next_line:
    412       p = end;
    413       if (*p && *p == '\r')
    414 	p++;
    415       if (*p && *p == '\n')
    416 	p++;
    417 
    418       line++;
    419     }
    420 
    421   return exit_status;
    422 }
    423