Home | History | Annotate | Download | only in tests
      1 /* GLIB - Library of useful routines for C programming
      2  * Copyright (C) 2001 Matthias Clasen <matthiasc (at) poet.de>
      3  *
      4  * This library is free software; you can redistribute it and/or
      5  * modify it under the terms of the GNU Lesser General Public
      6  * License as published by the Free Software Foundation; either
      7  * version 2 of the License, or (at your option) any later version.
      8  *
      9  * This library is distributed in the hope that it will be useful,
     10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     12  * Lesser General Public License for more details.
     13  *
     14  * You should have received a copy of the GNU Lesser General Public
     15  * License along with this library; if not, write to the
     16  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
     17  * Boston, MA 02111-1307, USA.
     18  */
     19 
     20 #include "glib.h"
     21 
     22 #define UNICODE_VALID(Char)                   \
     23     ((Char) < 0x110000 &&                     \
     24      (((Char) & 0xFFFFF800) != 0xD800) &&     \
     25      ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&  \
     26      ((Char) & 0xFFFE) != 0xFFFE)
     27 
     28 
     29 
     30 static gboolean any_failed = FALSE;
     31 
     32 struct {
     33   const gchar *text;
     34   gint max_len;
     35   gint offset;
     36   gboolean valid;
     37 } test[] = {
     38   /* some tests to check max_len handling */
     39   /* length 1 */
     40   { "abcde", -1, 5, TRUE },
     41   { "abcde", 3, 3, TRUE },
     42   { "abcde", 5, 5, TRUE },
     43   { "abcde", 7, 5, FALSE },
     44   /* length 2 */
     45   { "\xc2\xa9\xc2\xa9\xc2\xa9", -1, 6, TRUE },
     46   { "\xc2\xa9\xc2\xa9\xc2\xa9",  1, 0, FALSE },
     47   { "\xc2\xa9\xc2\xa9\xc2\xa9",  2, 2, TRUE },
     48   { "\xc2\xa9\xc2\xa9\xc2\xa9",  3, 2, FALSE },
     49   { "\xc2\xa9\xc2\xa9\xc2\xa9",  4, 4, TRUE },
     50   { "\xc2\xa9\xc2\xa9\xc2\xa9",  5, 4, FALSE },
     51   { "\xc2\xa9\xc2\xa9\xc2\xa9",  6, 6, TRUE },
     52   { "\xc2\xa9\xc2\xa9\xc2\xa9",  7, 6, FALSE },
     53   /* length 3 */
     54   { "\xe2\x89\xa0\xe2\x89\xa0", -1, 6, TRUE },
     55   { "\xe2\x89\xa0\xe2\x89\xa0",  1, 0, FALSE },
     56   { "\xe2\x89\xa0\xe2\x89\xa0",  2, 0, FALSE },
     57   { "\xe2\x89\xa0\xe2\x89\xa0",  3, 3, TRUE },
     58   { "\xe2\x89\xa0\xe2\x89\xa0",  4, 3, FALSE },
     59   { "\xe2\x89\xa0\xe2\x89\xa0",  5, 3, FALSE },
     60   { "\xe2\x89\xa0\xe2\x89\xa0",  6, 6, TRUE },
     61   { "\xe2\x89\xa0\xe2\x89\xa0",  7, 6, FALSE },
     62 
     63   /* examples from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt */
     64   /* greek 'kosme' */
     65   { "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5", -1, 11, TRUE },
     66   /* first sequence of each length */
     67   { "\x00", -1, 0, TRUE },
     68   { "\xc2\x80", -1, 2, TRUE },
     69   { "\xe0\xa0\x80", -1, 3, TRUE },
     70   { "\xf0\x90\x80\x80", -1, 4, TRUE },
     71   { "\xf8\x88\x80\x80\x80", -1, 0, FALSE },
     72   { "\xfc\x84\x80\x80\x80\x80", -1, 0, FALSE },
     73   /* last sequence of each length */
     74   { "\x7f", -1, 1, TRUE },
     75   { "\xdf\xbf", -1, 2, TRUE },
     76   { "\xef\xbf\xbf", -1, 0, FALSE },
     77   { "\xf7\xbf\xbf\xbf", -1, 0, FALSE },
     78   { "\xfb\xbf\xbf\xbf\xbf", -1, 0, FALSE },
     79   { "\xfd\xbf\xbf\xbf\xbf\xbf", -1, 0, FALSE },
     80   /* other boundary conditions */
     81   { "\xed\x9f\xbf", -1, 3, TRUE },
     82   { "\xee\x80\x80", -1, 3, TRUE },
     83   { "\xef\xbf\xbd", -1, 3, TRUE },
     84   { "\xf4\x8f\xbf\xbf", -1, 0, FALSE },
     85   { "\xf4\x90\x80\x80", -1, 0, FALSE },
     86   /* malformed sequences */
     87   /* continuation bytes */
     88   { "\x80", -1, 0, FALSE },
     89   { "\xbf", -1, 0, FALSE },
     90   { "\x80\xbf", -1, 0, FALSE },
     91   { "\x80\xbf\x80", -1, 0, FALSE },
     92   { "\x80\xbf\x80\xbf", -1, 0, FALSE },
     93   { "\x80\xbf\x80\xbf\x80", -1, 0, FALSE },
     94   { "\x80\xbf\x80\xbf\x80\xbf", -1, 0, FALSE },
     95   { "\x80\xbf\x80\xbf\x80\xbf\x80", -1, 0, FALSE },
     96 
     97   /* all possible continuation byte */
     98   { "\x80", -1, 0, FALSE },
     99   { "\x81", -1, 0, FALSE },
    100   { "\x82", -1, 0, FALSE },
    101   { "\x83", -1, 0, FALSE },
    102   { "\x84", -1, 0, FALSE },
    103   { "\x85", -1, 0, FALSE },
    104   { "\x86", -1, 0, FALSE },
    105   { "\x87", -1, 0, FALSE },
    106   { "\x88", -1, 0, FALSE },
    107   { "\x89", -1, 0, FALSE },
    108   { "\x8a", -1, 0, FALSE },
    109   { "\x8b", -1, 0, FALSE },
    110   { "\x8c", -1, 0, FALSE },
    111   { "\x8d", -1, 0, FALSE },
    112   { "\x8e", -1, 0, FALSE },
    113   { "\x8f", -1, 0, FALSE },
    114   { "\x90", -1, 0, FALSE },
    115   { "\x91", -1, 0, FALSE },
    116   { "\x92", -1, 0, FALSE },
    117   { "\x93", -1, 0, FALSE },
    118   { "\x94", -1, 0, FALSE },
    119   { "\x95", -1, 0, FALSE },
    120   { "\x96", -1, 0, FALSE },
    121   { "\x97", -1, 0, FALSE },
    122   { "\x98", -1, 0, FALSE },
    123   { "\x99", -1, 0, FALSE },
    124   { "\x9a", -1, 0, FALSE },
    125   { "\x9b", -1, 0, FALSE },
    126   { "\x9c", -1, 0, FALSE },
    127   { "\x9d", -1, 0, FALSE },
    128   { "\x9e", -1, 0, FALSE },
    129   { "\x9f", -1, 0, FALSE },
    130   { "\xa0", -1, 0, FALSE },
    131   { "\xa1", -1, 0, FALSE },
    132   { "\xa2", -1, 0, FALSE },
    133   { "\xa3", -1, 0, FALSE },
    134   { "\xa4", -1, 0, FALSE },
    135   { "\xa5", -1, 0, FALSE },
    136   { "\xa6", -1, 0, FALSE },
    137   { "\xa7", -1, 0, FALSE },
    138   { "\xa8", -1, 0, FALSE },
    139   { "\xa9", -1, 0, FALSE },
    140   { "\xaa", -1, 0, FALSE },
    141   { "\xab", -1, 0, FALSE },
    142   { "\xac", -1, 0, FALSE },
    143   { "\xad", -1, 0, FALSE },
    144   { "\xae", -1, 0, FALSE },
    145   { "\xaf", -1, 0, FALSE },
    146   { "\xb0", -1, 0, FALSE },
    147   { "\xb1", -1, 0, FALSE },
    148   { "\xb2", -1, 0, FALSE },
    149   { "\xb3", -1, 0, FALSE },
    150   { "\xb4", -1, 0, FALSE },
    151   { "\xb5", -1, 0, FALSE },
    152   { "\xb6", -1, 0, FALSE },
    153   { "\xb7", -1, 0, FALSE },
    154   { "\xb8", -1, 0, FALSE },
    155   { "\xb9", -1, 0, FALSE },
    156   { "\xba", -1, 0, FALSE },
    157   { "\xbb", -1, 0, FALSE },
    158   { "\xbc", -1, 0, FALSE },
    159   { "\xbd", -1, 0, FALSE },
    160   { "\xbe", -1, 0, FALSE },
    161   { "\xbf", -1, 0, FALSE },
    162   /* lone start characters */
    163   { "\xc0\x20", -1, 0, FALSE },
    164   { "\xc1\x20", -1, 0, FALSE },
    165   { "\xc2\x20", -1, 0, FALSE },
    166   { "\xc3\x20", -1, 0, FALSE },
    167   { "\xc4\x20", -1, 0, FALSE },
    168   { "\xc5\x20", -1, 0, FALSE },
    169   { "\xc6\x20", -1, 0, FALSE },
    170   { "\xc7\x20", -1, 0, FALSE },
    171   { "\xc8\x20", -1, 0, FALSE },
    172   { "\xc9\x20", -1, 0, FALSE },
    173   { "\xca\x20", -1, 0, FALSE },
    174   { "\xcb\x20", -1, 0, FALSE },
    175   { "\xcc\x20", -1, 0, FALSE },
    176   { "\xcd\x20", -1, 0, FALSE },
    177   { "\xce\x20", -1, 0, FALSE },
    178   { "\xcf\x20", -1, 0, FALSE },
    179   { "\xd0\x20", -1, 0, FALSE },
    180   { "\xd1\x20", -1, 0, FALSE },
    181   { "\xd2\x20", -1, 0, FALSE },
    182   { "\xd3\x20", -1, 0, FALSE },
    183   { "\xd4\x20", -1, 0, FALSE },
    184   { "\xd5\x20", -1, 0, FALSE },
    185   { "\xd6\x20", -1, 0, FALSE },
    186   { "\xd7\x20", -1, 0, FALSE },
    187   { "\xd8\x20", -1, 0, FALSE },
    188   { "\xd9\x20", -1, 0, FALSE },
    189   { "\xda\x20", -1, 0, FALSE },
    190   { "\xdb\x20", -1, 0, FALSE },
    191   { "\xdc\x20", -1, 0, FALSE },
    192   { "\xdd\x20", -1, 0, FALSE },
    193   { "\xde\x20", -1, 0, FALSE },
    194   { "\xdf\x20", -1, 0, FALSE },
    195   { "\xe0\x20", -1, 0, FALSE },
    196   { "\xe1\x20", -1, 0, FALSE },
    197   { "\xe2\x20", -1, 0, FALSE },
    198   { "\xe3\x20", -1, 0, FALSE },
    199   { "\xe4\x20", -1, 0, FALSE },
    200   { "\xe5\x20", -1, 0, FALSE },
    201   { "\xe6\x20", -1, 0, FALSE },
    202   { "\xe7\x20", -1, 0, FALSE },
    203   { "\xe8\x20", -1, 0, FALSE },
    204   { "\xe9\x20", -1, 0, FALSE },
    205   { "\xea\x20", -1, 0, FALSE },
    206   { "\xeb\x20", -1, 0, FALSE },
    207   { "\xec\x20", -1, 0, FALSE },
    208   { "\xed\x20", -1, 0, FALSE },
    209   { "\xee\x20", -1, 0, FALSE },
    210   { "\xef\x20", -1, 0, FALSE },
    211   { "\xf0\x20", -1, 0, FALSE },
    212   { "\xf1\x20", -1, 0, FALSE },
    213   { "\xf2\x20", -1, 0, FALSE },
    214   { "\xf3\x20", -1, 0, FALSE },
    215   { "\xf4\x20", -1, 0, FALSE },
    216   { "\xf5\x20", -1, 0, FALSE },
    217   { "\xf6\x20", -1, 0, FALSE },
    218   { "\xf7\x20", -1, 0, FALSE },
    219   { "\xf8\x20", -1, 0, FALSE },
    220   { "\xf9\x20", -1, 0, FALSE },
    221   { "\xfa\x20", -1, 0, FALSE },
    222   { "\xfb\x20", -1, 0, FALSE },
    223   { "\xfc\x20", -1, 0, FALSE },
    224   { "\xfd\x20", -1, 0, FALSE },
    225   /* missing continuation bytes */
    226   { "\x20\xc0", -1, 1, FALSE },
    227   { "\x20\xe0\x80", -1, 1, FALSE },
    228   { "\x20\xf0\x80\x80", -1, 1, FALSE },
    229   { "\x20\xf8\x80\x80\x80", -1, 1, FALSE },
    230   { "\x20\xfc\x80\x80\x80\x80", -1, 1, FALSE },
    231   { "\x20\xdf", -1, 1, FALSE },
    232   { "\x20\xef\xbf", -1, 1, FALSE },
    233   { "\x20\xf7\xbf\xbf", -1, 1, FALSE },
    234   { "\x20\xfb\xbf\xbf\xbf", -1, 1, FALSE },
    235   { "\x20\xfd\xbf\xbf\xbf\xbf", -1, 1, FALSE },
    236   /* impossible bytes */
    237   { "\x20\xfe\x20", -1, 1, FALSE },
    238   { "\x20\xff\x20", -1, 1, FALSE },
    239   /* overlong sequences */
    240   { "\x20\xc0\xaf\x20", -1, 1, FALSE },
    241   { "\x20\xe0\x80\xaf\x20", -1, 1, FALSE },
    242   { "\x20\xf0\x80\x80\xaf\x20", -1, 1, FALSE },
    243   { "\x20\xf8\x80\x80\x80\xaf\x20", -1, 1, FALSE },
    244   { "\x20\xfc\x80\x80\x80\x80\xaf\x20", -1, 1, FALSE },
    245   { "\x20\xc1\xbf\x20", -1, 1, FALSE },
    246   { "\x20\xe0\x9f\xbf\x20", -1, 1, FALSE },
    247   { "\x20\xf0\x8f\xbf\xbf\x20", -1, 1, FALSE },
    248   { "\x20\xf8\x87\xbf\xbf\xbf\x20", -1, 1, FALSE },
    249   { "\x20\xfc\x83\xbf\xbf\xbf\xbf\x20", -1, 1, FALSE },
    250   { "\x20\xc0\x80\x20", -1, 1, FALSE },
    251   { "\x20\xe0\x80\x80\x20", -1, 1, FALSE },
    252   { "\x20\xf0\x80\x80\x80\x20", -1, 1, FALSE },
    253   { "\x20\xf8\x80\x80\x80\x80\x20", -1, 1, FALSE },
    254   { "\x20\xfc\x80\x80\x80\x80\x80\x20", -1, 1, FALSE },
    255   /* illegal code positions */
    256   { "\x20\xed\xa0\x80\x20", -1, 1, FALSE },
    257   { "\x20\xed\xad\xbf\x20", -1, 1, FALSE },
    258   { "\x20\xed\xae\x80\x20", -1, 1, FALSE },
    259   { "\x20\xed\xaf\xbf\x20", -1, 1, FALSE },
    260   { "\x20\xed\xb0\x80\x20", -1, 1, FALSE },
    261   { "\x20\xed\xbe\x80\x20", -1, 1, FALSE },
    262   { "\x20\xed\xbf\xbf\x20", -1, 1, FALSE },
    263   { "\x20\xed\xa0\x80\xed\xb0\x80\x20", -1, 1, FALSE },
    264   { "\x20\xed\xa0\x80\xed\xbf\xbf\x20", -1, 1, FALSE },
    265   { "\x20\xed\xad\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
    266   { "\x20\xed\xad\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },
    267   { "\x20\xed\xae\x80\xed\xb0\x80\x20", -1, 1, FALSE },
    268   { "\x20\xed\xae\x80\xed\xbf\xbf\x20", -1, 1, FALSE },
    269   { "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
    270   { "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },
    271   { "\x20\xef\xbf\xbe\x20", -1, 1, FALSE },
    272   { "\x20\xef\xbf\xbf\x20", -1, 1, FALSE },
    273 
    274   { NULL, }
    275 };
    276 
    277 static void
    278 do_test (gint         index,
    279 	 const gchar *text,
    280 	 gint         max_len,
    281 	 gint         offset,
    282 	 gboolean     valid)
    283 {
    284   const gchar *end;
    285   gboolean result;
    286 
    287   result = g_utf8_validate (text, max_len, &end);
    288 
    289   if (result != valid || end - text != offset)
    290     {
    291       GString *str;
    292       const gchar *p;
    293 
    294       any_failed = TRUE;
    295 
    296       str = g_string_new (0);
    297       for (p = text; *p; p++)
    298 	g_string_append_printf (str, "\\x%02hhx", *p);
    299       g_print ("%d: g_utf8_validate (\"%s\", %d) failed, "
    300 	       "expected %s %d, got %s %d\n",
    301 	       index,
    302 	       str->str, max_len,
    303 	       valid ? "TRUE" : "FALSE", offset,
    304 	       result ? "TRUE" : "FALSE", (gint) (end - text));
    305       g_string_free (str, FALSE);
    306     }
    307 }
    308 
    309 int
    310 main (int argc, char *argv[])
    311 {
    312   gint i;
    313 
    314   for (i = 0; test[i].text; i++)
    315     do_test (i, test[i].text, test[i].max_len,
    316 	     test[i].offset, test[i].valid);
    317 
    318   return any_failed ? 1 : 0;
    319 }
    320