1 /* GLIB - Library of useful routines for C programming 2 * Copyright (C) 2001 Matthias Clasen <matthiasc (at) poet.de> 3 * 4 * This library is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU Lesser General Public 6 * License as published by the Free Software Foundation; either 7 * version 2 of the License, or (at your option) any later version. 8 * 9 * This library is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 * Lesser General Public License for more details. 13 * 14 * You should have received a copy of the GNU Lesser General Public 15 * License along with this library; if not, write to the 16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 17 * Boston, MA 02111-1307, USA. 18 */ 19 20 #include "glib.h" 21 22 #define UNICODE_VALID(Char) \ 23 ((Char) < 0x110000 && \ 24 (((Char) & 0xFFFFF800) != 0xD800) && \ 25 ((Char) < 0xFDD0 || (Char) > 0xFDEF) && \ 26 ((Char) & 0xFFFE) != 0xFFFE) 27 28 29 30 static gboolean any_failed = FALSE; 31 32 struct { 33 const gchar *text; 34 gint max_len; 35 gint offset; 36 gboolean valid; 37 } test[] = { 38 /* some tests to check max_len handling */ 39 /* length 1 */ 40 { "abcde", -1, 5, TRUE }, 41 { "abcde", 3, 3, TRUE }, 42 { "abcde", 5, 5, TRUE }, 43 { "abcde", 7, 5, FALSE }, 44 /* length 2 */ 45 { "\xc2\xa9\xc2\xa9\xc2\xa9", -1, 6, TRUE }, 46 { "\xc2\xa9\xc2\xa9\xc2\xa9", 1, 0, FALSE }, 47 { "\xc2\xa9\xc2\xa9\xc2\xa9", 2, 2, TRUE }, 48 { "\xc2\xa9\xc2\xa9\xc2\xa9", 3, 2, FALSE }, 49 { "\xc2\xa9\xc2\xa9\xc2\xa9", 4, 4, TRUE }, 50 { "\xc2\xa9\xc2\xa9\xc2\xa9", 5, 4, FALSE }, 51 { "\xc2\xa9\xc2\xa9\xc2\xa9", 6, 6, TRUE }, 52 { "\xc2\xa9\xc2\xa9\xc2\xa9", 7, 6, FALSE }, 53 /* length 3 */ 54 { "\xe2\x89\xa0\xe2\x89\xa0", -1, 6, TRUE }, 55 { "\xe2\x89\xa0\xe2\x89\xa0", 1, 0, FALSE }, 56 { "\xe2\x89\xa0\xe2\x89\xa0", 2, 0, FALSE }, 57 { "\xe2\x89\xa0\xe2\x89\xa0", 3, 3, TRUE }, 58 { "\xe2\x89\xa0\xe2\x89\xa0", 4, 3, FALSE }, 59 { "\xe2\x89\xa0\xe2\x89\xa0", 5, 3, FALSE }, 60 { "\xe2\x89\xa0\xe2\x89\xa0", 6, 6, TRUE }, 61 { "\xe2\x89\xa0\xe2\x89\xa0", 7, 6, FALSE }, 62 63 /* examples from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt */ 64 /* greek 'kosme' */ 65 { "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5", -1, 11, TRUE }, 66 /* first sequence of each length */ 67 { "\x00", -1, 0, TRUE }, 68 { "\xc2\x80", -1, 2, TRUE }, 69 { "\xe0\xa0\x80", -1, 3, TRUE }, 70 { "\xf0\x90\x80\x80", -1, 4, TRUE }, 71 { "\xf8\x88\x80\x80\x80", -1, 0, FALSE }, 72 { "\xfc\x84\x80\x80\x80\x80", -1, 0, FALSE }, 73 /* last sequence of each length */ 74 { "\x7f", -1, 1, TRUE }, 75 { "\xdf\xbf", -1, 2, TRUE }, 76 { "\xef\xbf\xbf", -1, 0, FALSE }, 77 { "\xf7\xbf\xbf\xbf", -1, 0, FALSE }, 78 { "\xfb\xbf\xbf\xbf\xbf", -1, 0, FALSE }, 79 { "\xfd\xbf\xbf\xbf\xbf\xbf", -1, 0, FALSE }, 80 /* other boundary conditions */ 81 { "\xed\x9f\xbf", -1, 3, TRUE }, 82 { "\xee\x80\x80", -1, 3, TRUE }, 83 { "\xef\xbf\xbd", -1, 3, TRUE }, 84 { "\xf4\x8f\xbf\xbf", -1, 0, FALSE }, 85 { "\xf4\x90\x80\x80", -1, 0, FALSE }, 86 /* malformed sequences */ 87 /* continuation bytes */ 88 { "\x80", -1, 0, FALSE }, 89 { "\xbf", -1, 0, FALSE }, 90 { "\x80\xbf", -1, 0, FALSE }, 91 { "\x80\xbf\x80", -1, 0, FALSE }, 92 { "\x80\xbf\x80\xbf", -1, 0, FALSE }, 93 { "\x80\xbf\x80\xbf\x80", -1, 0, FALSE }, 94 { "\x80\xbf\x80\xbf\x80\xbf", -1, 0, FALSE }, 95 { "\x80\xbf\x80\xbf\x80\xbf\x80", -1, 0, FALSE }, 96 97 /* all possible continuation byte */ 98 { "\x80", -1, 0, FALSE }, 99 { "\x81", -1, 0, FALSE }, 100 { "\x82", -1, 0, FALSE }, 101 { "\x83", -1, 0, FALSE }, 102 { "\x84", -1, 0, FALSE }, 103 { "\x85", -1, 0, FALSE }, 104 { "\x86", -1, 0, FALSE }, 105 { "\x87", -1, 0, FALSE }, 106 { "\x88", -1, 0, FALSE }, 107 { "\x89", -1, 0, FALSE }, 108 { "\x8a", -1, 0, FALSE }, 109 { "\x8b", -1, 0, FALSE }, 110 { "\x8c", -1, 0, FALSE }, 111 { "\x8d", -1, 0, FALSE }, 112 { "\x8e", -1, 0, FALSE }, 113 { "\x8f", -1, 0, FALSE }, 114 { "\x90", -1, 0, FALSE }, 115 { "\x91", -1, 0, FALSE }, 116 { "\x92", -1, 0, FALSE }, 117 { "\x93", -1, 0, FALSE }, 118 { "\x94", -1, 0, FALSE }, 119 { "\x95", -1, 0, FALSE }, 120 { "\x96", -1, 0, FALSE }, 121 { "\x97", -1, 0, FALSE }, 122 { "\x98", -1, 0, FALSE }, 123 { "\x99", -1, 0, FALSE }, 124 { "\x9a", -1, 0, FALSE }, 125 { "\x9b", -1, 0, FALSE }, 126 { "\x9c", -1, 0, FALSE }, 127 { "\x9d", -1, 0, FALSE }, 128 { "\x9e", -1, 0, FALSE }, 129 { "\x9f", -1, 0, FALSE }, 130 { "\xa0", -1, 0, FALSE }, 131 { "\xa1", -1, 0, FALSE }, 132 { "\xa2", -1, 0, FALSE }, 133 { "\xa3", -1, 0, FALSE }, 134 { "\xa4", -1, 0, FALSE }, 135 { "\xa5", -1, 0, FALSE }, 136 { "\xa6", -1, 0, FALSE }, 137 { "\xa7", -1, 0, FALSE }, 138 { "\xa8", -1, 0, FALSE }, 139 { "\xa9", -1, 0, FALSE }, 140 { "\xaa", -1, 0, FALSE }, 141 { "\xab", -1, 0, FALSE }, 142 { "\xac", -1, 0, FALSE }, 143 { "\xad", -1, 0, FALSE }, 144 { "\xae", -1, 0, FALSE }, 145 { "\xaf", -1, 0, FALSE }, 146 { "\xb0", -1, 0, FALSE }, 147 { "\xb1", -1, 0, FALSE }, 148 { "\xb2", -1, 0, FALSE }, 149 { "\xb3", -1, 0, FALSE }, 150 { "\xb4", -1, 0, FALSE }, 151 { "\xb5", -1, 0, FALSE }, 152 { "\xb6", -1, 0, FALSE }, 153 { "\xb7", -1, 0, FALSE }, 154 { "\xb8", -1, 0, FALSE }, 155 { "\xb9", -1, 0, FALSE }, 156 { "\xba", -1, 0, FALSE }, 157 { "\xbb", -1, 0, FALSE }, 158 { "\xbc", -1, 0, FALSE }, 159 { "\xbd", -1, 0, FALSE }, 160 { "\xbe", -1, 0, FALSE }, 161 { "\xbf", -1, 0, FALSE }, 162 /* lone start characters */ 163 { "\xc0\x20", -1, 0, FALSE }, 164 { "\xc1\x20", -1, 0, FALSE }, 165 { "\xc2\x20", -1, 0, FALSE }, 166 { "\xc3\x20", -1, 0, FALSE }, 167 { "\xc4\x20", -1, 0, FALSE }, 168 { "\xc5\x20", -1, 0, FALSE }, 169 { "\xc6\x20", -1, 0, FALSE }, 170 { "\xc7\x20", -1, 0, FALSE }, 171 { "\xc8\x20", -1, 0, FALSE }, 172 { "\xc9\x20", -1, 0, FALSE }, 173 { "\xca\x20", -1, 0, FALSE }, 174 { "\xcb\x20", -1, 0, FALSE }, 175 { "\xcc\x20", -1, 0, FALSE }, 176 { "\xcd\x20", -1, 0, FALSE }, 177 { "\xce\x20", -1, 0, FALSE }, 178 { "\xcf\x20", -1, 0, FALSE }, 179 { "\xd0\x20", -1, 0, FALSE }, 180 { "\xd1\x20", -1, 0, FALSE }, 181 { "\xd2\x20", -1, 0, FALSE }, 182 { "\xd3\x20", -1, 0, FALSE }, 183 { "\xd4\x20", -1, 0, FALSE }, 184 { "\xd5\x20", -1, 0, FALSE }, 185 { "\xd6\x20", -1, 0, FALSE }, 186 { "\xd7\x20", -1, 0, FALSE }, 187 { "\xd8\x20", -1, 0, FALSE }, 188 { "\xd9\x20", -1, 0, FALSE }, 189 { "\xda\x20", -1, 0, FALSE }, 190 { "\xdb\x20", -1, 0, FALSE }, 191 { "\xdc\x20", -1, 0, FALSE }, 192 { "\xdd\x20", -1, 0, FALSE }, 193 { "\xde\x20", -1, 0, FALSE }, 194 { "\xdf\x20", -1, 0, FALSE }, 195 { "\xe0\x20", -1, 0, FALSE }, 196 { "\xe1\x20", -1, 0, FALSE }, 197 { "\xe2\x20", -1, 0, FALSE }, 198 { "\xe3\x20", -1, 0, FALSE }, 199 { "\xe4\x20", -1, 0, FALSE }, 200 { "\xe5\x20", -1, 0, FALSE }, 201 { "\xe6\x20", -1, 0, FALSE }, 202 { "\xe7\x20", -1, 0, FALSE }, 203 { "\xe8\x20", -1, 0, FALSE }, 204 { "\xe9\x20", -1, 0, FALSE }, 205 { "\xea\x20", -1, 0, FALSE }, 206 { "\xeb\x20", -1, 0, FALSE }, 207 { "\xec\x20", -1, 0, FALSE }, 208 { "\xed\x20", -1, 0, FALSE }, 209 { "\xee\x20", -1, 0, FALSE }, 210 { "\xef\x20", -1, 0, FALSE }, 211 { "\xf0\x20", -1, 0, FALSE }, 212 { "\xf1\x20", -1, 0, FALSE }, 213 { "\xf2\x20", -1, 0, FALSE }, 214 { "\xf3\x20", -1, 0, FALSE }, 215 { "\xf4\x20", -1, 0, FALSE }, 216 { "\xf5\x20", -1, 0, FALSE }, 217 { "\xf6\x20", -1, 0, FALSE }, 218 { "\xf7\x20", -1, 0, FALSE }, 219 { "\xf8\x20", -1, 0, FALSE }, 220 { "\xf9\x20", -1, 0, FALSE }, 221 { "\xfa\x20", -1, 0, FALSE }, 222 { "\xfb\x20", -1, 0, FALSE }, 223 { "\xfc\x20", -1, 0, FALSE }, 224 { "\xfd\x20", -1, 0, FALSE }, 225 /* missing continuation bytes */ 226 { "\x20\xc0", -1, 1, FALSE }, 227 { "\x20\xe0\x80", -1, 1, FALSE }, 228 { "\x20\xf0\x80\x80", -1, 1, FALSE }, 229 { "\x20\xf8\x80\x80\x80", -1, 1, FALSE }, 230 { "\x20\xfc\x80\x80\x80\x80", -1, 1, FALSE }, 231 { "\x20\xdf", -1, 1, FALSE }, 232 { "\x20\xef\xbf", -1, 1, FALSE }, 233 { "\x20\xf7\xbf\xbf", -1, 1, FALSE }, 234 { "\x20\xfb\xbf\xbf\xbf", -1, 1, FALSE }, 235 { "\x20\xfd\xbf\xbf\xbf\xbf", -1, 1, FALSE }, 236 /* impossible bytes */ 237 { "\x20\xfe\x20", -1, 1, FALSE }, 238 { "\x20\xff\x20", -1, 1, FALSE }, 239 /* overlong sequences */ 240 { "\x20\xc0\xaf\x20", -1, 1, FALSE }, 241 { "\x20\xe0\x80\xaf\x20", -1, 1, FALSE }, 242 { "\x20\xf0\x80\x80\xaf\x20", -1, 1, FALSE }, 243 { "\x20\xf8\x80\x80\x80\xaf\x20", -1, 1, FALSE }, 244 { "\x20\xfc\x80\x80\x80\x80\xaf\x20", -1, 1, FALSE }, 245 { "\x20\xc1\xbf\x20", -1, 1, FALSE }, 246 { "\x20\xe0\x9f\xbf\x20", -1, 1, FALSE }, 247 { "\x20\xf0\x8f\xbf\xbf\x20", -1, 1, FALSE }, 248 { "\x20\xf8\x87\xbf\xbf\xbf\x20", -1, 1, FALSE }, 249 { "\x20\xfc\x83\xbf\xbf\xbf\xbf\x20", -1, 1, FALSE }, 250 { "\x20\xc0\x80\x20", -1, 1, FALSE }, 251 { "\x20\xe0\x80\x80\x20", -1, 1, FALSE }, 252 { "\x20\xf0\x80\x80\x80\x20", -1, 1, FALSE }, 253 { "\x20\xf8\x80\x80\x80\x80\x20", -1, 1, FALSE }, 254 { "\x20\xfc\x80\x80\x80\x80\x80\x20", -1, 1, FALSE }, 255 /* illegal code positions */ 256 { "\x20\xed\xa0\x80\x20", -1, 1, FALSE }, 257 { "\x20\xed\xad\xbf\x20", -1, 1, FALSE }, 258 { "\x20\xed\xae\x80\x20", -1, 1, FALSE }, 259 { "\x20\xed\xaf\xbf\x20", -1, 1, FALSE }, 260 { "\x20\xed\xb0\x80\x20", -1, 1, FALSE }, 261 { "\x20\xed\xbe\x80\x20", -1, 1, FALSE }, 262 { "\x20\xed\xbf\xbf\x20", -1, 1, FALSE }, 263 { "\x20\xed\xa0\x80\xed\xb0\x80\x20", -1, 1, FALSE }, 264 { "\x20\xed\xa0\x80\xed\xbf\xbf\x20", -1, 1, FALSE }, 265 { "\x20\xed\xad\xbf\xed\xb0\x80\x20", -1, 1, FALSE }, 266 { "\x20\xed\xad\xbf\xed\xbf\xbf\x20", -1, 1, FALSE }, 267 { "\x20\xed\xae\x80\xed\xb0\x80\x20", -1, 1, FALSE }, 268 { "\x20\xed\xae\x80\xed\xbf\xbf\x20", -1, 1, FALSE }, 269 { "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -1, 1, FALSE }, 270 { "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -1, 1, FALSE }, 271 { "\x20\xef\xbf\xbe\x20", -1, 1, FALSE }, 272 { "\x20\xef\xbf\xbf\x20", -1, 1, FALSE }, 273 274 { NULL, } 275 }; 276 277 static void 278 do_test (gint index, 279 const gchar *text, 280 gint max_len, 281 gint offset, 282 gboolean valid) 283 { 284 const gchar *end; 285 gboolean result; 286 287 result = g_utf8_validate (text, max_len, &end); 288 289 if (result != valid || end - text != offset) 290 { 291 GString *str; 292 const gchar *p; 293 294 any_failed = TRUE; 295 296 str = g_string_new (0); 297 for (p = text; *p; p++) 298 g_string_append_printf (str, "\\x%02hhx", *p); 299 g_print ("%d: g_utf8_validate (\"%s\", %d) failed, " 300 "expected %s %d, got %s %d\n", 301 index, 302 str->str, max_len, 303 valid ? "TRUE" : "FALSE", offset, 304 result ? "TRUE" : "FALSE", (gint) (end - text)); 305 g_string_free (str, FALSE); 306 } 307 } 308 309 int 310 main (int argc, char *argv[]) 311 { 312 gint i; 313 314 for (i = 0; test[i].text; i++) 315 do_test (i, test[i].text, test[i].max_len, 316 test[i].offset, test[i].valid); 317 318 return any_failed ? 1 : 0; 319 } 320