1 /* 2 **************************************************************************** 3 * Copyright (c) 2005-2009, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 **************************************************************************** 6 */ 7 8 #include "unicode/utypes.h" 9 10 #include "unicode/ucsdet.h" 11 #include "unicode/ucnv.h" 12 #include "unicode/ustring.h" 13 14 #include "cintltst.h" 15 16 #include <stdlib.h> 17 #include <string.h> 18 19 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0])) 20 21 #define NEW_ARRAY(type,count) (type *) malloc((count) * sizeof(type)) 22 #define DELETE_ARRAY(array) free(array) 23 24 static void TestConstruction(void); 25 static void TestUTF8(void); 26 static void TestUTF16(void); 27 static void TestC1Bytes(void); 28 static void TestInputFilter(void); 29 static void TestChaining(void); 30 static void TestBufferOverflow(void); 31 static void TestIBM424(void); 32 static void TestIBM420(void); 33 34 void addUCsdetTest(TestNode** root); 35 36 void addUCsdetTest(TestNode** root) 37 { 38 addTest(root, &TestConstruction, "ucsdetst/TestConstruction"); 39 addTest(root, &TestUTF8, "ucsdetst/TestUTF8"); 40 addTest(root, &TestUTF16, "ucsdetst/TestUTF16"); 41 addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes"); 42 addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter"); 43 addTest(root, &TestChaining, "ucsdetst/TestErrorChaining"); 44 addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow"); 45 #if !UCONFIG_NO_LEGACY_CONVERSION 46 addTest(root, &TestIBM424, "ucsdetst/TestIBM424"); 47 addTest(root, &TestIBM420, "ucsdetst/TestIBM420"); 48 #endif 49 } 50 51 static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv) 52 { 53 UErrorCode status; 54 char buffer[1024]; 55 char *dest, *destLimit = buffer + sizeof(buffer); 56 const UChar *srcLimit = src + length; 57 int32_t result = 0; 58 59 do { 60 dest = buffer; 61 status = U_ZERO_ERROR; 62 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status); 63 result += (int32_t) (dest - buffer); 64 } while (status == U_BUFFER_OVERFLOW_ERROR); 65 66 return result; 67 } 68 69 static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength) 70 { 71 UErrorCode status = U_ZERO_ERROR; 72 UConverter *cnv = ucnv_open(codepage, &status); 73 int32_t byteCount = preflight(src, length, cnv); 74 const UChar *srcLimit = src + length; 75 char *bytes = NEW_ARRAY(char, byteCount + 1); 76 char *dest = bytes, *destLimit = bytes + byteCount + 1; 77 78 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status); 79 ucnv_close(cnv); 80 81 *byteLength = byteCount; 82 return bytes; 83 } 84 85 static void freeBytes(char *bytes) 86 { 87 DELETE_ARRAY(bytes); 88 } 89 90 static void TestConstruction(void) 91 { 92 UErrorCode status = U_ZERO_ERROR; 93 UCharsetDetector *csd = ucsdet_open(&status); 94 UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status); 95 const char *name; 96 int32_t count = uenum_count(e, &status); 97 int32_t i, length; 98 99 for(i = 0; i < count; i += 1) { 100 name = uenum_next(e, &length, &status); 101 102 if(name == NULL || length <= 0) { 103 log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n"); 104 } 105 } 106 /* one past the list of all names must return NULL */ 107 name = uenum_next(e, &length, &status); 108 if(name != NULL || length != 0 || U_FAILURE(status)) { 109 log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n"); 110 } 111 112 uenum_close(e); 113 ucsdet_close(csd); 114 } 115 116 static void TestUTF8(void) 117 { 118 UErrorCode status = U_ZERO_ERROR; 119 static const char ss[] = "This is a string with some non-ascii characters that will " 120 "be converted to UTF-8, then shoved through the detection process. " 121 "\\u0391\\u0392\\u0393\\u0394\\u0395" 122 "Sure would be nice if our source could contain Unicode directly!"; 123 int32_t byteLength = 0, sLength = 0, dLength = 0; 124 UChar s[sizeof(ss)]; 125 char *bytes; 126 UCharsetDetector *csd = ucsdet_open(&status); 127 const UCharsetMatch *match; 128 UChar detected[sizeof(ss)]; 129 130 sLength = u_unescape(ss, s, sizeof(ss)); 131 bytes = extractBytes(s, sLength, "UTF-8", &byteLength); 132 133 ucsdet_setText(csd, bytes, byteLength, &status); 134 if (U_FAILURE(status)) { 135 log_err("status is %s\n", u_errorName(status)); 136 goto bail; 137 } 138 139 match = ucsdet_detect(csd, &status); 140 141 if (match == NULL) { 142 log_err("Detection failure for UTF-8: got no matches.\n"); 143 goto bail; 144 } 145 146 dLength = ucsdet_getUChars(match, detected, sLength, &status); 147 148 if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) { 149 log_err("Round-trip test failed!\n"); 150 } 151 152 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */ 153 154 bail: 155 freeBytes(bytes); 156 ucsdet_close(csd); 157 } 158 159 static void TestUTF16(void) 160 { 161 UErrorCode status = U_ZERO_ERROR; 162 /* Notice the BOM on the start of this string */ 163 static const UChar chars[] = { 164 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C, 165 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a, 166 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628, 167 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646, 168 0x064a, 0x062a, 0x0000}; 169 int32_t beLength = 0, leLength = 0, cLength = ARRAY_SIZE(chars); 170 char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength); 171 char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength); 172 UCharsetDetector *csd = ucsdet_open(&status); 173 const UCharsetMatch *match; 174 const char *name; 175 int32_t conf; 176 177 ucsdet_setText(csd, beBytes, beLength, &status); 178 match = ucsdet_detect(csd, &status); 179 180 if (match == NULL) { 181 log_err("Encoding detection failure for UTF-16BE: got no matches.\n"); 182 goto try_le; 183 } 184 185 name = ucsdet_getName(match, &status); 186 conf = ucsdet_getConfidence(match, &status); 187 188 if (strcmp(name, "UTF-16BE") != 0) { 189 log_err("Encoding detection failure for UTF-16BE: got %s\n", name); 190 } 191 192 if (conf != 100) { 193 log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf); 194 } 195 196 try_le: 197 ucsdet_setText(csd, leBytes, leLength, &status); 198 match = ucsdet_detect(csd, &status); 199 200 if (match == NULL) { 201 log_err("Encoding detection failure for UTF-16LE: got no matches.\n"); 202 goto bail; 203 } 204 205 name = ucsdet_getName(match, &status); 206 conf = ucsdet_getConfidence(match, &status); 207 208 209 if (strcmp(name, "UTF-16LE") != 0) { 210 log_err("Enconding detection failure for UTF-16LE: got %s\n", name); 211 } 212 213 if (conf != 100) { 214 log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf); 215 } 216 217 bail: 218 freeBytes(leBytes); 219 freeBytes(beBytes); 220 ucsdet_close(csd); 221 } 222 223 static void TestC1Bytes(void) 224 { 225 #if !UCONFIG_NO_LEGACY_CONVERSION 226 UErrorCode status = U_ZERO_ERROR; 227 static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly."; 228 static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes."; 229 int32_t sISOLength = 0, sWindowsLength = 0; 230 UChar sISO[sizeof(ssISO)]; 231 UChar sWindows[sizeof(ssWindows)]; 232 int32_t lISO = 0, lWindows = 0; 233 char *bISO; 234 char *bWindows; 235 UCharsetDetector *csd = ucsdet_open(&status); 236 const UCharsetMatch *match; 237 const char *name; 238 239 sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO)); 240 sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows)); 241 bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO); 242 bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows); 243 244 ucsdet_setText(csd, bWindows, lWindows, &status); 245 match = ucsdet_detect(csd, &status); 246 247 if (match == NULL) { 248 log_err("English test with C1 bytes got no matches.\n"); 249 goto bail; 250 } 251 252 name = ucsdet_getName(match, &status); 253 254 if (strcmp(name, "windows-1252") != 0) { 255 log_data_err("English text with C1 bytes does not detect as windows-1252, but as %s. (Are you missing data?)\n", name); 256 } 257 258 ucsdet_setText(csd, bISO, lISO, &status); 259 match = ucsdet_detect(csd, &status); 260 261 if (match == NULL) { 262 log_err("English text without C1 bytes got no matches.\n"); 263 goto bail; 264 } 265 266 name = ucsdet_getName(match, &status); 267 268 if (strcmp(name, "ISO-8859-1") != 0) { 269 log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name); 270 } 271 272 bail: 273 freeBytes(bWindows); 274 freeBytes(bISO); 275 276 ucsdet_close(csd); 277 #endif 278 } 279 280 static void TestInputFilter(void) 281 { 282 UErrorCode status = U_ZERO_ERROR; 283 static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>"; 284 int32_t sLength = 0; 285 UChar s[sizeof(ss)]; 286 int32_t byteLength = 0; 287 char *bytes; 288 UCharsetDetector *csd = ucsdet_open(&status); 289 const UCharsetMatch *match; 290 const char *lang, *name; 291 292 sLength = u_unescape(ss, s, sizeof(ss)); 293 bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength); 294 295 ucsdet_enableInputFilter(csd, TRUE); 296 297 if (!ucsdet_isInputFilterEnabled(csd)) { 298 log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n"); 299 } 300 301 302 ucsdet_setText(csd, bytes, byteLength, &status); 303 match = ucsdet_detect(csd, &status); 304 305 if (match == NULL) { 306 log_err("Turning on the input filter resulted in no matches.\n"); 307 goto turn_off; 308 } 309 310 name = ucsdet_getName(match, &status); 311 312 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { 313 log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name); 314 } else { 315 lang = ucsdet_getLanguage(match, &status); 316 317 if (lang == NULL || strcmp(lang, "fr") != 0) { 318 log_err("Input filter did not strip markup!\n"); 319 } 320 } 321 322 turn_off: 323 ucsdet_enableInputFilter(csd, FALSE); 324 ucsdet_setText(csd, bytes, byteLength, &status); 325 match = ucsdet_detect(csd, &status); 326 327 if (match == NULL) { 328 log_err("Turning off the input filter resulted in no matches.\n"); 329 goto bail; 330 } 331 332 name = ucsdet_getName(match, &status); 333 334 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { 335 log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name); 336 } else { 337 lang = ucsdet_getLanguage(match, &status); 338 339 if (lang == NULL || strcmp(lang, "en") != 0) { 340 log_err("Unfiltered input did not detect as English!\n"); 341 } 342 } 343 344 bail: 345 freeBytes(bytes); 346 ucsdet_close(csd); 347 } 348 349 static void TestChaining(void) { 350 UErrorCode status = U_USELESS_COLLATOR_ERROR; 351 352 ucsdet_open(&status); 353 ucsdet_setText(NULL, NULL, 0, &status); 354 ucsdet_getName(NULL, &status); 355 ucsdet_getConfidence(NULL, &status); 356 ucsdet_getLanguage(NULL, &status); 357 ucsdet_detect(NULL, &status); 358 ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status); 359 ucsdet_detectAll(NULL, NULL, &status); 360 ucsdet_getUChars(NULL, NULL, 0, &status); 361 ucsdet_getUChars(NULL, NULL, 0, &status); 362 ucsdet_close(NULL); 363 364 /* All of this code should have done nothing. */ 365 if (status != U_USELESS_COLLATOR_ERROR) { 366 log_err("Status got changed to %s\n", u_errorName(status)); 367 } 368 } 369 370 static void TestBufferOverflow(void) { 371 UErrorCode status = U_ZERO_ERROR; 372 static const char *testStrings[] = { 373 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */ 374 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */ 375 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */ 376 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */ 377 "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */ 378 "\xa1", /* Could be a single byte shift-jis at the end */ 379 "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */ 380 "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */ 381 }; 382 static const char *testResults[] = { 383 "windows-1252", 384 "windows-1252", 385 "windows-1252", 386 "windows-1252", 387 "ISO-2022-JP", 388 NULL, 389 NULL, 390 "ISO-8859-1" 391 }; 392 int32_t idx = 0; 393 UCharsetDetector *csd = ucsdet_open(&status); 394 const UCharsetMatch *match; 395 396 ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status); 397 398 if (U_FAILURE(status)) { 399 log_err("Couldn't open detector. %s\n", u_errorName(status)); 400 goto bail; 401 } 402 403 for (idx = 0; idx < ARRAY_SIZE(testStrings); idx++) { 404 ucsdet_setText(csd, testStrings[idx], -1, &status); 405 match = ucsdet_detect(csd, &status); 406 407 if (match == NULL) { 408 if (testResults[idx] != NULL) { 409 log_err("Unexpectedly got no results at index %d.\n", idx); 410 } 411 else { 412 log_verbose("Got no result as expected at index %d.\n", idx); 413 } 414 continue; 415 } 416 417 if (testResults[idx] == NULL || strcmp(ucsdet_getName(match, &status), testResults[idx]) != 0) { 418 log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n", 419 ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_getConfidence(match, &status)); 420 goto bail; 421 } 422 } 423 424 bail: 425 ucsdet_close(csd); 426 } 427 428 static void TestIBM424(void) 429 { 430 UErrorCode status = U_ZERO_ERROR; 431 432 static const UChar chars[] = { 433 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8, 434 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9, 435 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8, 436 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA, 437 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5, 438 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE, 439 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 440 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC, 441 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3, 442 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020, 443 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC, 444 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 445 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 446 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 447 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC, 448 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1, 449 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000 450 }; 451 452 static const UChar chars_reverse[] = { 453 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA, 454 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8, 455 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 456 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 457 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9, 458 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4, 459 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9, 460 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5, 461 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3, 462 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020, 463 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 464 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9, 465 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020, 466 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4, 467 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7, 468 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0, 469 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4, 470 0x0000 471 }; 472 473 int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = ARRAY_SIZE(chars_reverse); 474 475 char *bytes = extractBytes(chars, cLength, "IBM424", &bLength); 476 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength); 477 478 UCharsetDetector *csd = ucsdet_open(&status); 479 const UCharsetMatch *match; 480 const char *name; 481 482 ucsdet_setText(csd, bytes, bLength, &status); 483 match = ucsdet_detect(csd, &status); 484 485 if (match == NULL) { 486 log_err("Encoding detection failure for IBM424_rtl: got no matches.\n"); 487 goto bail; 488 } 489 490 name = ucsdet_getName(match, &status); 491 if (strcmp(name, "IBM424_rtl") != 0) { 492 log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are you missing data?)\n", name); 493 } 494 495 ucsdet_setText(csd, bytes_r, brLength, &status); 496 match = ucsdet_detect(csd, &status); 497 498 if (match == NULL) { 499 log_err("Encoding detection failure for IBM424_ltr: got no matches.\n"); 500 goto bail; 501 } 502 503 name = ucsdet_getName(match, &status); 504 if (strcmp(name, "IBM424_ltr") != 0) { 505 log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are you missing data?)\n", name); 506 } 507 508 bail: 509 freeBytes(bytes); 510 freeBytes(bytes_r); 511 ucsdet_close(csd); 512 } 513 514 static void TestIBM420(void) 515 { 516 UErrorCode status = U_ZERO_ERROR; 517 518 static const UChar chars[] = { 519 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627, 520 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641, 521 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 522 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645, 523 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A, 524 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644, 525 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020, 526 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 527 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634, 528 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F, 529 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647, 530 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627, 531 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E, 532 0x0000 533 }; 534 static const UChar chars_reverse[] = { 535 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F, 536 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020, 537 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648, 538 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628, 539 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 540 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A, 541 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644, 542 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A, 543 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A, 544 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627, 545 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A, 546 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645, 547 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648, 548 0x0000, 549 }; 550 551 int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = ARRAY_SIZE(chars_reverse); 552 553 char *bytes = extractBytes(chars, cLength, "IBM420", &bLength); 554 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength); 555 556 UCharsetDetector *csd = ucsdet_open(&status); 557 const UCharsetMatch *match; 558 const char *name; 559 560 ucsdet_setText(csd, bytes, bLength, &status); 561 match = ucsdet_detect(csd, &status); 562 563 if (match == NULL) { 564 log_err("Encoding detection failure for IBM420_rtl: got no matches.\n"); 565 goto bail; 566 } 567 568 name = ucsdet_getName(match, &status); 569 if (strcmp(name, "IBM420_rtl") != 0) { 570 log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are you missing data?)\n", name); 571 } 572 573 ucsdet_setText(csd, bytes_r, brLength, &status); 574 match = ucsdet_detect(csd, &status); 575 576 if (match == NULL) { 577 log_err("Encoding detection failure for IBM420_ltr: got no matches.\n"); 578 goto bail; 579 } 580 581 name = ucsdet_getName(match, &status); 582 if (strcmp(name, "IBM420_ltr") != 0) { 583 log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are you missing data?)\n", name); 584 } 585 586 bail: 587 freeBytes(bytes); 588 freeBytes(bytes_r); 589 ucsdet_close(csd); 590 } 591