1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 **************************************************************************** 5 * Copyright (c) 2005-2016, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 **************************************************************************** 8 */ 9 10 #include "unicode/utypes.h" 11 12 #include "unicode/ucsdet.h" 13 #include "unicode/ucnv.h" 14 #include "unicode/ustring.h" 15 16 #include "cintltst.h" 17 #include "cmemory.h" 18 19 #include <stdlib.h> 20 #include <string.h> 21 22 #define NEW_ARRAY(type,count) (type *) malloc((count) * sizeof(type)) 23 #define DELETE_ARRAY(array) free(array) 24 25 static void TestConstruction(void); 26 static void TestUTF8(void); 27 static void TestUTF16(void); 28 static void TestC1Bytes(void); 29 static void TestInputFilter(void); 30 static void TestChaining(void); 31 static void TestBufferOverflow(void); 32 static void TestIBM424(void); 33 static void TestIBM420(void); 34 35 void addUCsdetTest(TestNode** root); 36 37 void addUCsdetTest(TestNode** root) 38 { 39 addTest(root, &TestConstruction, "ucsdetst/TestConstruction"); 40 addTest(root, &TestUTF8, "ucsdetst/TestUTF8"); 41 addTest(root, &TestUTF16, "ucsdetst/TestUTF16"); 42 addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes"); 43 addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter"); 44 addTest(root, &TestChaining, "ucsdetst/TestErrorChaining"); 45 addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow"); 46 #if !UCONFIG_NO_LEGACY_CONVERSION 47 addTest(root, &TestIBM424, "ucsdetst/TestIBM424"); 48 addTest(root, &TestIBM420, "ucsdetst/TestIBM420"); 49 #endif 50 } 51 52 static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv) 53 { 54 UErrorCode status; 55 char buffer[1024]; 56 char *dest, *destLimit = buffer + sizeof(buffer); 57 const UChar *srcLimit = src + length; 58 int32_t result = 0; 59 60 do { 61 dest = buffer; 62 status = U_ZERO_ERROR; 63 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status); 64 result += (int32_t) (dest - buffer); 65 } while (status == U_BUFFER_OVERFLOW_ERROR); 66 67 return result; 68 } 69 70 static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength) 71 { 72 UErrorCode status = U_ZERO_ERROR; 73 UConverter *cnv = ucnv_open(codepage, &status); 74 int32_t byteCount = preflight(src, length, cnv); 75 const UChar *srcLimit = src + length; 76 char *bytes = NEW_ARRAY(char, byteCount + 1); 77 char *dest = bytes, *destLimit = bytes + byteCount + 1; 78 79 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status); 80 ucnv_close(cnv); 81 82 *byteLength = byteCount; 83 return bytes; 84 } 85 86 static void freeBytes(char *bytes) 87 { 88 DELETE_ARRAY(bytes); 89 } 90 91 static void TestConstruction(void) 92 { 93 UErrorCode status = U_ZERO_ERROR; 94 UCharsetDetector *csd = ucsdet_open(&status); 95 UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status); 96 const char *name; 97 int32_t count = uenum_count(e, &status); 98 int32_t i, length; 99 100 for(i = 0; i < count; i += 1) { 101 name = uenum_next(e, &length, &status); 102 103 if(name == NULL || length <= 0) { 104 log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n"); 105 } 106 } 107 /* one past the list of all names must return NULL */ 108 name = uenum_next(e, &length, &status); 109 if(name != NULL || length != 0 || U_FAILURE(status)) { 110 log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n"); 111 } 112 113 uenum_close(e); 114 ucsdet_close(csd); 115 } 116 117 static void TestUTF8(void) 118 { 119 UErrorCode status = U_ZERO_ERROR; 120 static const char ss[] = "This is a string with some non-ascii characters that will " 121 "be converted to UTF-8, then shoved through the detection process. " 122 "\\u0391\\u0392\\u0393\\u0394\\u0395" 123 "Sure would be nice if our source could contain Unicode directly!"; 124 int32_t byteLength = 0, sLength = 0, dLength = 0; 125 UChar s[sizeof(ss)]; 126 char *bytes; 127 UCharsetDetector *csd = ucsdet_open(&status); 128 const UCharsetMatch *match; 129 UChar detected[sizeof(ss)]; 130 131 sLength = u_unescape(ss, s, sizeof(ss)); 132 bytes = extractBytes(s, sLength, "UTF-8", &byteLength); 133 134 ucsdet_setText(csd, bytes, byteLength, &status); 135 if (U_FAILURE(status)) { 136 log_err("status is %s\n", u_errorName(status)); 137 goto bail; 138 } 139 140 match = ucsdet_detect(csd, &status); 141 142 if (match == NULL) { 143 log_err("Detection failure for UTF-8: got no matches.\n"); 144 goto bail; 145 } 146 147 dLength = ucsdet_getUChars(match, detected, sLength, &status); 148 149 if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) { 150 log_err("Round-trip test failed!\n"); 151 } 152 153 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */ 154 155 bail: 156 freeBytes(bytes); 157 ucsdet_close(csd); 158 } 159 160 static void TestUTF16(void) 161 { 162 UErrorCode status = U_ZERO_ERROR; 163 /* Notice the BOM on the start of this string */ 164 static const UChar chars[] = { 165 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C, 166 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a, 167 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628, 168 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646, 169 0x064a, 0x062a, 0x0000}; 170 int32_t beLength = 0, leLength = 0, cLength = UPRV_LENGTHOF(chars); 171 char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength); 172 char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength); 173 UCharsetDetector *csd = ucsdet_open(&status); 174 const UCharsetMatch *match; 175 const char *name; 176 int32_t conf; 177 178 ucsdet_setText(csd, beBytes, beLength, &status); 179 match = ucsdet_detect(csd, &status); 180 181 if (match == NULL) { 182 log_err("Encoding detection failure for UTF-16BE: got no matches.\n"); 183 goto try_le; 184 } 185 186 name = ucsdet_getName(match, &status); 187 conf = ucsdet_getConfidence(match, &status); 188 189 if (strcmp(name, "UTF-16BE") != 0) { 190 log_err("Encoding detection failure for UTF-16BE: got %s\n", name); 191 } 192 193 if (conf != 100) { 194 log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf); 195 } 196 197 try_le: 198 ucsdet_setText(csd, leBytes, leLength, &status); 199 match = ucsdet_detect(csd, &status); 200 201 if (match == NULL) { 202 log_err("Encoding detection failure for UTF-16LE: got no matches.\n"); 203 goto bail; 204 } 205 206 name = ucsdet_getName(match, &status); 207 conf = ucsdet_getConfidence(match, &status); 208 209 210 if (strcmp(name, "UTF-16LE") != 0) { 211 log_err("Enconding detection failure for UTF-16LE: got %s\n", name); 212 } 213 214 if (conf != 100) { 215 log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf); 216 } 217 218 bail: 219 freeBytes(leBytes); 220 freeBytes(beBytes); 221 ucsdet_close(csd); 222 } 223 224 static void TestC1Bytes(void) 225 { 226 #if !UCONFIG_NO_LEGACY_CONVERSION 227 UErrorCode status = U_ZERO_ERROR; 228 static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly."; 229 static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes."; 230 int32_t sISOLength = 0, sWindowsLength = 0; 231 UChar sISO[sizeof(ssISO)]; 232 UChar sWindows[sizeof(ssWindows)]; 233 int32_t lISO = 0, lWindows = 0; 234 char *bISO; 235 char *bWindows; 236 UCharsetDetector *csd = ucsdet_open(&status); 237 const UCharsetMatch *match; 238 const char *name; 239 240 sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO)); 241 sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows)); 242 bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO); 243 bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows); 244 245 ucsdet_setText(csd, bWindows, lWindows, &status); 246 match = ucsdet_detect(csd, &status); 247 248 if (match == NULL) { 249 log_err("English test with C1 bytes got no matches.\n"); 250 goto bail; 251 } 252 253 name = ucsdet_getName(match, &status); 254 255 if (strcmp(name, "windows-1252") != 0) { 256 log_data_err("English text with C1 bytes does not detect as windows-1252, but as %s. (Are you missing data?)\n", name); 257 } 258 259 ucsdet_setText(csd, bISO, lISO, &status); 260 match = ucsdet_detect(csd, &status); 261 262 if (match == NULL) { 263 log_err("English text without C1 bytes got no matches.\n"); 264 goto bail; 265 } 266 267 name = ucsdet_getName(match, &status); 268 269 if (strcmp(name, "ISO-8859-1") != 0) { 270 log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name); 271 } 272 273 bail: 274 freeBytes(bWindows); 275 freeBytes(bISO); 276 277 ucsdet_close(csd); 278 #endif 279 } 280 281 static void TestInputFilter(void) 282 { 283 UErrorCode status = U_ZERO_ERROR; 284 static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>"; 285 int32_t sLength = 0; 286 UChar s[sizeof(ss)]; 287 int32_t byteLength = 0; 288 char *bytes; 289 UCharsetDetector *csd = ucsdet_open(&status); 290 const UCharsetMatch *match; 291 const char *lang, *name; 292 293 sLength = u_unescape(ss, s, sizeof(ss)); 294 bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength); 295 296 ucsdet_enableInputFilter(csd, TRUE); 297 298 if (!ucsdet_isInputFilterEnabled(csd)) { 299 log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n"); 300 } 301 302 303 ucsdet_setText(csd, bytes, byteLength, &status); 304 match = ucsdet_detect(csd, &status); 305 306 if (match == NULL) { 307 log_err("Turning on the input filter resulted in no matches.\n"); 308 goto turn_off; 309 } 310 311 name = ucsdet_getName(match, &status); 312 313 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { 314 log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name); 315 } else { 316 lang = ucsdet_getLanguage(match, &status); 317 318 if (lang == NULL || strcmp(lang, "fr") != 0) { 319 log_err("Input filter did not strip markup!\n"); 320 } 321 } 322 323 turn_off: 324 ucsdet_enableInputFilter(csd, FALSE); 325 ucsdet_setText(csd, bytes, byteLength, &status); 326 match = ucsdet_detect(csd, &status); 327 328 if (match == NULL) { 329 log_err("Turning off the input filter resulted in no matches.\n"); 330 goto bail; 331 } 332 333 name = ucsdet_getName(match, &status); 334 335 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { 336 log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name); 337 } else { 338 lang = ucsdet_getLanguage(match, &status); 339 340 if (lang == NULL || strcmp(lang, "en") != 0) { 341 log_err("Unfiltered input did not detect as English!\n"); 342 } 343 } 344 345 bail: 346 freeBytes(bytes); 347 ucsdet_close(csd); 348 } 349 350 static void TestChaining(void) { 351 UErrorCode status = U_USELESS_COLLATOR_ERROR; 352 353 ucsdet_open(&status); 354 ucsdet_setText(NULL, NULL, 0, &status); 355 ucsdet_getName(NULL, &status); 356 ucsdet_getConfidence(NULL, &status); 357 ucsdet_getLanguage(NULL, &status); 358 ucsdet_detect(NULL, &status); 359 ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status); 360 ucsdet_detectAll(NULL, NULL, &status); 361 ucsdet_getUChars(NULL, NULL, 0, &status); 362 ucsdet_getUChars(NULL, NULL, 0, &status); 363 ucsdet_close(NULL); 364 365 /* All of this code should have done nothing. */ 366 if (status != U_USELESS_COLLATOR_ERROR) { 367 log_err("Status got changed to %s\n", u_errorName(status)); 368 } 369 } 370 371 static void TestBufferOverflow(void) { 372 UErrorCode status = U_ZERO_ERROR; 373 static const char *testStrings[] = { 374 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */ 375 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */ 376 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */ 377 "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */ 378 "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */ 379 "\xa1", /* Could be a single byte shift-jis at the end */ 380 "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */ 381 "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */ 382 }; 383 static const char *testResults[] = { 384 "windows-1252", 385 "windows-1252", 386 "windows-1252", 387 "windows-1252", 388 "ISO-2022-JP", 389 NULL, 390 NULL, 391 "ISO-8859-1" 392 }; 393 int32_t idx = 0; 394 UCharsetDetector *csd = ucsdet_open(&status); 395 const UCharsetMatch *match; 396 397 ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status); 398 399 if (U_FAILURE(status)) { 400 log_err("Couldn't open detector. %s\n", u_errorName(status)); 401 goto bail; 402 } 403 404 for (idx = 0; idx < UPRV_LENGTHOF(testStrings); idx++) { 405 ucsdet_setText(csd, testStrings[idx], -1, &status); 406 match = ucsdet_detect(csd, &status); 407 408 if (match == NULL) { 409 if (testResults[idx] != NULL) { 410 log_err("Unexpectedly got no results at index %d.\n", idx); 411 } 412 else { 413 log_verbose("Got no result as expected at index %d.\n", idx); 414 } 415 continue; 416 } 417 418 if (testResults[idx] == NULL || strcmp(ucsdet_getName(match, &status), testResults[idx]) != 0) { 419 log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n", 420 ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_getConfidence(match, &status)); 421 goto bail; 422 } 423 } 424 425 bail: 426 ucsdet_close(csd); 427 } 428 429 static void TestIBM424(void) 430 { 431 UErrorCode status = U_ZERO_ERROR; 432 433 static const UChar chars[] = { 434 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8, 435 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9, 436 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8, 437 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA, 438 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5, 439 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE, 440 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 441 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC, 442 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3, 443 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020, 444 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC, 445 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 446 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 447 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 448 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC, 449 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1, 450 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000 451 }; 452 453 static const UChar chars_reverse[] = { 454 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA, 455 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8, 456 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 457 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 458 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9, 459 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4, 460 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9, 461 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5, 462 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3, 463 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020, 464 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 465 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9, 466 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020, 467 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4, 468 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7, 469 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0, 470 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4, 471 0x0000 472 }; 473 474 int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse); 475 476 char *bytes = extractBytes(chars, cLength, "IBM424", &bLength); 477 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength); 478 479 UCharsetDetector *csd = ucsdet_open(&status); 480 const UCharsetMatch *match; 481 const char *name; 482 483 ucsdet_setText(csd, bytes, bLength, &status); 484 match = ucsdet_detect(csd, &status); 485 486 if (match == NULL) { 487 log_err("Encoding detection failure for IBM424_rtl: got no matches.\n"); 488 goto bail; 489 } 490 491 name = ucsdet_getName(match, &status); 492 if (strcmp(name, "IBM424_rtl") != 0) { 493 log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are you missing data?)\n", name); 494 } 495 496 ucsdet_setText(csd, bytes_r, brLength, &status); 497 match = ucsdet_detect(csd, &status); 498 499 if (match == NULL) { 500 log_err("Encoding detection failure for IBM424_ltr: got no matches.\n"); 501 goto bail; 502 } 503 504 name = ucsdet_getName(match, &status); 505 if (strcmp(name, "IBM424_ltr") != 0) { 506 log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are you missing data?)\n", name); 507 } 508 509 bail: 510 freeBytes(bytes); 511 freeBytes(bytes_r); 512 ucsdet_close(csd); 513 } 514 515 static void TestIBM420(void) 516 { 517 UErrorCode status = U_ZERO_ERROR; 518 519 static const UChar chars[] = { 520 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627, 521 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641, 522 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 523 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645, 524 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A, 525 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644, 526 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020, 527 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 528 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634, 529 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F, 530 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647, 531 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627, 532 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E, 533 0x0000 534 }; 535 static const UChar chars_reverse[] = { 536 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F, 537 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020, 538 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648, 539 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628, 540 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 541 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A, 542 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644, 543 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A, 544 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A, 545 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627, 546 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A, 547 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645, 548 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648, 549 0x0000, 550 }; 551 552 int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse); 553 554 char *bytes = extractBytes(chars, cLength, "IBM420", &bLength); 555 char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength); 556 557 UCharsetDetector *csd = ucsdet_open(&status); 558 const UCharsetMatch *match; 559 const char *name; 560 561 ucsdet_setText(csd, bytes, bLength, &status); 562 match = ucsdet_detect(csd, &status); 563 564 if (match == NULL) { 565 log_err("Encoding detection failure for IBM420_rtl: got no matches.\n"); 566 goto bail; 567 } 568 569 name = ucsdet_getName(match, &status); 570 if (strcmp(name, "IBM420_rtl") != 0) { 571 log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are you missing data?)\n", name); 572 } 573 574 ucsdet_setText(csd, bytes_r, brLength, &status); 575 match = ucsdet_detect(csd, &status); 576 577 if (match == NULL) { 578 log_err("Encoding detection failure for IBM420_ltr: got no matches.\n"); 579 goto bail; 580 } 581 582 name = ucsdet_getName(match, &status); 583 if (strcmp(name, "IBM420_ltr") != 0) { 584 log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are you missing data?)\n", name); 585 } 586 587 bail: 588 freeBytes(bytes); 589 freeBytes(bytes_r); 590 ucsdet_close(csd); 591 } 592