1 /* 2 ********************************************************************** 3 * Copyright (C) 2005-2009, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8 9 #include "unicode/utypes.h" 10 #include "unicode/ucsdet.h" 11 #include "unicode/ucnv.h" 12 #include "unicode/unistr.h" 13 #include "unicode/putil.h" 14 #include "unicode/uniset.h" 15 16 #include "intltest.h" 17 #include "csdetest.h" 18 19 #include "xmlparser.h" 20 21 #include <stdlib.h> 22 #include <string.h> 23 24 #ifdef DEBUG_DETECT 25 #include <stdio.h> 26 #endif 27 28 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 29 30 #define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type)) 31 #define DELETE_ARRAY(array) /*uprv_*/free((void *) (array)) 32 33 #define CH_SPACE 0x0020 34 #define CH_SLASH 0x002F 35 36 //--------------------------------------------------------------------------- 37 // 38 // Test class boilerplate 39 // 40 //--------------------------------------------------------------------------- 41 CharsetDetectionTest::CharsetDetectionTest() 42 { 43 } 44 45 46 CharsetDetectionTest::~CharsetDetectionTest() 47 { 48 } 49 50 51 52 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) 53 { 54 if (exec) logln("TestSuite CharsetDetectionTest: "); 55 switch (index) { 56 case 0: name = "ConstructionTest"; 57 if (exec) ConstructionTest(); 58 break; 59 60 case 1: name = "UTF8Test"; 61 if (exec) UTF8Test(); 62 break; 63 64 case 2: name = "UTF16Test"; 65 if (exec) UTF16Test(); 66 break; 67 68 case 3: name = "C1BytesTest"; 69 if (exec) C1BytesTest(); 70 break; 71 72 case 4: name = "InputFilterTest"; 73 if (exec) InputFilterTest(); 74 break; 75 76 case 5: name = "DetectionTest"; 77 if (exec) DetectionTest(); 78 break; 79 80 case 6: name = "IBM424Test"; 81 if (exec) IBM424Test(); 82 break; 83 84 case 7: name = "IBM420Test"; 85 if (exec) IBM420Test(); 86 break; 87 88 case 8: name = "Ticket6394Test"; 89 if (exec) Ticket6394Test(); 90 break; 91 92 default: name = ""; 93 break; //needed to end loop 94 } 95 } 96 97 static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits) 98 { 99 int32_t offset = -1; 100 101 splits = 1; 102 while((offset = src.indexOf(ch, offset + 1)) >= 0) { 103 splits += 1; 104 } 105 106 UnicodeString *result = new UnicodeString[splits]; 107 108 int32_t start = 0; 109 int32_t split = 0; 110 int32_t end; 111 112 while((end = src.indexOf(ch, start)) >= 0) { 113 src.extractBetween(start, end, result[split++]); 114 start = end + 1; 115 } 116 117 src.extractBetween(start, src.length(), result[split]); 118 119 return result; 120 } 121 122 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length) 123 { 124 int32_t sLength = source.length(); 125 char *bytes = NULL; 126 127 length = source.extract(0, sLength, NULL, codepage); 128 129 if (length > 0) { 130 bytes = NEW_ARRAY(char, length + 1); 131 source.extract(0, sLength, bytes, codepage); 132 } 133 134 return bytes; 135 } 136 137 static void freeBytes(char *bytes) 138 { 139 DELETE_ARRAY(bytes); 140 } 141 142 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id) 143 { 144 int32_t splits = 0; 145 int32_t testLength = testString.length(); 146 UnicodeString *eSplit = split(encoding, CH_SLASH, splits); 147 UErrorCode status = U_ZERO_ERROR; 148 int32_t cpLength = eSplit[0].length(); 149 char codepage[64]; 150 151 u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength); 152 codepage[cpLength] = '\0'; 153 154 UCharsetDetector *csd = ucsdet_open(&status); 155 156 int32_t byteLength = 0; 157 char *bytes = extractBytes(testString, codepage, byteLength); 158 159 if (bytes == NULL) { 160 #if !UCONFIG_NO_LEGACY_CONVERSION 161 errln("Can't open a " + encoding + " converter for " + id); 162 #endif 163 return; 164 } 165 166 ucsdet_setText(csd, bytes, byteLength, &status); 167 168 int32_t matchCount = 0; 169 const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status); 170 171 172 UnicodeString name(ucsdet_getName(matches[0], &status)); 173 UnicodeString lang(ucsdet_getLanguage(matches[0], &status)); 174 UChar *decoded = NULL; 175 int32_t dLength = 0; 176 177 if (matchCount == 0) { 178 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches"); 179 goto bail; 180 } 181 182 if (name.compare(eSplit[0]) != 0) { 183 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name); 184 185 #ifdef DEBUG_DETECT 186 for (int32_t m = 0; m < matchCount; m += 1) { 187 const char *name = ucsdet_getName(matches[m], &status); 188 const char *lang = ucsdet_getLanguage(matches[m], &status); 189 int32_t confidence = ucsdet_getConfidence(matches[m], &status); 190 191 printf("%s (%s) %d\n", name, lang, confidence); 192 } 193 #endif 194 goto bail; 195 } 196 197 if (splits > 1 && lang.compare(eSplit[1]) != 0) { 198 errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang); 199 goto bail; 200 } 201 202 decoded = NEW_ARRAY(UChar, testLength); 203 dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status); 204 205 if (testString.compare(decoded, dLength) != 0) { 206 errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string."); 207 208 #ifdef DEBUG_DETECT 209 for(int32_t i = 0; i < testLength; i += 1) { 210 if(testString[i] != decoded[i]) { 211 printf("Strings differ at byte %d\n", i); 212 break; 213 } 214 } 215 #endif 216 217 } 218 219 DELETE_ARRAY(decoded); 220 221 bail: 222 freeBytes(bytes); 223 ucsdet_close(csd); 224 delete[] eSplit; 225 } 226 227 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) { 228 UErrorCode status = U_ZERO_ERROR; 229 const char *testDataDirectory = IntlTest::getSourceTestData(status); 230 231 if (U_FAILURE(status)) { 232 errln("ERROR: getPath() failed - %s", u_errorName(status)); 233 return NULL; 234 } 235 236 strcpy(buffer, testDataDirectory); 237 strcat(buffer, filename); 238 return buffer; 239 } 240 241 void CharsetDetectionTest::ConstructionTest() 242 { 243 UErrorCode status = U_ZERO_ERROR; 244 UCharsetDetector *csd = ucsdet_open(&status); 245 UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status); 246 int32_t count = uenum_count(e, &status); 247 248 #ifdef DEBUG_DETECT 249 printf("There are %d recognizers.\n", count); 250 #endif 251 252 for(int32_t i = 0; i < count; i += 1) { 253 int32_t length; 254 const char *name = uenum_next(e, &length, &status); 255 256 if(name == NULL || length <= 0) { 257 errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!"); 258 } 259 260 #ifdef DEBUG_DETECT 261 printf("%s\n", name); 262 #endif 263 } 264 265 uenum_close(e); 266 ucsdet_close(csd); 267 } 268 269 void CharsetDetectionTest::UTF8Test() 270 { 271 UErrorCode status = U_ZERO_ERROR; 272 UnicodeString ss = "This is a string with some non-ascii characters that will " 273 "be converted to UTF-8, then shoved through the detection process. " 274 "\\u0391\\u0392\\u0393\\u0394\\u0395" 275 "Sure would be nice if our source could contain Unicode directly!"; 276 UnicodeString s = ss.unescape(); 277 int32_t byteLength = 0, sLength = s.length(); 278 char *bytes = extractBytes(s, "UTF-8", byteLength); 279 UCharsetDetector *csd = ucsdet_open(&status); 280 const UCharsetMatch *match; 281 UChar *detected = NEW_ARRAY(UChar, sLength); 282 283 ucsdet_setText(csd, bytes, byteLength, &status); 284 match = ucsdet_detect(csd, &status); 285 286 if (match == NULL) { 287 errln("Detection failure for UTF-8: got no matches."); 288 goto bail; 289 } 290 291 ucsdet_getUChars(match, detected, sLength, &status); 292 293 if (s.compare(detected, sLength) != 0) { 294 errln("Round-trip test failed!"); 295 } 296 297 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */ 298 299 bail: 300 DELETE_ARRAY(detected); 301 freeBytes(bytes); 302 ucsdet_close(csd); 303 } 304 305 void CharsetDetectionTest::UTF16Test() 306 { 307 UErrorCode status = U_ZERO_ERROR; 308 /* Notice the BOM on the start of this string */ 309 UChar chars[] = { 310 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C, 311 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a, 312 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628, 313 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646, 314 0x064a, 0x062a, 0x0000}; 315 UnicodeString s(chars); 316 int32_t beLength = 0, leLength = 0; 317 char *beBytes = extractBytes(s, "UTF-16BE", beLength); 318 char *leBytes = extractBytes(s, "UTF-16LE", leLength); 319 UCharsetDetector *csd = ucsdet_open(&status); 320 const UCharsetMatch *match; 321 const char *name; 322 int32_t conf; 323 324 ucsdet_setText(csd, beBytes, beLength, &status); 325 match = ucsdet_detect(csd, &status); 326 327 if (match == NULL) { 328 errln("Encoding detection failure for UTF-16BE: got no matches."); 329 goto try_le; 330 } 331 332 name = ucsdet_getName(match, &status); 333 conf = ucsdet_getConfidence(match, &status); 334 335 if (strcmp(name, "UTF-16BE") != 0) { 336 errln("Encoding detection failure for UTF-16BE: got %s", name); 337 goto try_le; // no point in looking at confidence if we got the wrong character set. 338 } 339 340 if (conf != 100) { 341 errln("Did not get 100%% confidence for UTF-16BE: got %d", conf); 342 } 343 344 try_le: 345 ucsdet_setText(csd, leBytes, leLength, &status); 346 match = ucsdet_detect(csd, &status); 347 348 if (match == NULL) { 349 errln("Encoding detection failure for UTF-16LE: got no matches."); 350 goto bail; 351 } 352 353 name = ucsdet_getName(match, &status); 354 conf = ucsdet_getConfidence(match, &status); 355 356 357 if (strcmp(name, "UTF-16LE") != 0) { 358 errln("Enconding detection failure for UTF-16LE: got %s", name); 359 goto bail; // no point in looking at confidence if we got the wrong character set. 360 } 361 362 if (conf != 100) { 363 errln("Did not get 100%% confidence for UTF-16LE: got %d", conf); 364 } 365 366 bail: 367 freeBytes(leBytes); 368 freeBytes(beBytes); 369 ucsdet_close(csd); 370 } 371 372 void CharsetDetectionTest::InputFilterTest() 373 { 374 UErrorCode status = U_ZERO_ERROR; 375 UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>"; 376 UnicodeString s = ss.unescape(); 377 int32_t byteLength = 0; 378 char *bytes = extractBytes(s, "ISO-8859-1", byteLength); 379 UCharsetDetector *csd = ucsdet_open(&status); 380 const UCharsetMatch *match; 381 const char *lang, *name; 382 383 ucsdet_enableInputFilter(csd, TRUE); 384 385 if (!ucsdet_isInputFilterEnabled(csd)) { 386 errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!"); 387 } 388 389 390 ucsdet_setText(csd, bytes, byteLength, &status); 391 match = ucsdet_detect(csd, &status); 392 393 if (match == NULL) { 394 errln("Turning on the input filter resulted in no matches."); 395 goto turn_off; 396 } 397 398 name = ucsdet_getName(match, &status); 399 400 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { 401 errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name); 402 } else { 403 lang = ucsdet_getLanguage(match, &status); 404 405 if (lang == NULL || strcmp(lang, "fr") != 0) { 406 errln("Input filter did not strip markup!"); 407 } 408 } 409 410 turn_off: 411 ucsdet_enableInputFilter(csd, FALSE); 412 ucsdet_setText(csd, bytes, byteLength, &status); 413 match = ucsdet_detect(csd, &status); 414 415 if (match == NULL) { 416 errln("Turning off the input filter resulted in no matches."); 417 goto bail; 418 } 419 420 name = ucsdet_getName(match, &status); 421 422 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { 423 errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name); 424 } else { 425 lang = ucsdet_getLanguage(match, &status); 426 427 if (lang == NULL || strcmp(lang, "en") != 0) { 428 errln("Unfiltered input did not detect as English!"); 429 } 430 } 431 432 bail: 433 freeBytes(bytes); 434 ucsdet_close(csd); 435 } 436 437 void CharsetDetectionTest::C1BytesTest() 438 { 439 #if !UCONFIG_NO_LEGACY_CONVERSION 440 UErrorCode status = U_ZERO_ERROR; 441 UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly."; 442 UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV); 443 UnicodeString sWindows = ssWindows.unescape(); 444 int32_t lISO = 0, lWindows = 0; 445 char *bISO = extractBytes(sISO, "ISO-8859-1", lISO); 446 char *bWindows = extractBytes(sWindows, "windows-1252", lWindows); 447 UCharsetDetector *csd = ucsdet_open(&status); 448 const UCharsetMatch *match; 449 const char *name; 450 451 ucsdet_setText(csd, bWindows, lWindows, &status); 452 match = ucsdet_detect(csd, &status); 453 454 if (match == NULL) { 455 errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status)); 456 goto bail; 457 } 458 459 name = ucsdet_getName(match, &status); 460 461 if (strcmp(name, "windows-1252") != 0) { 462 errln("English text with C1 bytes does not detect as windows-1252, but as %s", name); 463 } 464 465 ucsdet_setText(csd, bISO, lISO, &status); 466 match = ucsdet_detect(csd, &status); 467 468 if (match == NULL) { 469 errln("English text without C1 bytes got no matches."); 470 goto bail; 471 } 472 473 name = ucsdet_getName(match, &status); 474 475 if (strcmp(name, "ISO-8859-1") != 0) { 476 errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name); 477 } 478 479 bail: 480 freeBytes(bWindows); 481 freeBytes(bISO); 482 483 ucsdet_close(csd); 484 #endif 485 } 486 487 void CharsetDetectionTest::DetectionTest() 488 { 489 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 490 UErrorCode status = U_ZERO_ERROR; 491 char path[2048]; 492 const char *testFilePath = getPath(path, "csdetest.xml"); 493 494 if (testFilePath == NULL) { 495 return; /* Couldn't get path: error message already output. */ 496 } 497 498 UXMLParser *parser = UXMLParser::createParser(status); 499 if (U_FAILURE(status)) { 500 dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status)); 501 return; 502 } 503 504 UXMLElement *root = parser->parseFile(testFilePath, status); 505 if (!assertSuccess( "parseFile",status)) return; 506 507 UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case"); 508 UnicodeString id_attr = UNICODE_STRING_SIMPLE("id"); 509 UnicodeString enc_attr = UNICODE_STRING_SIMPLE("encodings"); 510 511 const UXMLElement *testCase; 512 int32_t tc = 0; 513 514 while((testCase = root->nextChildElement(tc)) != NULL) { 515 if (testCase->getTagName().compare(test_case) == 0) { 516 const UnicodeString *id = testCase->getAttribute(id_attr); 517 const UnicodeString *encodings = testCase->getAttribute(enc_attr); 518 const UnicodeString text = testCase->getText(TRUE); 519 int32_t encodingCount; 520 UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount); 521 522 for(int32_t e = 0; e < encodingCount; e += 1) { 523 checkEncoding(text, encodingList[e], *id); 524 } 525 526 delete[] encodingList; 527 } 528 } 529 530 delete root; 531 delete parser; 532 #endif 533 } 534 535 void CharsetDetectionTest::IBM424Test() 536 { 537 UErrorCode status = U_ZERO_ERROR; 538 539 static const UChar chars[] = { 540 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8, 541 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9, 542 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8, 543 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA, 544 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5, 545 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE, 546 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 547 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC, 548 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3, 549 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020, 550 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC, 551 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 552 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 553 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 554 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC, 555 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1, 556 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000 557 }; 558 559 static const UChar chars_reverse[] = { 560 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA, 561 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8, 562 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 563 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 564 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9, 565 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4, 566 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9, 567 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5, 568 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3, 569 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020, 570 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 571 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9, 572 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020, 573 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4, 574 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7, 575 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0, 576 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4, 577 0x0000 578 }; 579 580 int32_t bLength = 0, brLength = 0; 581 582 UnicodeString s1(chars); 583 UnicodeString s2(chars_reverse); 584 585 char *bytes = extractBytes(s1, "IBM424", bLength); 586 char *bytes_r = extractBytes(s2, "IBM424", brLength); 587 588 UCharsetDetector *csd = ucsdet_open(&status); 589 if (U_FAILURE(status)) { 590 errln("Error opening charset detector. - %s", u_errorName(status)); 591 } 592 const UCharsetMatch *match; 593 const char *name; 594 595 ucsdet_setText(csd, bytes, bLength, &status); 596 match = ucsdet_detect(csd, &status); 597 598 if (match == NULL) { 599 errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status)); 600 goto bail; 601 } 602 603 name = ucsdet_getName(match, &status); 604 if (strcmp(name, "IBM424_rtl") != 0) { 605 errln("Encoding detection failure for IBM424_rtl: got %s", name); 606 } 607 608 ucsdet_setText(csd, bytes_r, brLength, &status); 609 match = ucsdet_detect(csd, &status); 610 611 if (match == NULL) { 612 errln("Encoding detection failure for IBM424_ltr: got no matches."); 613 goto bail; 614 } 615 616 name = ucsdet_getName(match, &status); 617 if (strcmp(name, "IBM424_ltr") != 0) { 618 errln("Encoding detection failure for IBM424_ltr: got %s", name); 619 } 620 621 bail: 622 freeBytes(bytes); 623 freeBytes(bytes_r); 624 ucsdet_close(csd); 625 } 626 627 void CharsetDetectionTest::IBM420Test() 628 { 629 UErrorCode status = U_ZERO_ERROR; 630 631 static const UChar chars[] = { 632 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627, 633 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641, 634 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 635 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645, 636 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A, 637 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644, 638 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020, 639 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 640 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634, 641 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F, 642 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647, 643 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627, 644 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E, 645 0x0000 646 }; 647 static const UChar chars_reverse[] = { 648 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F, 649 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020, 650 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648, 651 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628, 652 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 653 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A, 654 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644, 655 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A, 656 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A, 657 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627, 658 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A, 659 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645, 660 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648, 661 0x0000, 662 }; 663 664 int32_t bLength = 0, brLength = 0; 665 666 UnicodeString s1(chars); 667 UnicodeString s2(chars_reverse); 668 669 char *bytes = extractBytes(s1, "IBM420", bLength); 670 char *bytes_r = extractBytes(s2, "IBM420", brLength); 671 672 UCharsetDetector *csd = ucsdet_open(&status); 673 if (U_FAILURE(status)) { 674 errln("Error opening charset detector. - %s", u_errorName(status)); 675 } 676 const UCharsetMatch *match; 677 const char *name; 678 679 ucsdet_setText(csd, bytes, bLength, &status); 680 match = ucsdet_detect(csd, &status); 681 682 if (match == NULL) { 683 errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status)); 684 goto bail; 685 } 686 687 name = ucsdet_getName(match, &status); 688 if (strcmp(name, "IBM420_rtl") != 0) { 689 errln("Encoding detection failure for IBM420_rtl: got %s\n", name); 690 } 691 692 ucsdet_setText(csd, bytes_r, brLength, &status); 693 match = ucsdet_detect(csd, &status); 694 695 if (match == NULL) { 696 errln("Encoding detection failure for IBM420_ltr: got no matches.\n"); 697 goto bail; 698 } 699 700 name = ucsdet_getName(match, &status); 701 if (strcmp(name, "IBM420_ltr") != 0) { 702 errln("Encoding detection failure for IBM420_ltr: got %s\n", name); 703 } 704 705 bail: 706 freeBytes(bytes); 707 freeBytes(bytes_r); 708 ucsdet_close(csd); 709 } 710 711 712 void CharsetDetectionTest::Ticket6394Test() { 713 #if !UCONFIG_NO_CONVERSION 714 const char charText[] = "Here is some random English text that should be detected as ISO-8859-1." 715 "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected " 716 "encodings more than once. The hop through UnicodeString is for platforms " 717 "where this char * string is be EBCDIC and needs conversion to Latin1."; 718 char latin1Text[sizeof(charText)]; 719 UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1"); 720 721 UErrorCode status = U_ZERO_ERROR; 722 UCharsetDetector *csd = ucsdet_open(&status); 723 ucsdet_setText(csd, latin1Text, -1, &status); 724 if (U_FAILURE(status)) { 725 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status)); 726 return; 727 } 728 729 int32_t matchCount = 0; 730 const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status); 731 if (U_FAILURE(status)) { 732 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status)); 733 return; 734 } 735 736 UnicodeSet setOfCharsetNames; // UnicodSets can hold strings. 737 int32_t i; 738 for (i=0; i<matchCount; i++) { 739 UnicodeString charSetName(ucsdet_getName(matches[i], &status)); 740 if (U_FAILURE(status)) { 741 errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__, __LINE__, u_errorName(status), i); 742 status = U_ZERO_ERROR; 743 } 744 if (setOfCharsetNames.contains(charSetName)) { 745 errln("Fail at file %s, line %d ", __FILE__, __LINE__); 746 errln(UnicodeString(" Duplicate charset name = ") + charSetName); 747 } 748 setOfCharsetNames.add(charSetName); 749 } 750 ucsdet_close(csd); 751 #endif 752 } 753 754