1 /* 2 ********************************************************************** 3 * Copyright (C) 2005-2009, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8 9 #include "unicode/utypes.h" 10 #include "unicode/ucsdet.h" 11 #include "unicode/ucnv.h" 12 #include "unicode/unistr.h" 13 #include "unicode/putil.h" 14 #include "unicode/uniset.h" 15 16 #include "intltest.h" 17 #include "csdetest.h" 18 19 #include "xmlparser.h" 20 21 #include <stdlib.h> 22 #include <string.h> 23 24 #ifdef DEBUG_DETECT 25 #include <stdio.h> 26 #endif 27 28 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 29 30 #define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type)) 31 #define DELETE_ARRAY(array) /*uprv_*/free((void *) (array)) 32 33 #define CH_SPACE 0x0020 34 #define CH_SLASH 0x002F 35 36 //--------------------------------------------------------------------------- 37 // 38 // Test class boilerplate 39 // 40 //--------------------------------------------------------------------------- 41 CharsetDetectionTest::CharsetDetectionTest() 42 { 43 } 44 45 46 CharsetDetectionTest::~CharsetDetectionTest() 47 { 48 } 49 50 51 52 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) 53 { 54 if (exec) logln("TestSuite CharsetDetectionTest: "); 55 switch (index) { 56 case 0: name = "ConstructionTest"; 57 if (exec) ConstructionTest(); 58 break; 59 60 case 1: name = "UTF8Test"; 61 if (exec) UTF8Test(); 62 break; 63 64 case 2: name = "UTF16Test"; 65 if (exec) UTF16Test(); 66 break; 67 68 case 3: name = "C1BytesTest"; 69 if (exec) C1BytesTest(); 70 break; 71 72 case 4: name = "InputFilterTest"; 73 if (exec) InputFilterTest(); 74 break; 75 76 case 5: name = "DetectionTest"; 77 if (exec) DetectionTest(); 78 break; 79 #if !UCONFIG_NO_LEGACY_CONVERSION 80 case 6: name = "IBM424Test"; 81 if (exec) IBM424Test(); 82 break; 83 84 case 7: name = "IBM420Test"; 85 if (exec) IBM420Test(); 86 break; 87 #else 88 case 6: 89 case 7: name = "skip"; break; 90 #endif 91 case 8: name = "Ticket6394Test"; 92 if (exec) Ticket6394Test(); 93 break; 94 95 default: name = ""; 96 break; //needed to end loop 97 } 98 } 99 100 static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits) 101 { 102 int32_t offset = -1; 103 104 splits = 1; 105 while((offset = src.indexOf(ch, offset + 1)) >= 0) { 106 splits += 1; 107 } 108 109 UnicodeString *result = new UnicodeString[splits]; 110 111 int32_t start = 0; 112 int32_t split = 0; 113 int32_t end; 114 115 while((end = src.indexOf(ch, start)) >= 0) { 116 src.extractBetween(start, end, result[split++]); 117 start = end + 1; 118 } 119 120 src.extractBetween(start, src.length(), result[split]); 121 122 return result; 123 } 124 125 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length) 126 { 127 int32_t sLength = source.length(); 128 char *bytes = NULL; 129 130 length = source.extract(0, sLength, NULL, codepage); 131 132 if (length > 0) { 133 bytes = NEW_ARRAY(char, length + 1); 134 source.extract(0, sLength, bytes, codepage); 135 } 136 137 return bytes; 138 } 139 140 static void freeBytes(char *bytes) 141 { 142 DELETE_ARRAY(bytes); 143 } 144 145 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id) 146 { 147 int32_t splits = 0; 148 int32_t testLength = testString.length(); 149 UnicodeString *eSplit = split(encoding, CH_SLASH, splits); 150 UErrorCode status = U_ZERO_ERROR; 151 int32_t cpLength = eSplit[0].length(); 152 char codepage[64]; 153 154 u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength); 155 codepage[cpLength] = '\0'; 156 157 LocalUCharsetDetectorPointer csd(ucsdet_open(&status)); 158 159 int32_t byteLength = 0; 160 char *bytes = extractBytes(testString, codepage, byteLength); 161 162 if (bytes == NULL) { 163 #if !UCONFIG_NO_LEGACY_CONVERSION 164 errln("Can't open a " + encoding + " converter for " + id); 165 #endif 166 return; 167 } 168 169 ucsdet_setText(csd.getAlias(), bytes, byteLength, &status); 170 171 int32_t matchCount = 0; 172 const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status); 173 174 175 UnicodeString name(ucsdet_getName(matches[0], &status)); 176 UnicodeString lang(ucsdet_getLanguage(matches[0], &status)); 177 UChar *decoded = NULL; 178 int32_t dLength = 0; 179 180 if (matchCount == 0) { 181 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches"); 182 goto bail; 183 } 184 185 if (name.compare(eSplit[0]) != 0) { 186 errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name); 187 188 #ifdef DEBUG_DETECT 189 for (int32_t m = 0; m < matchCount; m += 1) { 190 const char *name = ucsdet_getName(matches[m], &status); 191 const char *lang = ucsdet_getLanguage(matches[m], &status); 192 int32_t confidence = ucsdet_getConfidence(matches[m], &status); 193 194 printf("%s (%s) %d\n", name, lang, confidence); 195 } 196 #endif 197 goto bail; 198 } 199 200 if (splits > 1 && lang.compare(eSplit[1]) != 0) { 201 errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang); 202 goto bail; 203 } 204 205 decoded = NEW_ARRAY(UChar, testLength); 206 dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status); 207 208 if (testString.compare(decoded, dLength) != 0) { 209 errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string."); 210 211 #ifdef DEBUG_DETECT 212 for(int32_t i = 0; i < testLength; i += 1) { 213 if(testString[i] != decoded[i]) { 214 printf("Strings differ at byte %d\n", i); 215 break; 216 } 217 } 218 #endif 219 220 } 221 222 DELETE_ARRAY(decoded); 223 224 bail: 225 freeBytes(bytes); 226 delete[] eSplit; 227 } 228 229 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) { 230 UErrorCode status = U_ZERO_ERROR; 231 const char *testDataDirectory = IntlTest::getSourceTestData(status); 232 233 if (U_FAILURE(status)) { 234 errln("ERROR: getPath() failed - %s", u_errorName(status)); 235 return NULL; 236 } 237 238 strcpy(buffer, testDataDirectory); 239 strcat(buffer, filename); 240 return buffer; 241 } 242 243 void CharsetDetectionTest::ConstructionTest() 244 { 245 IcuTestErrorCode status(*this, "ConstructionTest"); 246 LocalUCharsetDetectorPointer csd(ucsdet_open(status)); 247 LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status)); 248 int32_t count = uenum_count(e.getAlias(), status); 249 250 #ifdef DEBUG_DETECT 251 printf("There are %d recognizers.\n", count); 252 #endif 253 254 for(int32_t i = 0; i < count; i += 1) { 255 int32_t length; 256 const char *name = uenum_next(e.getAlias(), &length, status); 257 258 if(name == NULL || length <= 0) { 259 errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!"); 260 } 261 262 #ifdef DEBUG_DETECT 263 printf("%s\n", name); 264 #endif 265 } 266 } 267 268 void CharsetDetectionTest::UTF8Test() 269 { 270 UErrorCode status = U_ZERO_ERROR; 271 UnicodeString ss = "This is a string with some non-ascii characters that will " 272 "be converted to UTF-8, then shoved through the detection process. " 273 "\\u0391\\u0392\\u0393\\u0394\\u0395" 274 "Sure would be nice if our source could contain Unicode directly!"; 275 UnicodeString s = ss.unescape(); 276 int32_t byteLength = 0, sLength = s.length(); 277 char *bytes = extractBytes(s, "UTF-8", byteLength); 278 UCharsetDetector *csd = ucsdet_open(&status); 279 const UCharsetMatch *match; 280 UChar *detected = NEW_ARRAY(UChar, sLength); 281 282 ucsdet_setText(csd, bytes, byteLength, &status); 283 match = ucsdet_detect(csd, &status); 284 285 if (match == NULL) { 286 errln("Detection failure for UTF-8: got no matches."); 287 goto bail; 288 } 289 290 ucsdet_getUChars(match, detected, sLength, &status); 291 292 if (s.compare(detected, sLength) != 0) { 293 errln("Round-trip test failed!"); 294 } 295 296 ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */ 297 298 bail: 299 DELETE_ARRAY(detected); 300 freeBytes(bytes); 301 ucsdet_close(csd); 302 } 303 304 void CharsetDetectionTest::UTF16Test() 305 { 306 UErrorCode status = U_ZERO_ERROR; 307 /* Notice the BOM on the start of this string */ 308 UChar chars[] = { 309 0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C, 310 0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a, 311 0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628, 312 0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646, 313 0x064a, 0x062a, 0x0000}; 314 UnicodeString s(chars); 315 int32_t beLength = 0, leLength = 0; 316 char *beBytes = extractBytes(s, "UTF-16BE", beLength); 317 char *leBytes = extractBytes(s, "UTF-16LE", leLength); 318 UCharsetDetector *csd = ucsdet_open(&status); 319 const UCharsetMatch *match; 320 const char *name; 321 int32_t conf; 322 323 ucsdet_setText(csd, beBytes, beLength, &status); 324 match = ucsdet_detect(csd, &status); 325 326 if (match == NULL) { 327 errln("Encoding detection failure for UTF-16BE: got no matches."); 328 goto try_le; 329 } 330 331 name = ucsdet_getName(match, &status); 332 conf = ucsdet_getConfidence(match, &status); 333 334 if (strcmp(name, "UTF-16BE") != 0) { 335 errln("Encoding detection failure for UTF-16BE: got %s", name); 336 goto try_le; // no point in looking at confidence if we got the wrong character set. 337 } 338 339 if (conf != 100) { 340 errln("Did not get 100%% confidence for UTF-16BE: got %d", conf); 341 } 342 343 try_le: 344 ucsdet_setText(csd, leBytes, leLength, &status); 345 match = ucsdet_detect(csd, &status); 346 347 if (match == NULL) { 348 errln("Encoding detection failure for UTF-16LE: got no matches."); 349 goto bail; 350 } 351 352 name = ucsdet_getName(match, &status); 353 conf = ucsdet_getConfidence(match, &status); 354 355 356 if (strcmp(name, "UTF-16LE") != 0) { 357 errln("Enconding detection failure for UTF-16LE: got %s", name); 358 goto bail; // no point in looking at confidence if we got the wrong character set. 359 } 360 361 if (conf != 100) { 362 errln("Did not get 100%% confidence for UTF-16LE: got %d", conf); 363 } 364 365 bail: 366 freeBytes(leBytes); 367 freeBytes(beBytes); 368 ucsdet_close(csd); 369 } 370 371 void CharsetDetectionTest::InputFilterTest() 372 { 373 UErrorCode status = U_ZERO_ERROR; 374 UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>"; 375 UnicodeString s = ss.unescape(); 376 int32_t byteLength = 0; 377 char *bytes = extractBytes(s, "ISO-8859-1", byteLength); 378 UCharsetDetector *csd = ucsdet_open(&status); 379 const UCharsetMatch *match; 380 const char *lang, *name; 381 382 ucsdet_enableInputFilter(csd, TRUE); 383 384 if (!ucsdet_isInputFilterEnabled(csd)) { 385 errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!"); 386 } 387 388 389 ucsdet_setText(csd, bytes, byteLength, &status); 390 match = ucsdet_detect(csd, &status); 391 392 if (match == NULL) { 393 errln("Turning on the input filter resulted in no matches."); 394 goto turn_off; 395 } 396 397 name = ucsdet_getName(match, &status); 398 399 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { 400 errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name); 401 } else { 402 lang = ucsdet_getLanguage(match, &status); 403 404 if (lang == NULL || strcmp(lang, "fr") != 0) { 405 errln("Input filter did not strip markup!"); 406 } 407 } 408 409 turn_off: 410 ucsdet_enableInputFilter(csd, FALSE); 411 ucsdet_setText(csd, bytes, byteLength, &status); 412 match = ucsdet_detect(csd, &status); 413 414 if (match == NULL) { 415 errln("Turning off the input filter resulted in no matches."); 416 goto bail; 417 } 418 419 name = ucsdet_getName(match, &status); 420 421 if (name == NULL || strcmp(name, "ISO-8859-1") != 0) { 422 errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name); 423 } else { 424 lang = ucsdet_getLanguage(match, &status); 425 426 if (lang == NULL || strcmp(lang, "en") != 0) { 427 errln("Unfiltered input did not detect as English!"); 428 } 429 } 430 431 bail: 432 freeBytes(bytes); 433 ucsdet_close(csd); 434 } 435 436 void CharsetDetectionTest::C1BytesTest() 437 { 438 #if !UCONFIG_NO_LEGACY_CONVERSION 439 UErrorCode status = U_ZERO_ERROR; 440 UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly."; 441 UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV); 442 UnicodeString sWindows = ssWindows.unescape(); 443 int32_t lISO = 0, lWindows = 0; 444 char *bISO = extractBytes(sISO, "ISO-8859-1", lISO); 445 char *bWindows = extractBytes(sWindows, "windows-1252", lWindows); 446 UCharsetDetector *csd = ucsdet_open(&status); 447 const UCharsetMatch *match; 448 const char *name; 449 450 ucsdet_setText(csd, bWindows, lWindows, &status); 451 match = ucsdet_detect(csd, &status); 452 453 if (match == NULL) { 454 errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status)); 455 goto bail; 456 } 457 458 name = ucsdet_getName(match, &status); 459 460 if (strcmp(name, "windows-1252") != 0) { 461 errln("English text with C1 bytes does not detect as windows-1252, but as %s", name); 462 } 463 464 ucsdet_setText(csd, bISO, lISO, &status); 465 match = ucsdet_detect(csd, &status); 466 467 if (match == NULL) { 468 errln("English text without C1 bytes got no matches."); 469 goto bail; 470 } 471 472 name = ucsdet_getName(match, &status); 473 474 if (strcmp(name, "ISO-8859-1") != 0) { 475 errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name); 476 } 477 478 bail: 479 freeBytes(bWindows); 480 freeBytes(bISO); 481 482 ucsdet_close(csd); 483 #endif 484 } 485 486 void CharsetDetectionTest::DetectionTest() 487 { 488 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 489 UErrorCode status = U_ZERO_ERROR; 490 char path[2048]; 491 const char *testFilePath = getPath(path, "csdetest.xml"); 492 493 if (testFilePath == NULL) { 494 return; /* Couldn't get path: error message already output. */ 495 } 496 497 UXMLParser *parser = UXMLParser::createParser(status); 498 if (U_FAILURE(status)) { 499 dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status)); 500 return; 501 } 502 503 UXMLElement *root = parser->parseFile(testFilePath, status); 504 if (!assertSuccess( "parseFile",status)) return; 505 506 UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case"); 507 UnicodeString id_attr = UNICODE_STRING_SIMPLE("id"); 508 UnicodeString enc_attr = UNICODE_STRING_SIMPLE("encodings"); 509 510 const UXMLElement *testCase; 511 int32_t tc = 0; 512 513 while((testCase = root->nextChildElement(tc)) != NULL) { 514 if (testCase->getTagName().compare(test_case) == 0) { 515 const UnicodeString *id = testCase->getAttribute(id_attr); 516 const UnicodeString *encodings = testCase->getAttribute(enc_attr); 517 const UnicodeString text = testCase->getText(TRUE); 518 int32_t encodingCount; 519 UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount); 520 521 for(int32_t e = 0; e < encodingCount; e += 1) { 522 checkEncoding(text, encodingList[e], *id); 523 } 524 525 delete[] encodingList; 526 } 527 } 528 529 delete root; 530 delete parser; 531 #endif 532 } 533 534 void CharsetDetectionTest::IBM424Test() 535 { 536 UErrorCode status = U_ZERO_ERROR; 537 538 static const UChar chars[] = { 539 0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8, 540 0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9, 541 0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8, 542 0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA, 543 0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5, 544 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE, 545 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 546 0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC, 547 0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3, 548 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020, 549 0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC, 550 0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 551 0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 552 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 553 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC, 554 0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1, 555 0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000 556 }; 557 558 static const UChar chars_reverse[] = { 559 0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA, 560 0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8, 561 0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 562 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 563 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9, 564 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4, 565 0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9, 566 0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5, 567 0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3, 568 0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020, 569 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 570 0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9, 571 0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020, 572 0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4, 573 0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7, 574 0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0, 575 0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4, 576 0x0000 577 }; 578 579 int32_t bLength = 0, brLength = 0; 580 581 UnicodeString s1(chars); 582 UnicodeString s2(chars_reverse); 583 584 char *bytes = extractBytes(s1, "IBM424", bLength); 585 char *bytes_r = extractBytes(s2, "IBM424", brLength); 586 587 UCharsetDetector *csd = ucsdet_open(&status); 588 if (U_FAILURE(status)) { 589 errln("Error opening charset detector. - %s", u_errorName(status)); 590 } 591 const UCharsetMatch *match; 592 const char *name; 593 594 ucsdet_setText(csd, bytes, bLength, &status); 595 match = ucsdet_detect(csd, &status); 596 597 if (match == NULL) { 598 errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status)); 599 goto bail; 600 } 601 602 name = ucsdet_getName(match, &status); 603 if (strcmp(name, "IBM424_rtl") != 0) { 604 errln("Encoding detection failure for IBM424_rtl: got %s", name); 605 } 606 607 ucsdet_setText(csd, bytes_r, brLength, &status); 608 match = ucsdet_detect(csd, &status); 609 610 if (match == NULL) { 611 errln("Encoding detection failure for IBM424_ltr: got no matches."); 612 goto bail; 613 } 614 615 name = ucsdet_getName(match, &status); 616 if (strcmp(name, "IBM424_ltr") != 0) { 617 errln("Encoding detection failure for IBM424_ltr: got %s", name); 618 } 619 620 bail: 621 freeBytes(bytes); 622 freeBytes(bytes_r); 623 ucsdet_close(csd); 624 } 625 626 void CharsetDetectionTest::IBM420Test() 627 { 628 UErrorCode status = U_ZERO_ERROR; 629 630 static const UChar chars[] = { 631 0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627, 632 0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641, 633 0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 634 0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645, 635 0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A, 636 0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644, 637 0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020, 638 0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 639 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634, 640 0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F, 641 0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647, 642 0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627, 643 0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E, 644 0x0000 645 }; 646 static const UChar chars_reverse[] = { 647 0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F, 648 0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020, 649 0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648, 650 0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628, 651 0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 652 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A, 653 0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644, 654 0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A, 655 0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A, 656 0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627, 657 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A, 658 0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645, 659 0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648, 660 0x0000, 661 }; 662 663 int32_t bLength = 0, brLength = 0; 664 665 UnicodeString s1(chars); 666 UnicodeString s2(chars_reverse); 667 668 char *bytes = extractBytes(s1, "IBM420", bLength); 669 char *bytes_r = extractBytes(s2, "IBM420", brLength); 670 671 UCharsetDetector *csd = ucsdet_open(&status); 672 if (U_FAILURE(status)) { 673 errln("Error opening charset detector. - %s", u_errorName(status)); 674 } 675 const UCharsetMatch *match; 676 const char *name; 677 678 ucsdet_setText(csd, bytes, bLength, &status); 679 match = ucsdet_detect(csd, &status); 680 681 if (match == NULL) { 682 errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status)); 683 goto bail; 684 } 685 686 name = ucsdet_getName(match, &status); 687 if (strcmp(name, "IBM420_rtl") != 0) { 688 errln("Encoding detection failure for IBM420_rtl: got %s\n", name); 689 } 690 691 ucsdet_setText(csd, bytes_r, brLength, &status); 692 match = ucsdet_detect(csd, &status); 693 694 if (match == NULL) { 695 errln("Encoding detection failure for IBM420_ltr: got no matches.\n"); 696 goto bail; 697 } 698 699 name = ucsdet_getName(match, &status); 700 if (strcmp(name, "IBM420_ltr") != 0) { 701 errln("Encoding detection failure for IBM420_ltr: got %s\n", name); 702 } 703 704 bail: 705 freeBytes(bytes); 706 freeBytes(bytes_r); 707 ucsdet_close(csd); 708 } 709 710 711 void CharsetDetectionTest::Ticket6394Test() { 712 #if !UCONFIG_NO_CONVERSION 713 const char charText[] = "Here is some random English text that should be detected as ISO-8859-1." 714 "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected " 715 "encodings more than once. The hop through UnicodeString is for platforms " 716 "where this char * string is be EBCDIC and needs conversion to Latin1."; 717 char latin1Text[sizeof(charText)]; 718 UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1"); 719 720 UErrorCode status = U_ZERO_ERROR; 721 UCharsetDetector *csd = ucsdet_open(&status); 722 ucsdet_setText(csd, latin1Text, -1, &status); 723 if (U_FAILURE(status)) { 724 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status)); 725 return; 726 } 727 728 int32_t matchCount = 0; 729 const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status); 730 if (U_FAILURE(status)) { 731 errln("Fail at file %s, line %d. status = %s", __FILE__, __LINE__, u_errorName(status)); 732 return; 733 } 734 735 UnicodeSet setOfCharsetNames; // UnicodSets can hold strings. 736 int32_t i; 737 for (i=0; i<matchCount; i++) { 738 UnicodeString charSetName(ucsdet_getName(matches[i], &status)); 739 if (U_FAILURE(status)) { 740 errln("Fail at file %s, line %d. status = %s; i=%d", __FILE__, __LINE__, u_errorName(status), i); 741 status = U_ZERO_ERROR; 742 } 743 if (setOfCharsetNames.contains(charSetName)) { 744 errln("Fail at file %s, line %d ", __FILE__, __LINE__); 745 errln(UnicodeString(" Duplicate charset name = ") + charSetName); 746 } 747 setOfCharsetNames.add(charSetName); 748 } 749 ucsdet_close(csd); 750 #endif 751 } 752 753