Home | History | Annotate | Download | only in intltest
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4  **********************************************************************
      5  *   Copyright (C) 2005-2016, International Business Machines
      6  *   Corporation and others.  All Rights Reserved.
      7  **********************************************************************
      8  */
      9 
     10 
     11 #include "unicode/utypes.h"
     12 #include "unicode/ucsdet.h"
     13 #include "unicode/ucnv.h"
     14 #include "unicode/unistr.h"
     15 #include "unicode/putil.h"
     16 #include "unicode/uniset.h"
     17 
     18 #include "intltest.h"
     19 #include "csdetest.h"
     20 
     21 #include "xmlparser.h"
     22 
     23 #include <stdlib.h>
     24 #include <string.h>
     25 
     26 #ifdef DEBUG_DETECT
     27 #include <stdio.h>
     28 #endif
     29 
// Allocate an uninitialized array of `count` elements of `type` with malloc().
// The result is not checked for NULL here; release it with DELETE_ARRAY.
#define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
// Release an array obtained from NEW_ARRAY by handing it to free().
#define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))

// Code units used as separators by the tests in this file:
// CH_SPACE splits the list of encodings, CH_SLASH splits "encoding/language".
#define CH_SPACE 0x0020
#define CH_SLASH 0x002F

// Report a failure (with source file and line) through errln() when `x` is false.
// Execution continues after the report.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// Report a failure through errcheckln() when `errcode` is an ICU failure code,
// then return from the enclosing function (usable only in functions returning void).
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));\
    return;}}
     42 
     43 
     44 //---------------------------------------------------------------------------
     45 //
     46 //  Test class boilerplate
     47 //
     48 //---------------------------------------------------------------------------
// Constructor: the body is empty, no per-test state is set up here.
CharsetDetectionTest::CharsetDetectionTest()
{
}
     52 
     53 
// Destructor: the body is empty, nothing is released here.
CharsetDetectionTest::~CharsetDetectionTest()
{
}
     57 
     58 
     59 
     60 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
     61 {
     62     if (exec) logln("TestSuite CharsetDetectionTest: ");
     63     switch (index) {
     64        case 0: name = "ConstructionTest";
     65             if (exec) ConstructionTest();
     66             break;
     67 
     68        case 1: name = "UTF8Test";
     69             if (exec) UTF8Test();
     70             break;
     71 
     72        case 2: name = "UTF16Test";
     73             if (exec) UTF16Test();
     74             break;
     75 
     76        case 3: name = "C1BytesTest";
     77             if (exec) C1BytesTest();
     78             break;
     79 
     80        case 4: name = "InputFilterTest";
     81             if (exec) InputFilterTest();
     82             break;
     83 
     84        case 5: name = "DetectionTest";
     85             if (exec) DetectionTest();
     86             break;
     87 #if !UCONFIG_NO_LEGACY_CONVERSION
     88        case 6: name = "IBM424Test";
     89             if (exec) IBM424Test();
     90             break;
     91 
     92        case 7: name = "IBM420Test";
     93             if (exec) IBM420Test();
     94             break;
     95 #else
     96        case 6:
     97        case 7: name = "skip"; break;
     98 #endif
     99        case 8: name = "Ticket6394Test";
    100             if (exec) Ticket6394Test();
    101             break;
    102 
    103        case 9: name = "Ticket6954Test";
    104             if (exec) Ticket6954Test();
    105             break;
    106 
    107         default: name = "";
    108             break; //needed to end loop
    109     }
    110 }
    111 
    112 static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
    113 {
    114     int32_t offset = -1;
    115 
    116     splits = 1;
    117     while((offset = src.indexOf(ch, offset + 1)) >= 0) {
    118         splits += 1;
    119     }
    120 
    121     UnicodeString *result = new UnicodeString[splits];
    122 
    123     int32_t start = 0;
    124     int32_t split = 0;
    125     int32_t end;
    126 
    127     while((end = src.indexOf(ch, start)) >= 0) {
    128         src.extractBetween(start, end, result[split++]);
    129         start = end + 1;
    130     }
    131 
    132     src.extractBetween(start, src.length(), result[split]);
    133 
    134     return result;
    135 }
    136 
    137 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
    138 {
    139     int32_t sLength = source.length();
    140     char *bytes = NULL;
    141 
    142     length = source.extract(0, sLength, NULL, codepage);
    143 
    144     if (length > 0) {
    145         bytes = NEW_ARRAY(char, length + 1);
    146         source.extract(0, sLength, bytes, codepage);
    147     }
    148 
    149     return bytes;
    150 }
    151 
// Release a buffer returned by extractBytes(); forwards to DELETE_ARRAY.
static void freeBytes(char *bytes)
{
    DELETE_ARRAY(bytes);
}
    156 
    157 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
    158 {
    159     int32_t splits = 0;
    160     int32_t testLength = testString.length();
    161     UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
    162     UErrorCode status = U_ZERO_ERROR;
    163     int32_t cpLength = eSplit[0].length();
    164     char codepage[64];
    165 
    166     u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
    167     codepage[cpLength] = '\0';
    168 
    169     LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
    170 
    171     int32_t byteLength = 0;
    172     char *bytes = extractBytes(testString, codepage, byteLength);
    173 
    174     if (bytes == NULL) {
    175 #if !UCONFIG_NO_LEGACY_CONVERSION
    176         dataerrln("Can't open a " + encoding + " converter for " + id);
    177 #endif
    178         return;
    179     }
    180 
    181     ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);
    182 
    183     int32_t matchCount = 0;
    184     const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
    185 
    186 
    187     UnicodeString name(ucsdet_getName(matches[0], &status));
    188     UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
    189     UChar *decoded = NULL;
    190     int32_t dLength = 0;
    191 
    192     if (matchCount == 0) {
    193         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
    194         goto bail;
    195     }
    196 
    197     if (name.compare(eSplit[0]) != 0) {
    198         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
    199 
    200 #ifdef DEBUG_DETECT
    201         for (int32_t m = 0; m < matchCount; m += 1) {
    202             const char *name = ucsdet_getName(matches[m], &status);
    203             const char *lang = ucsdet_getLanguage(matches[m], &status);
    204             int32_t confidence = ucsdet_getConfidence(matches[m], &status);
    205 
    206             printf("%s (%s) %d\n", name, lang, confidence);
    207         }
    208 #endif
    209         goto bail;
    210     }
    211 
    212     if (splits > 1 && lang.compare(eSplit[1]) != 0) {
    213         errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
    214         goto bail;
    215     }
    216 
    217     decoded = NEW_ARRAY(UChar, testLength);
    218     dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
    219 
    220     if (testString.compare(decoded, dLength) != 0) {
    221         errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
    222 
    223 #ifdef DEBUG_DETECT
    224         for(int32_t i = 0; i < testLength; i += 1) {
    225             if(testString[i] != decoded[i]) {
    226                 printf("Strings differ at byte %d\n", i);
    227                 break;
    228             }
    229         }
    230 #endif
    231 
    232     }
    233 
    234     DELETE_ARRAY(decoded);
    235 
    236 bail:
    237     freeBytes(bytes);
    238     delete[] eSplit;
    239 }
    240 
    241 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
    242     UErrorCode status = U_ZERO_ERROR;
    243     const char *testDataDirectory = IntlTest::getSourceTestData(status);
    244 
    245     if (U_FAILURE(status)) {
    246         errln("ERROR: getPath() failed - %s", u_errorName(status));
    247         return NULL;
    248     }
    249 
    250     strcpy(buffer, testDataDirectory);
    251     strcat(buffer, filename);
    252     return buffer;
    253 }
    254 
    255 void CharsetDetectionTest::ConstructionTest()
    256 {
    257     IcuTestErrorCode status(*this, "ConstructionTest");
    258     LocalUCharsetDetectorPointer csd(ucsdet_open(status));
    259     LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
    260     int32_t count = uenum_count(e.getAlias(), status);
    261 
    262 #ifdef DEBUG_DETECT
    263     printf("There are %d recognizers.\n", count);
    264 #endif
    265 
    266     for(int32_t i = 0; i < count; i += 1) {
    267         int32_t length;
    268         const char *name = uenum_next(e.getAlias(), &length, status);
    269 
    270         if(name == NULL || length <= 0) {
    271             errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
    272         }
    273 
    274 #ifdef DEBUG_DETECT
    275         printf("%s\n", name);
    276 #endif
    277     }
    278 
    279     const char* defDisabled[] = {
    280         "IBM420_rtl", "IBM420_ltr",
    281         "IBM424_rtl", "IBM424_ltr",
    282         0
    283     };
    284 
    285     LocalUEnumerationPointer eActive(ucsdet_getDetectableCharsets(csd.getAlias(), status));
    286     const char *activeName = NULL;
    287 
    288     while ((activeName = uenum_next(eActive.getAlias(), NULL, status))) {
    289         // the charset must be included in all list
    290         UBool found = FALSE;
    291 
    292         const char *name = NULL;
    293         uenum_reset(e.getAlias(), status);
    294         while ((name = uenum_next(e.getAlias(), NULL, status))) {
    295             if (strcmp(activeName, name) == 0) {
    296                 found = TRUE;
    297                 break;
    298             }
    299         }
    300 
    301         if (!found) {
    302             errln(UnicodeString(activeName) + " is not included in the all charset list.");
    303         }
    304 
    305         // some charsets are disabled by default
    306         found = FALSE;
    307         for (int32_t i = 0; defDisabled[i] != 0; i++) {
    308             if (strcmp(activeName, defDisabled[i]) == 0) {
    309                 found = TRUE;
    310                 break;
    311             }
    312         }
    313         if (found) {
    314             errln(UnicodeString(activeName) + " should not be included in the default charset list.");
    315         }
    316     }
    317 }
    318 
    319 void CharsetDetectionTest::UTF8Test()
    320 {
    321     UErrorCode status = U_ZERO_ERROR;
    322     UnicodeString ss = "This is a string with some non-ascii characters that will "
    323                        "be converted to UTF-8, then shoved through the detection process.  "
    324                        "\\u0391\\u0392\\u0393\\u0394\\u0395"
    325                        "Sure would be nice if our source could contain Unicode directly!";
    326     UnicodeString s = ss.unescape();
    327     int32_t byteLength = 0, sLength = s.length();
    328     char *bytes = extractBytes(s, "UTF-8", byteLength);
    329     UCharsetDetector *csd = ucsdet_open(&status);
    330     const UCharsetMatch *match;
    331     UChar *detected = NEW_ARRAY(UChar, sLength);
    332 
    333     ucsdet_setText(csd, bytes, byteLength, &status);
    334     match = ucsdet_detect(csd, &status);
    335 
    336     if (match == NULL) {
    337         errln("Detection failure for UTF-8: got no matches.");
    338         goto bail;
    339     }
    340 
    341     ucsdet_getUChars(match, detected, sLength, &status);
    342 
    343     if (s.compare(detected, sLength) != 0) {
    344         errln("Round-trip test failed!");
    345     }
    346 
    347     ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
    348 
    349 bail:
    350     DELETE_ARRAY(detected);
    351     freeBytes(bytes);
    352     ucsdet_close(csd);
    353 }
    354 
    355 void CharsetDetectionTest::UTF16Test()
    356 {
    357     UErrorCode status = U_ZERO_ERROR;
    358     /* Notice the BOM on the start of this string */
    359     UChar chars[] = {
    360         0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
    361         0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
    362         0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
    363         0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
    364         0x064a, 0x062a, 0x0000};
    365     UnicodeString s(chars);
    366     int32_t beLength = 0, leLength = 0;
    367     char *beBytes = extractBytes(s, "UTF-16BE", beLength);
    368     char *leBytes = extractBytes(s, "UTF-16LE", leLength);
    369     UCharsetDetector *csd = ucsdet_open(&status);
    370     const UCharsetMatch *match;
    371     const char *name;
    372     int32_t conf;
    373 
    374     ucsdet_setText(csd, beBytes, beLength, &status);
    375     match = ucsdet_detect(csd, &status);
    376 
    377     if (match == NULL) {
    378         errln("Encoding detection failure for UTF-16BE: got no matches.");
    379         goto try_le;
    380     }
    381 
    382     name  = ucsdet_getName(match, &status);
    383     conf  = ucsdet_getConfidence(match, &status);
    384 
    385     if (strcmp(name, "UTF-16BE") != 0) {
    386         errln("Encoding detection failure for UTF-16BE: got %s", name);
    387         goto try_le; // no point in looking at confidence if we got the wrong character set.
    388     }
    389 
    390     if (conf != 100) {
    391         errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
    392     }
    393 
    394 try_le:
    395     ucsdet_setText(csd, leBytes, leLength, &status);
    396     match = ucsdet_detect(csd, &status);
    397 
    398     if (match == NULL) {
    399         errln("Encoding detection failure for UTF-16LE: got no matches.");
    400         goto bail;
    401     }
    402 
    403     name  = ucsdet_getName(match, &status);
    404     conf = ucsdet_getConfidence(match, &status);
    405 
    406 
    407     if (strcmp(name, "UTF-16LE") != 0) {
    408         errln("Enconding detection failure for UTF-16LE: got %s", name);
    409         goto bail; // no point in looking at confidence if we got the wrong character set.
    410     }
    411 
    412     if (conf != 100) {
    413         errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
    414     }
    415 
    416 bail:
    417     freeBytes(leBytes);
    418     freeBytes(beBytes);
    419     ucsdet_close(csd);
    420 }
    421 
    422 void CharsetDetectionTest::InputFilterTest()
    423 {
    424     UErrorCode status = U_ZERO_ERROR;
    425     UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
    426     UnicodeString s  = ss.unescape();
    427     int32_t byteLength = 0;
    428     char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
    429     UCharsetDetector *csd = ucsdet_open(&status);
    430     const UCharsetMatch *match;
    431     const char *lang, *name;
    432 
    433     ucsdet_enableInputFilter(csd, TRUE);
    434 
    435     if (!ucsdet_isInputFilterEnabled(csd)) {
    436         errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
    437     }
    438 
    439 
    440     ucsdet_setText(csd, bytes, byteLength, &status);
    441     match = ucsdet_detect(csd, &status);
    442 
    443     if (match == NULL) {
    444         errln("Turning on the input filter resulted in no matches.");
    445         goto turn_off;
    446     }
    447 
    448     name = ucsdet_getName(match, &status);
    449 
    450     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
    451         errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
    452     } else {
    453         lang = ucsdet_getLanguage(match, &status);
    454 
    455         if (lang == NULL || strcmp(lang, "fr") != 0) {
    456             errln("Input filter did not strip markup!");
    457         }
    458     }
    459 
    460 turn_off:
    461     ucsdet_enableInputFilter(csd, FALSE);
    462     ucsdet_setText(csd, bytes, byteLength, &status);
    463     match = ucsdet_detect(csd, &status);
    464 
    465     if (match == NULL) {
    466         errln("Turning off the input filter resulted in no matches.");
    467         goto bail;
    468     }
    469 
    470     name = ucsdet_getName(match, &status);
    471 
    472     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
    473         errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
    474     } else {
    475         lang = ucsdet_getLanguage(match, &status);
    476 
    477         if (lang == NULL || strcmp(lang, "en") != 0) {
    478             errln("Unfiltered input did not detect as English!");
    479         }
    480     }
    481 
    482 bail:
    483     freeBytes(bytes);
    484     ucsdet_close(csd);
    485 }
    486 
    487 void CharsetDetectionTest::C1BytesTest()
    488 {
    489 #if !UCONFIG_NO_LEGACY_CONVERSION
    490     UErrorCode status = U_ZERO_ERROR;
    491     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
    492     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
    493     UnicodeString sWindows  = ssWindows.unescape();
    494     int32_t lISO = 0, lWindows = 0;
    495     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
    496     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
    497     UCharsetDetector *csd = ucsdet_open(&status);
    498     const UCharsetMatch *match;
    499     const char *name;
    500 
    501     ucsdet_setText(csd, bWindows, lWindows, &status);
    502     match = ucsdet_detect(csd, &status);
    503 
    504     if (match == NULL) {
    505         errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
    506         goto bail;
    507     }
    508 
    509     name  = ucsdet_getName(match, &status);
    510 
    511     if (strcmp(name, "windows-1252") != 0) {
    512         errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
    513     }
    514 
    515     ucsdet_setText(csd, bISO, lISO, &status);
    516     match = ucsdet_detect(csd, &status);
    517 
    518     if (match == NULL) {
    519         errln("English text without C1 bytes got no matches.");
    520         goto bail;
    521     }
    522 
    523     name  = ucsdet_getName(match, &status);
    524 
    525     if (strcmp(name, "ISO-8859-1") != 0) {
    526         errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
    527     }
    528 
    529 bail:
    530     freeBytes(bWindows);
    531     freeBytes(bISO);
    532 
    533     ucsdet_close(csd);
    534 #endif
    535 }
    536 
    537 void CharsetDetectionTest::DetectionTest()
    538 {
    539 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
    540     UErrorCode status = U_ZERO_ERROR;
    541     char path[2048];
    542     const char *testFilePath = getPath(path, "csdetest.xml");
    543 
    544     if (testFilePath == NULL) {
    545         return; /* Couldn't get path: error message already output. */
    546     }
    547 
    548     UXMLParser  *parser = UXMLParser::createParser(status);
    549     if (U_FAILURE(status)) {
    550         dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
    551         return;
    552     }
    553 
    554     UXMLElement *root   = parser->parseFile(testFilePath, status);
    555     if (!assertSuccess( "parseFile",status)) return;
    556 
    557     UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
    558     UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
    559     UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");
    560 
    561     const UXMLElement *testCase;
    562     int32_t tc = 0;
    563 
    564     while((testCase = root->nextChildElement(tc)) != NULL) {
    565         if (testCase->getTagName().compare(test_case) == 0) {
    566             const UnicodeString *id = testCase->getAttribute(id_attr);
    567             const UnicodeString *encodings = testCase->getAttribute(enc_attr);
    568             const UnicodeString  text = testCase->getText(TRUE);
    569             int32_t encodingCount;
    570             UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
    571 
    572             for(int32_t e = 0; e < encodingCount; e += 1) {
    573                 checkEncoding(text, encodingList[e], *id);
    574             }
    575 
    576             delete[] encodingList;
    577         }
    578     }
    579 
    580     delete root;
    581     delete parser;
    582 #endif
    583 }
    584 
    585 void CharsetDetectionTest::IBM424Test()
    586 {
    587 #if !UCONFIG_ONLY_HTML_CONVERSION
    588     UErrorCode status = U_ZERO_ERROR;
    589 
    590     static const UChar chars[] = {
    591             0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
    592             0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
    593             0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
    594             0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
    595             0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
    596             0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
    597             0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
    598             0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
    599             0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
    600             0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
    601             0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
    602             0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
    603             0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
    604             0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
    605             0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
    606             0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
    607             0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
    608     };
    609 
    610     static const UChar chars_reverse[] = {
    611             0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
    612             0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
    613             0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
    614             0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
    615             0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
    616             0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
    617             0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
    618             0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
    619             0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
    620             0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
    621             0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
    622             0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
    623             0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
    624             0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
    625             0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
    626             0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
    627             0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
    628             0x0000
    629     };
    630 
    631     int32_t bLength = 0, brLength = 0;
    632 
    633     UnicodeString s1(chars);
    634     UnicodeString s2(chars_reverse);
    635 
    636     char *bytes = extractBytes(s1, "IBM424", bLength);
    637     char *bytes_r = extractBytes(s2, "IBM424", brLength);
    638 
    639     UCharsetDetector *csd = ucsdet_open(&status);
    640 	ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
    641 	ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
    642 	ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
    643 	ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
    644     if (U_FAILURE(status)) {
    645         errln("Error opening charset detector. - %s", u_errorName(status));
    646     }
    647     const UCharsetMatch *match;
    648     const char *name;
    649 
    650     ucsdet_setText(csd, bytes, bLength, &status);
    651     match = ucsdet_detect(csd, &status);
    652 
    653     if (match == NULL) {
    654         errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
    655         goto bail;
    656     }
    657 
    658     name  = ucsdet_getName(match, &status);
    659     if (strcmp(name, "IBM424_rtl") != 0) {
    660         errln("Encoding detection failure for IBM424_rtl: got %s", name);
    661     }
    662 
    663     ucsdet_setText(csd, bytes_r, brLength, &status);
    664     match = ucsdet_detect(csd, &status);
    665 
    666     if (match == NULL) {
    667         errln("Encoding detection failure for IBM424_ltr: got no matches.");
    668         goto bail;
    669     }
    670 
    671     name  = ucsdet_getName(match, &status);
    672     if (strcmp(name, "IBM424_ltr") != 0) {
    673         errln("Encoding detection failure for IBM424_ltr: got %s", name);
    674     }
    675 
    676 bail:
    677     freeBytes(bytes);
    678     freeBytes(bytes_r);
    679     ucsdet_close(csd);
    680 #endif
    681 }
    682 
    683 void CharsetDetectionTest::IBM420Test()
    684 {
    685 #if !UCONFIG_ONLY_HTML_CONVERSION
    686     UErrorCode status = U_ZERO_ERROR;
    687 
    688     static const UChar chars[] = {
    689         0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
    690         0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
    691         0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
    692         0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
    693         0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
    694         0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
    695         0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
    696         0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
    697         0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
    698         0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
    699         0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
    700         0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
    701         0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
    702         0x0000
    703     };
    704     static const UChar chars_reverse[] = {
    705         0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
    706         0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
    707         0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
    708         0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
    709         0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
    710         0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
    711         0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
    712         0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
    713         0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
    714         0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
    715         0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
    716         0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
    717         0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
    718         0x0000,
    719     };
    720 
    721     int32_t bLength = 0, brLength = 0;
    722 
    723     UnicodeString s1(chars);
    724     UnicodeString s2(chars_reverse);
    725 
    726     char *bytes = extractBytes(s1, "IBM420", bLength);
    727     char *bytes_r = extractBytes(s2, "IBM420", brLength);
    728 
    729     UCharsetDetector *csd = ucsdet_open(&status);
    730     if (U_FAILURE(status)) {
    731         errln("Error opening charset detector. - %s", u_errorName(status));
    732     }
    733 	ucsdet_setDetectableCharset(csd, "IBM424_rtl", TRUE, &status);
    734 	ucsdet_setDetectableCharset(csd, "IBM424_ltr", TRUE, &status);
    735 	ucsdet_setDetectableCharset(csd, "IBM420_rtl", TRUE, &status);
    736 	ucsdet_setDetectableCharset(csd, "IBM420_ltr", TRUE, &status);
    737     const UCharsetMatch *match;
    738     const char *name;
    739 
    740     ucsdet_setText(csd, bytes, bLength, &status);
    741     match = ucsdet_detect(csd, &status);
    742 
    743     if (match == NULL) {
    744         errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
    745         goto bail;
    746     }
    747 
    748     name  = ucsdet_getName(match, &status);
    749     if (strcmp(name, "IBM420_rtl") != 0) {
    750         errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
    751     }
    752 
    753     ucsdet_setText(csd, bytes_r, brLength, &status);
    754     match = ucsdet_detect(csd, &status);
    755 
    756     if (match == NULL) {
    757         errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
    758         goto bail;
    759     }
    760 
    761     name  = ucsdet_getName(match, &status);
    762     if (strcmp(name, "IBM420_ltr") != 0) {
    763         errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
    764     }
    765 
    766 bail:
    767     freeBytes(bytes);
    768     freeBytes(bytes_r);
    769     ucsdet_close(csd);
    770 #endif
    771 }
    772 
    773 
    774 void CharsetDetectionTest::Ticket6394Test() {
    775 #if !UCONFIG_NO_CONVERSION
    776     const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
    777                              "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
    778                              "encodings more than once.  The hop through UnicodeString is for platforms "
    779                              "where this char * string is be EBCDIC and needs conversion to Latin1.";
    780     char latin1Text[sizeof(charText)];
    781     UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
    782 
    783     UErrorCode status = U_ZERO_ERROR;
    784     UCharsetDetector *csd = ucsdet_open(&status);
    785     ucsdet_setText(csd, latin1Text, -1, &status);
    786     if (U_FAILURE(status)) {
    787         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
    788         return;
    789     }
    790 
    791     int32_t matchCount = 0;
    792     const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
    793     if (U_FAILURE(status)) {
    794         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
    795         return;
    796     }
    797 
    798     UnicodeSet  setOfCharsetNames;    // UnicodSets can hold strings.
    799     int32_t i;
    800     for (i=0; i<matchCount; i++) {
    801         UnicodeString charSetName(ucsdet_getName(matches[i], &status));
    802         if (U_FAILURE(status)) {
    803             errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
    804             status = U_ZERO_ERROR;
    805         }
    806         if (setOfCharsetNames.contains(charSetName)) {
    807             errln("Fail at file %s, line %d ", __FILE__, __LINE__);
    808             errln(UnicodeString("   Duplicate charset name = ") + charSetName);
    809         }
    810         setOfCharsetNames.add(charSetName);
    811     }
    812     ucsdet_close(csd);
    813 #endif
    814 }
    815 
    816 
    817 // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
    818 //               similar Windows and non-Windows SBCS encodings. State was kept in the shared
    819 //               Charset Recognizer objects, and could be overwritten.
    820 void CharsetDetectionTest::Ticket6954Test() {
    821 #if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION && !UCONFIG_NO_FORMATTING
    822     UErrorCode status = U_ZERO_ERROR;
    823     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
    824     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
    825                             "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
    826     UnicodeString sWindows  = ssWindows.unescape();
    827     int32_t lISO = 0, lWindows = 0;
    828     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
    829     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
    830 
    831     // First do a plain vanilla detect of 1252 text
    832 
    833     UCharsetDetector *csd1 = ucsdet_open(&status);
    834     ucsdet_setText(csd1, bWindows, lWindows, &status);
    835     const UCharsetMatch *match1 = ucsdet_detect(csd1, &status);
    836     const char *name1 = ucsdet_getName(match1, &status);
    837     TEST_ASSERT_SUCCESS(status);
    838     TEST_ASSERT(strcmp(name1, "windows-1252")==0);
    839 
    840     // Next, using a completely separate detector, detect some 8859-1 text
    841 
    842     UCharsetDetector *csd2 = ucsdet_open(&status);
    843     ucsdet_setText(csd2, bISO, lISO, &status);
    844     const UCharsetMatch *match2 = ucsdet_detect(csd2, &status);
    845     const char *name2 = ucsdet_getName(match2, &status);
    846     TEST_ASSERT_SUCCESS(status);
    847     TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
    848 
    849     // Recheck the 1252 results from the first detector, which should not have been
    850     //  altered by the use of a different detector.
    851 
    852     name1 = ucsdet_getName(match1, &status);
    853     TEST_ASSERT_SUCCESS(status);
    854     TEST_ASSERT(strcmp(name1, "windows-1252")==0);
    855 
    856     ucsdet_close(csd1);
    857     ucsdet_close(csd2);
    858     freeBytes(bISO);
    859     freeBytes(bWindows);
    860 #endif
    861 }
    862