Home | History | Annotate | Download | only in intltest
      1 /*
      2  **********************************************************************
      3  *   Copyright (C) 2005-2011, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  **********************************************************************
      6  */
      7 
      8 
      9 #include "unicode/utypes.h"
     10 #include "unicode/ucsdet.h"
     11 #include "unicode/ucnv.h"
     12 #include "unicode/unistr.h"
     13 #include "unicode/putil.h"
     14 #include "unicode/uniset.h"
     15 
     16 #include "intltest.h"
     17 #include "csdetest.h"
     18 
     19 #include "xmlparser.h"
     20 
     21 #include <stdlib.h>
     22 #include <string.h>
     23 
     24 #ifdef DEBUG_DETECT
     25 #include <stdio.h>
     26 #endif
     27 
     28 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
     29 
     30 #define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
     31 #define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))
     32 
     33 #define CH_SPACE 0x0020
     34 #define CH_SLASH 0x002F
     35 
     36 //---------------------------------------------------------------------------
     37 //
     38 //  Test class boilerplate
     39 //
     40 //---------------------------------------------------------------------------
     41 CharsetDetectionTest::CharsetDetectionTest()
     42 {
     43 }
     44 
     45 
     46 CharsetDetectionTest::~CharsetDetectionTest()
     47 {
     48 }
     49 
     50 
     51 
     52 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
     53 {
     54     if (exec) logln("TestSuite CharsetDetectionTest: ");
     55     switch (index) {
     56        case 0: name = "ConstructionTest";
     57             if (exec) ConstructionTest();
     58             break;
     59 
     60        case 1: name = "UTF8Test";
     61             if (exec) UTF8Test();
     62             break;
     63 
     64        case 2: name = "UTF16Test";
     65             if (exec) UTF16Test();
     66             break;
     67 
     68        case 3: name = "C1BytesTest";
     69             if (exec) C1BytesTest();
     70             break;
     71 
     72        case 4: name = "InputFilterTest";
     73             if (exec) InputFilterTest();
     74             break;
     75 
     76        case 5: name = "DetectionTest";
     77             if (exec) DetectionTest();
     78             break;
     79 #if !UCONFIG_NO_LEGACY_CONVERSION
     80        case 6: name = "IBM424Test";
     81             if (exec) IBM424Test();
     82             break;
     83 
     84        case 7: name = "IBM420Test";
     85             if (exec) IBM420Test();
     86             break;
     87 #else
     88        case 6:
     89        case 7: name = "skip"; break;
     90 #endif
     91        case 8: name = "Ticket6394Test";
     92             if (exec) Ticket6394Test();
     93             break;
     94 
     95         default: name = "";
     96             break; //needed to end loop
     97     }
     98 }
     99 
    100 static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
    101 {
    102     int32_t offset = -1;
    103 
    104     splits = 1;
    105     while((offset = src.indexOf(ch, offset + 1)) >= 0) {
    106         splits += 1;
    107     }
    108 
    109     UnicodeString *result = new UnicodeString[splits];
    110 
    111     int32_t start = 0;
    112     int32_t split = 0;
    113     int32_t end;
    114 
    115     while((end = src.indexOf(ch, start)) >= 0) {
    116         src.extractBetween(start, end, result[split++]);
    117         start = end + 1;
    118     }
    119 
    120     src.extractBetween(start, src.length(), result[split]);
    121 
    122     return result;
    123 }
    124 
    125 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
    126 {
    127     int32_t sLength = source.length();
    128     char *bytes = NULL;
    129 
    130     length = source.extract(0, sLength, NULL, codepage);
    131 
    132     if (length > 0) {
    133         bytes = NEW_ARRAY(char, length + 1);
    134         source.extract(0, sLength, bytes, codepage);
    135     }
    136 
    137     return bytes;
    138 }
    139 
    140 static void freeBytes(char *bytes)
    141 {
    142     DELETE_ARRAY(bytes);
    143 }
    144 
    145 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
    146 {
    147     int32_t splits = 0;
    148     int32_t testLength = testString.length();
    149     UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
    150     UErrorCode status = U_ZERO_ERROR;
    151     int32_t cpLength = eSplit[0].length();
    152     char codepage[64];
    153 
    154     u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
    155     codepage[cpLength] = '\0';
    156 
    157     LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
    158 
    159     int32_t byteLength = 0;
    160     char *bytes = extractBytes(testString, codepage, byteLength);
    161 
    162     if (bytes == NULL) {
    163 #if !UCONFIG_NO_LEGACY_CONVERSION
    164         dataerrln("Can't open a " + encoding + " converter for " + id);
    165 #endif
    166         return;
    167     }
    168 
    169     ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);
    170 
    171     int32_t matchCount = 0;
    172     const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
    173 
    174 
    175     UnicodeString name(ucsdet_getName(matches[0], &status));
    176     UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
    177     UChar *decoded = NULL;
    178     int32_t dLength = 0;
    179 
    180     if (matchCount == 0) {
    181         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
    182         goto bail;
    183     }
    184 
    185     if (name.compare(eSplit[0]) != 0) {
    186         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
    187 
    188 #ifdef DEBUG_DETECT
    189         for (int32_t m = 0; m < matchCount; m += 1) {
    190             const char *name = ucsdet_getName(matches[m], &status);
    191             const char *lang = ucsdet_getLanguage(matches[m], &status);
    192             int32_t confidence = ucsdet_getConfidence(matches[m], &status);
    193 
    194             printf("%s (%s) %d\n", name, lang, confidence);
    195         }
    196 #endif
    197         goto bail;
    198     }
    199 
    200     if (splits > 1 && lang.compare(eSplit[1]) != 0) {
    201         errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
    202         goto bail;
    203     }
    204 
    205     decoded = NEW_ARRAY(UChar, testLength);
    206     dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
    207 
    208     if (testString.compare(decoded, dLength) != 0) {
    209         errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
    210 
    211 #ifdef DEBUG_DETECT
    212         for(int32_t i = 0; i < testLength; i += 1) {
    213             if(testString[i] != decoded[i]) {
    214                 printf("Strings differ at byte %d\n", i);
    215                 break;
    216             }
    217         }
    218 #endif
    219 
    220     }
    221 
    222     DELETE_ARRAY(decoded);
    223 
    224 bail:
    225     freeBytes(bytes);
    226     delete[] eSplit;
    227 }
    228 
    229 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
    230     UErrorCode status = U_ZERO_ERROR;
    231     const char *testDataDirectory = IntlTest::getSourceTestData(status);
    232 
    233     if (U_FAILURE(status)) {
    234         errln("ERROR: getPath() failed - %s", u_errorName(status));
    235         return NULL;
    236     }
    237 
    238     strcpy(buffer, testDataDirectory);
    239     strcat(buffer, filename);
    240     return buffer;
    241 }
    242 
    243 void CharsetDetectionTest::ConstructionTest()
    244 {
    245     IcuTestErrorCode status(*this, "ConstructionTest");
    246     LocalUCharsetDetectorPointer csd(ucsdet_open(status));
    247     LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
    248     int32_t count = uenum_count(e.getAlias(), status);
    249 
    250 #ifdef DEBUG_DETECT
    251     printf("There are %d recognizers.\n", count);
    252 #endif
    253 
    254     for(int32_t i = 0; i < count; i += 1) {
    255         int32_t length;
    256         const char *name = uenum_next(e.getAlias(), &length, status);
    257 
    258         if(name == NULL || length <= 0) {
    259             errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
    260         }
    261 
    262 #ifdef DEBUG_DETECT
    263         printf("%s\n", name);
    264 #endif
    265     }
    266 }
    267 
    268 void CharsetDetectionTest::UTF8Test()
    269 {
    270     UErrorCode status = U_ZERO_ERROR;
    271     UnicodeString ss = "This is a string with some non-ascii characters that will "
    272                        "be converted to UTF-8, then shoved through the detection process.  "
    273                        "\\u0391\\u0392\\u0393\\u0394\\u0395"
    274                        "Sure would be nice if our source could contain Unicode directly!";
    275     UnicodeString s = ss.unescape();
    276     int32_t byteLength = 0, sLength = s.length();
    277     char *bytes = extractBytes(s, "UTF-8", byteLength);
    278     UCharsetDetector *csd = ucsdet_open(&status);
    279     const UCharsetMatch *match;
    280     UChar *detected = NEW_ARRAY(UChar, sLength);
    281 
    282     ucsdet_setText(csd, bytes, byteLength, &status);
    283     match = ucsdet_detect(csd, &status);
    284 
    285     if (match == NULL) {
    286         errln("Detection failure for UTF-8: got no matches.");
    287         goto bail;
    288     }
    289 
    290     ucsdet_getUChars(match, detected, sLength, &status);
    291 
    292     if (s.compare(detected, sLength) != 0) {
    293         errln("Round-trip test failed!");
    294     }
    295 
    296     ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
    297 
    298 bail:
    299     DELETE_ARRAY(detected);
    300     freeBytes(bytes);
    301     ucsdet_close(csd);
    302 }
    303 
    304 void CharsetDetectionTest::UTF16Test()
    305 {
    306     UErrorCode status = U_ZERO_ERROR;
    307     /* Notice the BOM on the start of this string */
    308     UChar chars[] = {
    309         0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
    310         0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
    311         0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
    312         0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
    313         0x064a, 0x062a, 0x0000};
    314     UnicodeString s(chars);
    315     int32_t beLength = 0, leLength = 0;
    316     char *beBytes = extractBytes(s, "UTF-16BE", beLength);
    317     char *leBytes = extractBytes(s, "UTF-16LE", leLength);
    318     UCharsetDetector *csd = ucsdet_open(&status);
    319     const UCharsetMatch *match;
    320     const char *name;
    321     int32_t conf;
    322 
    323     ucsdet_setText(csd, beBytes, beLength, &status);
    324     match = ucsdet_detect(csd, &status);
    325 
    326     if (match == NULL) {
    327         errln("Encoding detection failure for UTF-16BE: got no matches.");
    328         goto try_le;
    329     }
    330 
    331     name  = ucsdet_getName(match, &status);
    332     conf  = ucsdet_getConfidence(match, &status);
    333 
    334     if (strcmp(name, "UTF-16BE") != 0) {
    335         errln("Encoding detection failure for UTF-16BE: got %s", name);
    336         goto try_le; // no point in looking at confidence if we got the wrong character set.
    337     }
    338 
    339     if (conf != 100) {
    340         errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
    341     }
    342 
    343 try_le:
    344     ucsdet_setText(csd, leBytes, leLength, &status);
    345     match = ucsdet_detect(csd, &status);
    346 
    347     if (match == NULL) {
    348         errln("Encoding detection failure for UTF-16LE: got no matches.");
    349         goto bail;
    350     }
    351 
    352     name  = ucsdet_getName(match, &status);
    353     conf = ucsdet_getConfidence(match, &status);
    354 
    355 
    356     if (strcmp(name, "UTF-16LE") != 0) {
    357         errln("Enconding detection failure for UTF-16LE: got %s", name);
    358         goto bail; // no point in looking at confidence if we got the wrong character set.
    359     }
    360 
    361     if (conf != 100) {
    362         errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
    363     }
    364 
    365 bail:
    366     freeBytes(leBytes);
    367     freeBytes(beBytes);
    368     ucsdet_close(csd);
    369 }
    370 
    371 void CharsetDetectionTest::InputFilterTest()
    372 {
    373     UErrorCode status = U_ZERO_ERROR;
    374     UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
    375     UnicodeString s  = ss.unescape();
    376     int32_t byteLength = 0;
    377     char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
    378     UCharsetDetector *csd = ucsdet_open(&status);
    379     const UCharsetMatch *match;
    380     const char *lang, *name;
    381 
    382     ucsdet_enableInputFilter(csd, TRUE);
    383 
    384     if (!ucsdet_isInputFilterEnabled(csd)) {
    385         errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
    386     }
    387 
    388 
    389     ucsdet_setText(csd, bytes, byteLength, &status);
    390     match = ucsdet_detect(csd, &status);
    391 
    392     if (match == NULL) {
    393         errln("Turning on the input filter resulted in no matches.");
    394         goto turn_off;
    395     }
    396 
    397     name = ucsdet_getName(match, &status);
    398 
    399     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
    400         errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
    401     } else {
    402         lang = ucsdet_getLanguage(match, &status);
    403 
    404         if (lang == NULL || strcmp(lang, "fr") != 0) {
    405             errln("Input filter did not strip markup!");
    406         }
    407     }
    408 
    409 turn_off:
    410     ucsdet_enableInputFilter(csd, FALSE);
    411     ucsdet_setText(csd, bytes, byteLength, &status);
    412     match = ucsdet_detect(csd, &status);
    413 
    414     if (match == NULL) {
    415         errln("Turning off the input filter resulted in no matches.");
    416         goto bail;
    417     }
    418 
    419     name = ucsdet_getName(match, &status);
    420 
    421     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
    422         errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
    423     } else {
    424         lang = ucsdet_getLanguage(match, &status);
    425 
    426         if (lang == NULL || strcmp(lang, "en") != 0) {
    427             errln("Unfiltered input did not detect as English!");
    428         }
    429     }
    430 
    431 bail:
    432     freeBytes(bytes);
    433     ucsdet_close(csd);
    434 }
    435 
    436 void CharsetDetectionTest::C1BytesTest()
    437 {
    438 #if !UCONFIG_NO_LEGACY_CONVERSION
    439     UErrorCode status = U_ZERO_ERROR;
    440     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
    441     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
    442     UnicodeString sWindows  = ssWindows.unescape();
    443     int32_t lISO = 0, lWindows = 0;
    444     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
    445     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
    446     UCharsetDetector *csd = ucsdet_open(&status);
    447     const UCharsetMatch *match;
    448     const char *name;
    449 
    450     ucsdet_setText(csd, bWindows, lWindows, &status);
    451     match = ucsdet_detect(csd, &status);
    452 
    453     if (match == NULL) {
    454         errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
    455         goto bail;
    456     }
    457 
    458     name  = ucsdet_getName(match, &status);
    459 
    460     if (strcmp(name, "windows-1252") != 0) {
    461         errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
    462     }
    463 
    464     ucsdet_setText(csd, bISO, lISO, &status);
    465     match = ucsdet_detect(csd, &status);
    466 
    467     if (match == NULL) {
    468         errln("English text without C1 bytes got no matches.");
    469         goto bail;
    470     }
    471 
    472     name  = ucsdet_getName(match, &status);
    473 
    474     if (strcmp(name, "ISO-8859-1") != 0) {
    475         errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
    476     }
    477 
    478 bail:
    479     freeBytes(bWindows);
    480     freeBytes(bISO);
    481 
    482     ucsdet_close(csd);
    483 #endif
    484 }
    485 
    486 void CharsetDetectionTest::DetectionTest()
    487 {
    488 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
    489     UErrorCode status = U_ZERO_ERROR;
    490     char path[2048];
    491     const char *testFilePath = getPath(path, "csdetest.xml");
    492 
    493     if (testFilePath == NULL) {
    494         return; /* Couldn't get path: error message already output. */
    495     }
    496 
    497     UXMLParser  *parser = UXMLParser::createParser(status);
    498     if (U_FAILURE(status)) {
    499         dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
    500         return;
    501     }
    502 
    503     UXMLElement *root   = parser->parseFile(testFilePath, status);
    504     if (!assertSuccess( "parseFile",status)) return;
    505 
    506     UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
    507     UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
    508     UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");
    509 
    510     const UXMLElement *testCase;
    511     int32_t tc = 0;
    512 
    513     while((testCase = root->nextChildElement(tc)) != NULL) {
    514         if (testCase->getTagName().compare(test_case) == 0) {
    515             const UnicodeString *id = testCase->getAttribute(id_attr);
    516             const UnicodeString *encodings = testCase->getAttribute(enc_attr);
    517             const UnicodeString  text = testCase->getText(TRUE);
    518             int32_t encodingCount;
    519             UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
    520 
    521             for(int32_t e = 0; e < encodingCount; e += 1) {
    522                 checkEncoding(text, encodingList[e], *id);
    523             }
    524 
    525             delete[] encodingList;
    526         }
    527     }
    528 
    529     delete root;
    530     delete parser;
    531 #endif
    532 }
    533 
    534 void CharsetDetectionTest::IBM424Test()
    535 {
    536     UErrorCode status = U_ZERO_ERROR;
    537 
    538     static const UChar chars[] = {
    539             0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
    540             0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
    541             0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
    542             0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
    543             0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
    544             0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
    545             0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
    546             0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
    547             0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
    548             0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
    549             0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
    550             0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
    551             0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
    552             0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
    553             0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
    554             0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
    555             0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
    556     };
    557 
    558     static const UChar chars_reverse[] = {
    559             0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
    560             0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
    561             0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
    562             0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
    563             0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
    564             0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
    565             0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
    566             0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
    567             0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
    568             0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
    569             0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
    570             0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
    571             0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
    572             0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
    573             0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
    574             0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
    575             0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
    576             0x0000
    577     };
    578 
    579     int32_t bLength = 0, brLength = 0;
    580 
    581     UnicodeString s1(chars);
    582     UnicodeString s2(chars_reverse);
    583 
    584     char *bytes = extractBytes(s1, "IBM424", bLength);
    585     char *bytes_r = extractBytes(s2, "IBM424", brLength);
    586 
    587     UCharsetDetector *csd = ucsdet_open(&status);
    588     if (U_FAILURE(status)) {
    589         errln("Error opening charset detector. - %s", u_errorName(status));
    590     }
    591     const UCharsetMatch *match;
    592     const char *name;
    593 
    594     ucsdet_setText(csd, bytes, bLength, &status);
    595     match = ucsdet_detect(csd, &status);
    596 
    597     if (match == NULL) {
    598         errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
    599         goto bail;
    600     }
    601 
    602     name  = ucsdet_getName(match, &status);
    603     if (strcmp(name, "IBM424_rtl") != 0) {
    604         errln("Encoding detection failure for IBM424_rtl: got %s", name);
    605     }
    606 
    607     ucsdet_setText(csd, bytes_r, brLength, &status);
    608     match = ucsdet_detect(csd, &status);
    609 
    610     if (match == NULL) {
    611         errln("Encoding detection failure for IBM424_ltr: got no matches.");
    612         goto bail;
    613     }
    614 
    615     name  = ucsdet_getName(match, &status);
    616     if (strcmp(name, "IBM424_ltr") != 0) {
    617         errln("Encoding detection failure for IBM424_ltr: got %s", name);
    618     }
    619 
    620 bail:
    621     freeBytes(bytes);
    622     freeBytes(bytes_r);
    623     ucsdet_close(csd);
    624 }
    625 
    626 void CharsetDetectionTest::IBM420Test()
    627 {
    628     UErrorCode status = U_ZERO_ERROR;
    629 
    630     static const UChar chars[] = {
    631         0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
    632         0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
    633         0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
    634         0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
    635         0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
    636         0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
    637         0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
    638         0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
    639         0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
    640         0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
    641         0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
    642         0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
    643         0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
    644         0x0000
    645     };
    646     static const UChar chars_reverse[] = {
    647         0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
    648         0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
    649         0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
    650         0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
    651         0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
    652         0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
    653         0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
    654         0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
    655         0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
    656         0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
    657         0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
    658         0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
    659         0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
    660         0x0000,
    661     };
    662 
    663     int32_t bLength = 0, brLength = 0;
    664 
    665     UnicodeString s1(chars);
    666     UnicodeString s2(chars_reverse);
    667 
    668     char *bytes = extractBytes(s1, "IBM420", bLength);
    669     char *bytes_r = extractBytes(s2, "IBM420", brLength);
    670 
    671     UCharsetDetector *csd = ucsdet_open(&status);
    672     if (U_FAILURE(status)) {
    673         errln("Error opening charset detector. - %s", u_errorName(status));
    674     }
    675     const UCharsetMatch *match;
    676     const char *name;
    677 
    678     ucsdet_setText(csd, bytes, bLength, &status);
    679     match = ucsdet_detect(csd, &status);
    680 
    681     if (match == NULL) {
    682         errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
    683         goto bail;
    684     }
    685 
    686     name  = ucsdet_getName(match, &status);
    687     if (strcmp(name, "IBM420_rtl") != 0) {
    688         errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
    689     }
    690 
    691     ucsdet_setText(csd, bytes_r, brLength, &status);
    692     match = ucsdet_detect(csd, &status);
    693 
    694     if (match == NULL) {
    695         errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
    696         goto bail;
    697     }
    698 
    699     name  = ucsdet_getName(match, &status);
    700     if (strcmp(name, "IBM420_ltr") != 0) {
    701         errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
    702     }
    703 
    704 bail:
    705     freeBytes(bytes);
    706     freeBytes(bytes_r);
    707     ucsdet_close(csd);
    708 }
    709 
    710 
    711 void CharsetDetectionTest::Ticket6394Test() {
    712 #if !UCONFIG_NO_CONVERSION
    713     const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
    714                              "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
    715                              "encodings more than once.  The hop through UnicodeString is for platforms "
    716                              "where this char * string is be EBCDIC and needs conversion to Latin1.";
    717     char latin1Text[sizeof(charText)];
    718     UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
    719 
    720     UErrorCode status = U_ZERO_ERROR;
    721     UCharsetDetector *csd = ucsdet_open(&status);
    722     ucsdet_setText(csd, latin1Text, -1, &status);
    723     if (U_FAILURE(status)) {
    724         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
    725         return;
    726     }
    727 
    728     int32_t matchCount = 0;
    729     const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
    730     if (U_FAILURE(status)) {
    731         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
    732         return;
    733     }
    734 
    735     UnicodeSet  setOfCharsetNames;    // UnicodSets can hold strings.
    736     int32_t i;
    737     for (i=0; i<matchCount; i++) {
    738         UnicodeString charSetName(ucsdet_getName(matches[i], &status));
    739         if (U_FAILURE(status)) {
    740             errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
    741             status = U_ZERO_ERROR;
    742         }
    743         if (setOfCharsetNames.contains(charSetName)) {
    744             errln("Fail at file %s, line %d ", __FILE__, __LINE__);
    745             errln(UnicodeString("   Duplicate charset name = ") + charSetName);
    746         }
    747         setOfCharsetNames.add(charSetName);
    748     }
    749     ucsdet_close(csd);
    750 #endif
    751 }
    752 
    753