Home | History | Annotate | Download | only in intltest
      1 /*
      2  **********************************************************************
      3  *   Copyright (C) 2005-2009, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  **********************************************************************
      6  */
      7 
      8 
      9 #include "unicode/utypes.h"
     10 #include "unicode/ucsdet.h"
     11 #include "unicode/ucnv.h"
     12 #include "unicode/unistr.h"
     13 #include "unicode/putil.h"
     14 #include "unicode/uniset.h"
     15 
     16 #include "intltest.h"
     17 #include "csdetest.h"
     18 
     19 #include "xmlparser.h"
     20 
     21 #include <stdlib.h>
     22 #include <string.h>
     23 
     24 #ifdef DEBUG_DETECT
     25 #include <stdio.h>
     26 #endif
     27 
     28 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
     29 
     30 #define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
     31 #define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))
     32 
     33 #define CH_SPACE 0x0020
     34 #define CH_SLASH 0x002F
     35 
     36 //---------------------------------------------------------------------------
     37 //
     38 //  Test class boilerplate
     39 //
     40 //---------------------------------------------------------------------------
     41 CharsetDetectionTest::CharsetDetectionTest()
     42 {
     43 }
     44 
     45 
     46 CharsetDetectionTest::~CharsetDetectionTest()
     47 {
     48 }
     49 
     50 
     51 
     52 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
     53 {
     54     if (exec) logln("TestSuite CharsetDetectionTest: ");
     55     switch (index) {
     56        case 0: name = "ConstructionTest";
     57             if (exec) ConstructionTest();
     58             break;
     59 
     60        case 1: name = "UTF8Test";
     61             if (exec) UTF8Test();
     62             break;
     63 
     64        case 2: name = "UTF16Test";
     65             if (exec) UTF16Test();
     66             break;
     67 
     68        case 3: name = "C1BytesTest";
     69             if (exec) C1BytesTest();
     70             break;
     71 
     72        case 4: name = "InputFilterTest";
     73             if (exec) InputFilterTest();
     74             break;
     75 
     76        case 5: name = "DetectionTest";
     77             if (exec) DetectionTest();
     78             break;
     79 
     80        case 6: name = "IBM424Test";
     81             if (exec) IBM424Test();
     82             break;
     83 
     84        case 7: name = "IBM420Test";
     85             if (exec) IBM420Test();
     86             break;
     87 
     88        case 8: name = "Ticket6394Test";
     89             if (exec) Ticket6394Test();
     90             break;
     91 
     92         default: name = "";
     93             break; //needed to end loop
     94     }
     95 }
     96 
     97 static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
     98 {
     99     int32_t offset = -1;
    100 
    101     splits = 1;
    102     while((offset = src.indexOf(ch, offset + 1)) >= 0) {
    103         splits += 1;
    104     }
    105 
    106     UnicodeString *result = new UnicodeString[splits];
    107 
    108     int32_t start = 0;
    109     int32_t split = 0;
    110     int32_t end;
    111 
    112     while((end = src.indexOf(ch, start)) >= 0) {
    113         src.extractBetween(start, end, result[split++]);
    114         start = end + 1;
    115     }
    116 
    117     src.extractBetween(start, src.length(), result[split]);
    118 
    119     return result;
    120 }
    121 
    122 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
    123 {
    124     int32_t sLength = source.length();
    125     char *bytes = NULL;
    126 
    127     length = source.extract(0, sLength, NULL, codepage);
    128 
    129     if (length > 0) {
    130         bytes = NEW_ARRAY(char, length + 1);
    131         source.extract(0, sLength, bytes, codepage);
    132     }
    133 
    134     return bytes;
    135 }
    136 
    137 static void freeBytes(char *bytes)
    138 {
    139     DELETE_ARRAY(bytes);
    140 }
    141 
    142 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
    143 {
    144     int32_t splits = 0;
    145     int32_t testLength = testString.length();
    146     UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
    147     UErrorCode status = U_ZERO_ERROR;
    148     int32_t cpLength = eSplit[0].length();
    149     char codepage[64];
    150 
    151     u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
    152     codepage[cpLength] = '\0';
    153 
    154     UCharsetDetector *csd = ucsdet_open(&status);
    155 
    156     int32_t byteLength = 0;
    157     char *bytes = extractBytes(testString, codepage, byteLength);
    158 
    159     if (bytes == NULL) {
    160 #if !UCONFIG_NO_LEGACY_CONVERSION
    161         errln("Can't open a " + encoding + " converter for " + id);
    162 #endif
    163         return;
    164     }
    165 
    166     ucsdet_setText(csd, bytes, byteLength, &status);
    167 
    168     int32_t matchCount = 0;
    169     const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
    170 
    171 
    172     UnicodeString name(ucsdet_getName(matches[0], &status));
    173     UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
    174     UChar *decoded = NULL;
    175     int32_t dLength = 0;
    176 
    177     if (matchCount == 0) {
    178         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
    179         goto bail;
    180     }
    181 
    182     if (name.compare(eSplit[0]) != 0) {
    183         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
    184 
    185 #ifdef DEBUG_DETECT
    186         for (int32_t m = 0; m < matchCount; m += 1) {
    187             const char *name = ucsdet_getName(matches[m], &status);
    188             const char *lang = ucsdet_getLanguage(matches[m], &status);
    189             int32_t confidence = ucsdet_getConfidence(matches[m], &status);
    190 
    191             printf("%s (%s) %d\n", name, lang, confidence);
    192         }
    193 #endif
    194         goto bail;
    195     }
    196 
    197     if (splits > 1 && lang.compare(eSplit[1]) != 0) {
    198         errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
    199         goto bail;
    200     }
    201 
    202     decoded = NEW_ARRAY(UChar, testLength);
    203     dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
    204 
    205     if (testString.compare(decoded, dLength) != 0) {
    206         errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
    207 
    208 #ifdef DEBUG_DETECT
    209         for(int32_t i = 0; i < testLength; i += 1) {
    210             if(testString[i] != decoded[i]) {
    211                 printf("Strings differ at byte %d\n", i);
    212                 break;
    213             }
    214         }
    215 #endif
    216 
    217     }
    218 
    219     DELETE_ARRAY(decoded);
    220 
    221 bail:
    222     freeBytes(bytes);
    223     ucsdet_close(csd);
    224     delete[] eSplit;
    225 }
    226 
    227 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
    228     UErrorCode status = U_ZERO_ERROR;
    229     const char *testDataDirectory = IntlTest::getSourceTestData(status);
    230 
    231     if (U_FAILURE(status)) {
    232         errln("ERROR: getPath() failed - %s", u_errorName(status));
    233         return NULL;
    234     }
    235 
    236     strcpy(buffer, testDataDirectory);
    237     strcat(buffer, filename);
    238     return buffer;
    239 }
    240 
    241 void CharsetDetectionTest::ConstructionTest()
    242 {
    243     UErrorCode status = U_ZERO_ERROR;
    244     UCharsetDetector *csd = ucsdet_open(&status);
    245     UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
    246     int32_t count = uenum_count(e, &status);
    247 
    248 #ifdef DEBUG_DETECT
    249     printf("There are %d recognizers.\n", count);
    250 #endif
    251 
    252     for(int32_t i = 0; i < count; i += 1) {
    253         int32_t length;
    254         const char *name = uenum_next(e, &length, &status);
    255 
    256         if(name == NULL || length <= 0) {
    257             errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
    258         }
    259 
    260 #ifdef DEBUG_DETECT
    261         printf("%s\n", name);
    262 #endif
    263     }
    264 
    265     uenum_close(e);
    266     ucsdet_close(csd);
    267 }
    268 
    269 void CharsetDetectionTest::UTF8Test()
    270 {
    271     UErrorCode status = U_ZERO_ERROR;
    272     UnicodeString ss = "This is a string with some non-ascii characters that will "
    273                        "be converted to UTF-8, then shoved through the detection process.  "
    274                        "\\u0391\\u0392\\u0393\\u0394\\u0395"
    275                        "Sure would be nice if our source could contain Unicode directly!";
    276     UnicodeString s = ss.unescape();
    277     int32_t byteLength = 0, sLength = s.length();
    278     char *bytes = extractBytes(s, "UTF-8", byteLength);
    279     UCharsetDetector *csd = ucsdet_open(&status);
    280     const UCharsetMatch *match;
    281     UChar *detected = NEW_ARRAY(UChar, sLength);
    282 
    283     ucsdet_setText(csd, bytes, byteLength, &status);
    284     match = ucsdet_detect(csd, &status);
    285 
    286     if (match == NULL) {
    287         errln("Detection failure for UTF-8: got no matches.");
    288         goto bail;
    289     }
    290 
    291     ucsdet_getUChars(match, detected, sLength, &status);
    292 
    293     if (s.compare(detected, sLength) != 0) {
    294         errln("Round-trip test failed!");
    295     }
    296 
    297     ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
    298 
    299 bail:
    300     DELETE_ARRAY(detected);
    301     freeBytes(bytes);
    302     ucsdet_close(csd);
    303 }
    304 
    305 void CharsetDetectionTest::UTF16Test()
    306 {
    307     UErrorCode status = U_ZERO_ERROR;
    308     /* Notice the BOM on the start of this string */
    309     UChar chars[] = {
    310         0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
    311         0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
    312         0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
    313         0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
    314         0x064a, 0x062a, 0x0000};
    315     UnicodeString s(chars);
    316     int32_t beLength = 0, leLength = 0;
    317     char *beBytes = extractBytes(s, "UTF-16BE", beLength);
    318     char *leBytes = extractBytes(s, "UTF-16LE", leLength);
    319     UCharsetDetector *csd = ucsdet_open(&status);
    320     const UCharsetMatch *match;
    321     const char *name;
    322     int32_t conf;
    323 
    324     ucsdet_setText(csd, beBytes, beLength, &status);
    325     match = ucsdet_detect(csd, &status);
    326 
    327     if (match == NULL) {
    328         errln("Encoding detection failure for UTF-16BE: got no matches.");
    329         goto try_le;
    330     }
    331 
    332     name  = ucsdet_getName(match, &status);
    333     conf  = ucsdet_getConfidence(match, &status);
    334 
    335     if (strcmp(name, "UTF-16BE") != 0) {
    336         errln("Encoding detection failure for UTF-16BE: got %s", name);
    337         goto try_le; // no point in looking at confidence if we got the wrong character set.
    338     }
    339 
    340     if (conf != 100) {
    341         errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
    342     }
    343 
    344 try_le:
    345     ucsdet_setText(csd, leBytes, leLength, &status);
    346     match = ucsdet_detect(csd, &status);
    347 
    348     if (match == NULL) {
    349         errln("Encoding detection failure for UTF-16LE: got no matches.");
    350         goto bail;
    351     }
    352 
    353     name  = ucsdet_getName(match, &status);
    354     conf = ucsdet_getConfidence(match, &status);
    355 
    356 
    357     if (strcmp(name, "UTF-16LE") != 0) {
    358         errln("Enconding detection failure for UTF-16LE: got %s", name);
    359         goto bail; // no point in looking at confidence if we got the wrong character set.
    360     }
    361 
    362     if (conf != 100) {
    363         errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
    364     }
    365 
    366 bail:
    367     freeBytes(leBytes);
    368     freeBytes(beBytes);
    369     ucsdet_close(csd);
    370 }
    371 
    372 void CharsetDetectionTest::InputFilterTest()
    373 {
    374     UErrorCode status = U_ZERO_ERROR;
    375     UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
    376     UnicodeString s  = ss.unescape();
    377     int32_t byteLength = 0;
    378     char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
    379     UCharsetDetector *csd = ucsdet_open(&status);
    380     const UCharsetMatch *match;
    381     const char *lang, *name;
    382 
    383     ucsdet_enableInputFilter(csd, TRUE);
    384 
    385     if (!ucsdet_isInputFilterEnabled(csd)) {
    386         errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
    387     }
    388 
    389 
    390     ucsdet_setText(csd, bytes, byteLength, &status);
    391     match = ucsdet_detect(csd, &status);
    392 
    393     if (match == NULL) {
    394         errln("Turning on the input filter resulted in no matches.");
    395         goto turn_off;
    396     }
    397 
    398     name = ucsdet_getName(match, &status);
    399 
    400     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
    401         errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
    402     } else {
    403         lang = ucsdet_getLanguage(match, &status);
    404 
    405         if (lang == NULL || strcmp(lang, "fr") != 0) {
    406             errln("Input filter did not strip markup!");
    407         }
    408     }
    409 
    410 turn_off:
    411     ucsdet_enableInputFilter(csd, FALSE);
    412     ucsdet_setText(csd, bytes, byteLength, &status);
    413     match = ucsdet_detect(csd, &status);
    414 
    415     if (match == NULL) {
    416         errln("Turning off the input filter resulted in no matches.");
    417         goto bail;
    418     }
    419 
    420     name = ucsdet_getName(match, &status);
    421 
    422     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
    423         errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
    424     } else {
    425         lang = ucsdet_getLanguage(match, &status);
    426 
    427         if (lang == NULL || strcmp(lang, "en") != 0) {
    428             errln("Unfiltered input did not detect as English!");
    429         }
    430     }
    431 
    432 bail:
    433     freeBytes(bytes);
    434     ucsdet_close(csd);
    435 }
    436 
    437 void CharsetDetectionTest::C1BytesTest()
    438 {
    439 #if !UCONFIG_NO_LEGACY_CONVERSION
    440     UErrorCode status = U_ZERO_ERROR;
    441     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
    442     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
    443     UnicodeString sWindows  = ssWindows.unescape();
    444     int32_t lISO = 0, lWindows = 0;
    445     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
    446     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
    447     UCharsetDetector *csd = ucsdet_open(&status);
    448     const UCharsetMatch *match;
    449     const char *name;
    450 
    451     ucsdet_setText(csd, bWindows, lWindows, &status);
    452     match = ucsdet_detect(csd, &status);
    453 
    454     if (match == NULL) {
    455         errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
    456         goto bail;
    457     }
    458 
    459     name  = ucsdet_getName(match, &status);
    460 
    461     if (strcmp(name, "windows-1252") != 0) {
    462         errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
    463     }
    464 
    465     ucsdet_setText(csd, bISO, lISO, &status);
    466     match = ucsdet_detect(csd, &status);
    467 
    468     if (match == NULL) {
    469         errln("English text without C1 bytes got no matches.");
    470         goto bail;
    471     }
    472 
    473     name  = ucsdet_getName(match, &status);
    474 
    475     if (strcmp(name, "ISO-8859-1") != 0) {
    476         errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
    477     }
    478 
    479 bail:
    480     freeBytes(bWindows);
    481     freeBytes(bISO);
    482 
    483     ucsdet_close(csd);
    484 #endif
    485 }
    486 
    487 void CharsetDetectionTest::DetectionTest()
    488 {
    489 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
    490     UErrorCode status = U_ZERO_ERROR;
    491     char path[2048];
    492     const char *testFilePath = getPath(path, "csdetest.xml");
    493 
    494     if (testFilePath == NULL) {
    495         return; /* Couldn't get path: error message already output. */
    496     }
    497 
    498     UXMLParser  *parser = UXMLParser::createParser(status);
    499     if (U_FAILURE(status)) {
    500         dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
    501         return;
    502     }
    503 
    504     UXMLElement *root   = parser->parseFile(testFilePath, status);
    505     if (!assertSuccess( "parseFile",status)) return;
    506 
    507     UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
    508     UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
    509     UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");
    510 
    511     const UXMLElement *testCase;
    512     int32_t tc = 0;
    513 
    514     while((testCase = root->nextChildElement(tc)) != NULL) {
    515         if (testCase->getTagName().compare(test_case) == 0) {
    516             const UnicodeString *id = testCase->getAttribute(id_attr);
    517             const UnicodeString *encodings = testCase->getAttribute(enc_attr);
    518             const UnicodeString  text = testCase->getText(TRUE);
    519             int32_t encodingCount;
    520             UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
    521 
    522             for(int32_t e = 0; e < encodingCount; e += 1) {
    523                 checkEncoding(text, encodingList[e], *id);
    524             }
    525 
    526             delete[] encodingList;
    527         }
    528     }
    529 
    530     delete root;
    531     delete parser;
    532 #endif
    533 }
    534 
    535 void CharsetDetectionTest::IBM424Test()
    536 {
    537     UErrorCode status = U_ZERO_ERROR;
    538 
    539     static const UChar chars[] = {
    540             0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
    541             0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
    542             0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
    543             0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
    544             0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
    545             0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
    546             0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
    547             0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
    548             0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
    549             0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
    550             0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
    551             0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
    552             0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
    553             0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
    554             0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
    555             0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
    556             0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
    557     };
    558 
    559     static const UChar chars_reverse[] = {
    560             0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
    561             0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
    562             0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
    563             0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
    564             0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
    565             0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
    566             0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
    567             0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
    568             0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
    569             0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
    570             0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
    571             0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
    572             0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
    573             0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
    574             0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
    575             0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
    576             0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
    577             0x0000
    578     };
    579 
    580     int32_t bLength = 0, brLength = 0;
    581 
    582     UnicodeString s1(chars);
    583     UnicodeString s2(chars_reverse);
    584 
    585     char *bytes = extractBytes(s1, "IBM424", bLength);
    586     char *bytes_r = extractBytes(s2, "IBM424", brLength);
    587 
    588     UCharsetDetector *csd = ucsdet_open(&status);
    589     if (U_FAILURE(status)) {
    590         errln("Error opening charset detector. - %s", u_errorName(status));
    591     }
    592     const UCharsetMatch *match;
    593     const char *name;
    594 
    595     ucsdet_setText(csd, bytes, bLength, &status);
    596     match = ucsdet_detect(csd, &status);
    597 
    598     if (match == NULL) {
    599         errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
    600         goto bail;
    601     }
    602 
    603     name  = ucsdet_getName(match, &status);
    604     if (strcmp(name, "IBM424_rtl") != 0) {
    605         errln("Encoding detection failure for IBM424_rtl: got %s", name);
    606     }
    607 
    608     ucsdet_setText(csd, bytes_r, brLength, &status);
    609     match = ucsdet_detect(csd, &status);
    610 
    611     if (match == NULL) {
    612         errln("Encoding detection failure for IBM424_ltr: got no matches.");
    613         goto bail;
    614     }
    615 
    616     name  = ucsdet_getName(match, &status);
    617     if (strcmp(name, "IBM424_ltr") != 0) {
    618         errln("Encoding detection failure for IBM424_ltr: got %s", name);
    619     }
    620 
    621 bail:
    622     freeBytes(bytes);
    623     freeBytes(bytes_r);
    624     ucsdet_close(csd);
    625 }
    626 
    627 void CharsetDetectionTest::IBM420Test()
    628 {
    629     UErrorCode status = U_ZERO_ERROR;
    630 
    631     static const UChar chars[] = {
    632         0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
    633         0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
    634         0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
    635         0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
    636         0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
    637         0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
    638         0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
    639         0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
    640         0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
    641         0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
    642         0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
    643         0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
    644         0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
    645         0x0000
    646     };
    647     static const UChar chars_reverse[] = {
    648         0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
    649         0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
    650         0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
    651         0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
    652         0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
    653         0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
    654         0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
    655         0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
    656         0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
    657         0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
    658         0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
    659         0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
    660         0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
    661         0x0000,
    662     };
    663 
    664     int32_t bLength = 0, brLength = 0;
    665 
    666     UnicodeString s1(chars);
    667     UnicodeString s2(chars_reverse);
    668 
    669     char *bytes = extractBytes(s1, "IBM420", bLength);
    670     char *bytes_r = extractBytes(s2, "IBM420", brLength);
    671 
    672     UCharsetDetector *csd = ucsdet_open(&status);
    673     if (U_FAILURE(status)) {
    674         errln("Error opening charset detector. - %s", u_errorName(status));
    675     }
    676     const UCharsetMatch *match;
    677     const char *name;
    678 
    679     ucsdet_setText(csd, bytes, bLength, &status);
    680     match = ucsdet_detect(csd, &status);
    681 
    682     if (match == NULL) {
    683         errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
    684         goto bail;
    685     }
    686 
    687     name  = ucsdet_getName(match, &status);
    688     if (strcmp(name, "IBM420_rtl") != 0) {
    689         errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
    690     }
    691 
    692     ucsdet_setText(csd, bytes_r, brLength, &status);
    693     match = ucsdet_detect(csd, &status);
    694 
    695     if (match == NULL) {
    696         errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
    697         goto bail;
    698     }
    699 
    700     name  = ucsdet_getName(match, &status);
    701     if (strcmp(name, "IBM420_ltr") != 0) {
    702         errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
    703     }
    704 
    705 bail:
    706     freeBytes(bytes);
    707     freeBytes(bytes_r);
    708     ucsdet_close(csd);
    709 }
    710 
    711 
    712 void CharsetDetectionTest::Ticket6394Test() {
    713 #if !UCONFIG_NO_CONVERSION
    714     const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
    715                              "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
    716                              "encodings more than once.  The hop through UnicodeString is for platforms "
    717                              "where this char * string is be EBCDIC and needs conversion to Latin1.";
    718     char latin1Text[sizeof(charText)];
    719     UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
    720 
    721     UErrorCode status = U_ZERO_ERROR;
    722     UCharsetDetector *csd = ucsdet_open(&status);
    723     ucsdet_setText(csd, latin1Text, -1, &status);
    724     if (U_FAILURE(status)) {
    725         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
    726         return;
    727     }
    728 
    729     int32_t matchCount = 0;
    730     const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
    731     if (U_FAILURE(status)) {
    732         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
    733         return;
    734     }
    735 
    736     UnicodeSet  setOfCharsetNames;    // UnicodSets can hold strings.
    737     int32_t i;
    738     for (i=0; i<matchCount; i++) {
    739         UnicodeString charSetName(ucsdet_getName(matches[i], &status));
    740         if (U_FAILURE(status)) {
    741             errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
    742             status = U_ZERO_ERROR;
    743         }
    744         if (setOfCharsetNames.contains(charSetName)) {
    745             errln("Fail at file %s, line %d ", __FILE__, __LINE__);
    746             errln(UnicodeString("   Duplicate charset name = ") + charSetName);
    747         }
    748         setOfCharsetNames.add(charSetName);
    749     }
    750     ucsdet_close(csd);
    751 #endif
    752 }
    753 
    754