Home | History | Annotate | Download | only in intltest
      1 /*
      2  **********************************************************************
      3  *   Copyright (C) 2005-2012, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  **********************************************************************
      6  */
      7 
      8 
      9 #include "unicode/utypes.h"
     10 #include "unicode/ucsdet.h"
     11 #include "unicode/ucnv.h"
     12 #include "unicode/unistr.h"
     13 #include "unicode/putil.h"
     14 #include "unicode/uniset.h"
     15 
     16 #include "intltest.h"
     17 #include "csdetest.h"
     18 
     19 #include "xmlparser.h"
     20 
     21 #include <stdlib.h>
     22 #include <string.h>
     23 
     24 #ifdef DEBUG_DETECT
     25 #include <stdio.h>
     26 #endif
     27 
     28 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
     29 
     30 #define NEW_ARRAY(type,count) (type *) /*uprv_*/malloc((count) * sizeof(type))
     31 #define DELETE_ARRAY(array) /*uprv_*/free((void *) (array))
     32 
     33 #define CH_SPACE 0x0020
     34 #define CH_SLASH 0x002F
     35 
     36 #define TEST_ASSERT(x) {if (!(x)) { \
     37     errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
     38 
     39 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
     40     errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));\
     41     return;}}
     42 
     43 
     44 //---------------------------------------------------------------------------
     45 //
     46 //  Test class boilerplate
     47 //
     48 //---------------------------------------------------------------------------
     49 CharsetDetectionTest::CharsetDetectionTest()
     50 {
     51 }
     52 
     53 
     54 CharsetDetectionTest::~CharsetDetectionTest()
     55 {
     56 }
     57 
     58 
     59 
     60 void CharsetDetectionTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
     61 {
     62     if (exec) logln("TestSuite CharsetDetectionTest: ");
     63     switch (index) {
     64        case 0: name = "ConstructionTest";
     65             if (exec) ConstructionTest();
     66             break;
     67 
     68        case 1: name = "UTF8Test";
     69             if (exec) UTF8Test();
     70             break;
     71 
     72        case 2: name = "UTF16Test";
     73             if (exec) UTF16Test();
     74             break;
     75 
     76        case 3: name = "C1BytesTest";
     77             if (exec) C1BytesTest();
     78             break;
     79 
     80        case 4: name = "InputFilterTest";
     81             if (exec) InputFilterTest();
     82             break;
     83 
     84        case 5: name = "DetectionTest";
     85             if (exec) DetectionTest();
     86             break;
     87 #if !UCONFIG_NO_LEGACY_CONVERSION
     88        case 6: name = "IBM424Test";
     89             if (exec) IBM424Test();
     90             break;
     91 
     92        case 7: name = "IBM420Test";
     93             if (exec) IBM420Test();
     94             break;
     95 #else
     96        case 6:
     97        case 7: name = "skip"; break;
     98 #endif
     99        case 8: name = "Ticket6394Test";
    100             if (exec) Ticket6394Test();
    101             break;
    102 
    103        case 9: name = "Ticket6954Test";
    104             if (exec) Ticket6954Test();
    105             break;
    106 
    107         default: name = "";
    108             break; //needed to end loop
    109     }
    110 }
    111 
    112 static UnicodeString *split(const UnicodeString &src, UChar ch, int32_t &splits)
    113 {
    114     int32_t offset = -1;
    115 
    116     splits = 1;
    117     while((offset = src.indexOf(ch, offset + 1)) >= 0) {
    118         splits += 1;
    119     }
    120 
    121     UnicodeString *result = new UnicodeString[splits];
    122 
    123     int32_t start = 0;
    124     int32_t split = 0;
    125     int32_t end;
    126 
    127     while((end = src.indexOf(ch, start)) >= 0) {
    128         src.extractBetween(start, end, result[split++]);
    129         start = end + 1;
    130     }
    131 
    132     src.extractBetween(start, src.length(), result[split]);
    133 
    134     return result;
    135 }
    136 
    137 static char *extractBytes(const UnicodeString &source, const char *codepage, int32_t &length)
    138 {
    139     int32_t sLength = source.length();
    140     char *bytes = NULL;
    141 
    142     length = source.extract(0, sLength, NULL, codepage);
    143 
    144     if (length > 0) {
    145         bytes = NEW_ARRAY(char, length + 1);
    146         source.extract(0, sLength, bytes, codepage);
    147     }
    148 
    149     return bytes;
    150 }
    151 
    152 static void freeBytes(char *bytes)
    153 {
    154     DELETE_ARRAY(bytes);
    155 }
    156 
    157 void CharsetDetectionTest::checkEncoding(const UnicodeString &testString, const UnicodeString &encoding, const UnicodeString &id)
    158 {
    159     int32_t splits = 0;
    160     int32_t testLength = testString.length();
    161     UnicodeString *eSplit = split(encoding, CH_SLASH, splits);
    162     UErrorCode status = U_ZERO_ERROR;
    163     int32_t cpLength = eSplit[0].length();
    164     char codepage[64];
    165 
    166     u_UCharsToChars(eSplit[0].getBuffer(), codepage, cpLength);
    167     codepage[cpLength] = '\0';
    168 
    169     LocalUCharsetDetectorPointer csd(ucsdet_open(&status));
    170 
    171     int32_t byteLength = 0;
    172     char *bytes = extractBytes(testString, codepage, byteLength);
    173 
    174     if (bytes == NULL) {
    175 #if !UCONFIG_NO_LEGACY_CONVERSION
    176         dataerrln("Can't open a " + encoding + " converter for " + id);
    177 #endif
    178         return;
    179     }
    180 
    181     ucsdet_setText(csd.getAlias(), bytes, byteLength, &status);
    182 
    183     int32_t matchCount = 0;
    184     const UCharsetMatch **matches = ucsdet_detectAll(csd.getAlias(), &matchCount, &status);
    185 
    186 
    187     UnicodeString name(ucsdet_getName(matches[0], &status));
    188     UnicodeString lang(ucsdet_getLanguage(matches[0], &status));
    189     UChar *decoded = NULL;
    190     int32_t dLength = 0;
    191 
    192     if (matchCount == 0) {
    193         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got no matches");
    194         goto bail;
    195     }
    196 
    197     if (name.compare(eSplit[0]) != 0) {
    198         errln("Encoding detection failure for " + id + ": expected " + eSplit[0] + ", got " + name);
    199 
    200 #ifdef DEBUG_DETECT
    201         for (int32_t m = 0; m < matchCount; m += 1) {
    202             const char *name = ucsdet_getName(matches[m], &status);
    203             const char *lang = ucsdet_getLanguage(matches[m], &status);
    204             int32_t confidence = ucsdet_getConfidence(matches[m], &status);
    205 
    206             printf("%s (%s) %d\n", name, lang, confidence);
    207         }
    208 #endif
    209         goto bail;
    210     }
    211 
    212     if (splits > 1 && lang.compare(eSplit[1]) != 0) {
    213         errln("Language detection failure for " + id + ", " + eSplit[0] + ": expected " + eSplit[1] + ", got " + lang);
    214         goto bail;
    215     }
    216 
    217     decoded = NEW_ARRAY(UChar, testLength);
    218     dLength = ucsdet_getUChars(matches[0], decoded, testLength, &status);
    219 
    220     if (testString.compare(decoded, dLength) != 0) {
    221         errln("Round-trip error for " + id + ", " + eSplit[0] + ": getUChars() didn't yeild the original string.");
    222 
    223 #ifdef DEBUG_DETECT
    224         for(int32_t i = 0; i < testLength; i += 1) {
    225             if(testString[i] != decoded[i]) {
    226                 printf("Strings differ at byte %d\n", i);
    227                 break;
    228             }
    229         }
    230 #endif
    231 
    232     }
    233 
    234     DELETE_ARRAY(decoded);
    235 
    236 bail:
    237     freeBytes(bytes);
    238     delete[] eSplit;
    239 }
    240 
    241 const char *CharsetDetectionTest::getPath(char buffer[2048], const char *filename) {
    242     UErrorCode status = U_ZERO_ERROR;
    243     const char *testDataDirectory = IntlTest::getSourceTestData(status);
    244 
    245     if (U_FAILURE(status)) {
    246         errln("ERROR: getPath() failed - %s", u_errorName(status));
    247         return NULL;
    248     }
    249 
    250     strcpy(buffer, testDataDirectory);
    251     strcat(buffer, filename);
    252     return buffer;
    253 }
    254 
    255 void CharsetDetectionTest::ConstructionTest()
    256 {
    257     IcuTestErrorCode status(*this, "ConstructionTest");
    258     LocalUCharsetDetectorPointer csd(ucsdet_open(status));
    259     LocalUEnumerationPointer e(ucsdet_getAllDetectableCharsets(csd.getAlias(), status));
    260     int32_t count = uenum_count(e.getAlias(), status);
    261 
    262 #ifdef DEBUG_DETECT
    263     printf("There are %d recognizers.\n", count);
    264 #endif
    265 
    266     for(int32_t i = 0; i < count; i += 1) {
    267         int32_t length;
    268         const char *name = uenum_next(e.getAlias(), &length, status);
    269 
    270         if(name == NULL || length <= 0) {
    271             errln("ucsdet_getAllDetectableCharsets() returned a null or empty name!");
    272         }
    273 
    274 #ifdef DEBUG_DETECT
    275         printf("%s\n", name);
    276 #endif
    277     }
    278 }
    279 
    280 void CharsetDetectionTest::UTF8Test()
    281 {
    282     UErrorCode status = U_ZERO_ERROR;
    283     UnicodeString ss = "This is a string with some non-ascii characters that will "
    284                        "be converted to UTF-8, then shoved through the detection process.  "
    285                        "\\u0391\\u0392\\u0393\\u0394\\u0395"
    286                        "Sure would be nice if our source could contain Unicode directly!";
    287     UnicodeString s = ss.unescape();
    288     int32_t byteLength = 0, sLength = s.length();
    289     char *bytes = extractBytes(s, "UTF-8", byteLength);
    290     UCharsetDetector *csd = ucsdet_open(&status);
    291     const UCharsetMatch *match;
    292     UChar *detected = NEW_ARRAY(UChar, sLength);
    293 
    294     ucsdet_setText(csd, bytes, byteLength, &status);
    295     match = ucsdet_detect(csd, &status);
    296 
    297     if (match == NULL) {
    298         errln("Detection failure for UTF-8: got no matches.");
    299         goto bail;
    300     }
    301 
    302     ucsdet_getUChars(match, detected, sLength, &status);
    303 
    304     if (s.compare(detected, sLength) != 0) {
    305         errln("Round-trip test failed!");
    306     }
    307 
    308     ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
    309 
    310 bail:
    311     DELETE_ARRAY(detected);
    312     freeBytes(bytes);
    313     ucsdet_close(csd);
    314 }
    315 
    316 void CharsetDetectionTest::UTF16Test()
    317 {
    318     UErrorCode status = U_ZERO_ERROR;
    319     /* Notice the BOM on the start of this string */
    320     UChar chars[] = {
    321         0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
    322         0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
    323         0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
    324         0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
    325         0x064a, 0x062a, 0x0000};
    326     UnicodeString s(chars);
    327     int32_t beLength = 0, leLength = 0;
    328     char *beBytes = extractBytes(s, "UTF-16BE", beLength);
    329     char *leBytes = extractBytes(s, "UTF-16LE", leLength);
    330     UCharsetDetector *csd = ucsdet_open(&status);
    331     const UCharsetMatch *match;
    332     const char *name;
    333     int32_t conf;
    334 
    335     ucsdet_setText(csd, beBytes, beLength, &status);
    336     match = ucsdet_detect(csd, &status);
    337 
    338     if (match == NULL) {
    339         errln("Encoding detection failure for UTF-16BE: got no matches.");
    340         goto try_le;
    341     }
    342 
    343     name  = ucsdet_getName(match, &status);
    344     conf  = ucsdet_getConfidence(match, &status);
    345 
    346     if (strcmp(name, "UTF-16BE") != 0) {
    347         errln("Encoding detection failure for UTF-16BE: got %s", name);
    348         goto try_le; // no point in looking at confidence if we got the wrong character set.
    349     }
    350 
    351     if (conf != 100) {
    352         errln("Did not get 100%% confidence for UTF-16BE: got %d", conf);
    353     }
    354 
    355 try_le:
    356     ucsdet_setText(csd, leBytes, leLength, &status);
    357     match = ucsdet_detect(csd, &status);
    358 
    359     if (match == NULL) {
    360         errln("Encoding detection failure for UTF-16LE: got no matches.");
    361         goto bail;
    362     }
    363 
    364     name  = ucsdet_getName(match, &status);
    365     conf = ucsdet_getConfidence(match, &status);
    366 
    367 
    368     if (strcmp(name, "UTF-16LE") != 0) {
    369         errln("Enconding detection failure for UTF-16LE: got %s", name);
    370         goto bail; // no point in looking at confidence if we got the wrong character set.
    371     }
    372 
    373     if (conf != 100) {
    374         errln("Did not get 100%% confidence for UTF-16LE: got %d", conf);
    375     }
    376 
    377 bail:
    378     freeBytes(leBytes);
    379     freeBytes(beBytes);
    380     ucsdet_close(csd);
    381 }
    382 
    383 void CharsetDetectionTest::InputFilterTest()
    384 {
    385     UErrorCode status = U_ZERO_ERROR;
    386     UnicodeString ss = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
    387     UnicodeString s  = ss.unescape();
    388     int32_t byteLength = 0;
    389     char *bytes = extractBytes(s, "ISO-8859-1", byteLength);
    390     UCharsetDetector *csd = ucsdet_open(&status);
    391     const UCharsetMatch *match;
    392     const char *lang, *name;
    393 
    394     ucsdet_enableInputFilter(csd, TRUE);
    395 
    396     if (!ucsdet_isInputFilterEnabled(csd)) {
    397         errln("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!");
    398     }
    399 
    400 
    401     ucsdet_setText(csd, bytes, byteLength, &status);
    402     match = ucsdet_detect(csd, &status);
    403 
    404     if (match == NULL) {
    405         errln("Turning on the input filter resulted in no matches.");
    406         goto turn_off;
    407     }
    408 
    409     name = ucsdet_getName(match, &status);
    410 
    411     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
    412         errln("Turning on the input filter resulted in %s rather than ISO-8859-1.", name);
    413     } else {
    414         lang = ucsdet_getLanguage(match, &status);
    415 
    416         if (lang == NULL || strcmp(lang, "fr") != 0) {
    417             errln("Input filter did not strip markup!");
    418         }
    419     }
    420 
    421 turn_off:
    422     ucsdet_enableInputFilter(csd, FALSE);
    423     ucsdet_setText(csd, bytes, byteLength, &status);
    424     match = ucsdet_detect(csd, &status);
    425 
    426     if (match == NULL) {
    427         errln("Turning off the input filter resulted in no matches.");
    428         goto bail;
    429     }
    430 
    431     name = ucsdet_getName(match, &status);
    432 
    433     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
    434         errln("Turning off the input filter resulted in %s rather than ISO-8859-1.", name);
    435     } else {
    436         lang = ucsdet_getLanguage(match, &status);
    437 
    438         if (lang == NULL || strcmp(lang, "en") != 0) {
    439             errln("Unfiltered input did not detect as English!");
    440         }
    441     }
    442 
    443 bail:
    444     freeBytes(bytes);
    445     ucsdet_close(csd);
    446 }
    447 
    448 void CharsetDetectionTest::C1BytesTest()
    449 {
    450 #if !UCONFIG_NO_LEGACY_CONVERSION
    451     UErrorCode status = U_ZERO_ERROR;
    452     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
    453     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
    454     UnicodeString sWindows  = ssWindows.unescape();
    455     int32_t lISO = 0, lWindows = 0;
    456     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
    457     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
    458     UCharsetDetector *csd = ucsdet_open(&status);
    459     const UCharsetMatch *match;
    460     const char *name;
    461 
    462     ucsdet_setText(csd, bWindows, lWindows, &status);
    463     match = ucsdet_detect(csd, &status);
    464 
    465     if (match == NULL) {
    466         errcheckln(status, "English test with C1 bytes got no matches. - %s", u_errorName(status));
    467         goto bail;
    468     }
    469 
    470     name  = ucsdet_getName(match, &status);
    471 
    472     if (strcmp(name, "windows-1252") != 0) {
    473         errln("English text with C1 bytes does not detect as windows-1252, but as %s", name);
    474     }
    475 
    476     ucsdet_setText(csd, bISO, lISO, &status);
    477     match = ucsdet_detect(csd, &status);
    478 
    479     if (match == NULL) {
    480         errln("English text without C1 bytes got no matches.");
    481         goto bail;
    482     }
    483 
    484     name  = ucsdet_getName(match, &status);
    485 
    486     if (strcmp(name, "ISO-8859-1") != 0) {
    487         errln("English text without C1 bytes does not detect as ISO-8859-1, but as %s", name);
    488     }
    489 
    490 bail:
    491     freeBytes(bWindows);
    492     freeBytes(bISO);
    493 
    494     ucsdet_close(csd);
    495 #endif
    496 }
    497 
    498 void CharsetDetectionTest::DetectionTest()
    499 {
    500 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
    501     UErrorCode status = U_ZERO_ERROR;
    502     char path[2048];
    503     const char *testFilePath = getPath(path, "csdetest.xml");
    504 
    505     if (testFilePath == NULL) {
    506         return; /* Couldn't get path: error message already output. */
    507     }
    508 
    509     UXMLParser  *parser = UXMLParser::createParser(status);
    510     if (U_FAILURE(status)) {
    511         dataerrln("FAIL: UXMLParser::createParser (%s)", u_errorName(status));
    512         return;
    513     }
    514 
    515     UXMLElement *root   = parser->parseFile(testFilePath, status);
    516     if (!assertSuccess( "parseFile",status)) return;
    517 
    518     UnicodeString test_case = UNICODE_STRING_SIMPLE("test-case");
    519     UnicodeString id_attr   = UNICODE_STRING_SIMPLE("id");
    520     UnicodeString enc_attr  = UNICODE_STRING_SIMPLE("encodings");
    521 
    522     const UXMLElement *testCase;
    523     int32_t tc = 0;
    524 
    525     while((testCase = root->nextChildElement(tc)) != NULL) {
    526         if (testCase->getTagName().compare(test_case) == 0) {
    527             const UnicodeString *id = testCase->getAttribute(id_attr);
    528             const UnicodeString *encodings = testCase->getAttribute(enc_attr);
    529             const UnicodeString  text = testCase->getText(TRUE);
    530             int32_t encodingCount;
    531             UnicodeString *encodingList = split(*encodings, CH_SPACE, encodingCount);
    532 
    533             for(int32_t e = 0; e < encodingCount; e += 1) {
    534                 checkEncoding(text, encodingList[e], *id);
    535             }
    536 
    537             delete[] encodingList;
    538         }
    539     }
    540 
    541     delete root;
    542     delete parser;
    543 #endif
    544 }
    545 
    546 void CharsetDetectionTest::IBM424Test()
    547 {
    548     UErrorCode status = U_ZERO_ERROR;
    549 
    550     static const UChar chars[] = {
    551             0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
    552             0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
    553             0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
    554             0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
    555             0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
    556             0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
    557             0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
    558             0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
    559             0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
    560             0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
    561             0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
    562             0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
    563             0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
    564             0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
    565             0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
    566             0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
    567             0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
    568     };
    569 
    570     static const UChar chars_reverse[] = {
    571             0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
    572             0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
    573             0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
    574             0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
    575             0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
    576             0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
    577             0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
    578             0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
    579             0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
    580             0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
    581             0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
    582             0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
    583             0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
    584             0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
    585             0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
    586             0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
    587             0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
    588             0x0000
    589     };
    590 
    591     int32_t bLength = 0, brLength = 0;
    592 
    593     UnicodeString s1(chars);
    594     UnicodeString s2(chars_reverse);
    595 
    596     char *bytes = extractBytes(s1, "IBM424", bLength);
    597     char *bytes_r = extractBytes(s2, "IBM424", brLength);
    598 
    599     UCharsetDetector *csd = ucsdet_open(&status);
    600     if (U_FAILURE(status)) {
    601         errln("Error opening charset detector. - %s", u_errorName(status));
    602     }
    603     const UCharsetMatch *match;
    604     const char *name;
    605 
    606     ucsdet_setText(csd, bytes, bLength, &status);
    607     match = ucsdet_detect(csd, &status);
    608 
    609     if (match == NULL) {
    610         errcheckln(status, "Encoding detection failure for IBM424_rtl: got no matches. - %s", u_errorName(status));
    611         goto bail;
    612     }
    613 
    614     name  = ucsdet_getName(match, &status);
    615     if (strcmp(name, "IBM424_rtl") != 0) {
    616         errln("Encoding detection failure for IBM424_rtl: got %s", name);
    617     }
    618 
    619     ucsdet_setText(csd, bytes_r, brLength, &status);
    620     match = ucsdet_detect(csd, &status);
    621 
    622     if (match == NULL) {
    623         errln("Encoding detection failure for IBM424_ltr: got no matches.");
    624         goto bail;
    625     }
    626 
    627     name  = ucsdet_getName(match, &status);
    628     if (strcmp(name, "IBM424_ltr") != 0) {
    629         errln("Encoding detection failure for IBM424_ltr: got %s", name);
    630     }
    631 
    632 bail:
    633     freeBytes(bytes);
    634     freeBytes(bytes_r);
    635     ucsdet_close(csd);
    636 }
    637 
    638 void CharsetDetectionTest::IBM420Test()
    639 {
    640     UErrorCode status = U_ZERO_ERROR;
    641 
    642     static const UChar chars[] = {
    643         0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
    644         0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
    645         0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
    646         0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
    647         0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
    648         0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
    649         0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
    650         0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
    651         0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
    652         0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
    653         0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
    654         0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
    655         0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
    656         0x0000
    657     };
    658     static const UChar chars_reverse[] = {
    659         0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
    660         0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
    661         0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
    662         0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
    663         0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
    664         0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
    665         0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
    666         0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
    667         0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
    668         0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
    669         0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
    670         0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
    671         0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
    672         0x0000,
    673     };
    674 
    675     int32_t bLength = 0, brLength = 0;
    676 
    677     UnicodeString s1(chars);
    678     UnicodeString s2(chars_reverse);
    679 
    680     char *bytes = extractBytes(s1, "IBM420", bLength);
    681     char *bytes_r = extractBytes(s2, "IBM420", brLength);
    682 
    683     UCharsetDetector *csd = ucsdet_open(&status);
    684     if (U_FAILURE(status)) {
    685         errln("Error opening charset detector. - %s", u_errorName(status));
    686     }
    687     const UCharsetMatch *match;
    688     const char *name;
    689 
    690     ucsdet_setText(csd, bytes, bLength, &status);
    691     match = ucsdet_detect(csd, &status);
    692 
    693     if (match == NULL) {
    694         errcheckln(status, "Encoding detection failure for IBM420_rtl: got no matches. - %s", u_errorName(status));
    695         goto bail;
    696     }
    697 
    698     name  = ucsdet_getName(match, &status);
    699     if (strcmp(name, "IBM420_rtl") != 0) {
    700         errln("Encoding detection failure for IBM420_rtl: got %s\n", name);
    701     }
    702 
    703     ucsdet_setText(csd, bytes_r, brLength, &status);
    704     match = ucsdet_detect(csd, &status);
    705 
    706     if (match == NULL) {
    707         errln("Encoding detection failure for IBM420_ltr: got no matches.\n");
    708         goto bail;
    709     }
    710 
    711     name  = ucsdet_getName(match, &status);
    712     if (strcmp(name, "IBM420_ltr") != 0) {
    713         errln("Encoding detection failure for IBM420_ltr: got %s\n", name);
    714     }
    715 
    716 bail:
    717     freeBytes(bytes);
    718     freeBytes(bytes_r);
    719     ucsdet_close(csd);
    720 }
    721 
    722 
    723 void CharsetDetectionTest::Ticket6394Test() {
    724 #if !UCONFIG_NO_CONVERSION
    725     const char charText[] =  "Here is some random English text that should be detected as ISO-8859-1."
    726                              "Ticket 6394 claims that ISO-8859-1 will appear in the array of detected "
    727                              "encodings more than once.  The hop through UnicodeString is for platforms "
    728                              "where this char * string is be EBCDIC and needs conversion to Latin1.";
    729     char latin1Text[sizeof(charText)];
    730     UnicodeString(charText).extract(0, sizeof(charText)-2, latin1Text, sizeof(latin1Text), "ISO-8859-1");
    731 
    732     UErrorCode status = U_ZERO_ERROR;
    733     UCharsetDetector *csd = ucsdet_open(&status);
    734     ucsdet_setText(csd, latin1Text, -1, &status);
    735     if (U_FAILURE(status)) {
    736         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
    737         return;
    738     }
    739 
    740     int32_t matchCount = 0;
    741     const UCharsetMatch **matches = ucsdet_detectAll(csd, &matchCount, &status);
    742     if (U_FAILURE(status)) {
    743         errln("Fail at file %s, line %d.  status = %s", __FILE__, __LINE__, u_errorName(status));
    744         return;
    745     }
    746 
    747     UnicodeSet  setOfCharsetNames;    // UnicodSets can hold strings.
    748     int32_t i;
    749     for (i=0; i<matchCount; i++) {
    750         UnicodeString charSetName(ucsdet_getName(matches[i], &status));
    751         if (U_FAILURE(status)) {
    752             errln("Fail at file %s, line %d.  status = %s;  i=%d", __FILE__, __LINE__, u_errorName(status), i);
    753             status = U_ZERO_ERROR;
    754         }
    755         if (setOfCharsetNames.contains(charSetName)) {
    756             errln("Fail at file %s, line %d ", __FILE__, __LINE__);
    757             errln(UnicodeString("   Duplicate charset name = ") + charSetName);
    758         }
    759         setOfCharsetNames.add(charSetName);
    760     }
    761     ucsdet_close(csd);
    762 #endif
    763 }
    764 
    765 
    766 // Ticket 6954 - trouble with the haveC1Bytes flag that is used to distinguish between
    767 //               similar Windows and non-Windows SBCS encodings. State was kept in the shared
    768 //               Charset Recognizer objects, and could be overwritten.
    769 void CharsetDetectionTest::Ticket6954Test() {
    770 #if !UCONFIG_NO_CONVERSION
    771     UErrorCode status = U_ZERO_ERROR;
    772     UnicodeString sISO = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
    773     UnicodeString ssWindows("This is another small sample of some English text. Just enough to be sure that it detects correctly."
    774                             "It also includes some \\u201CC1\\u201D bytes.", -1, US_INV);
    775     UnicodeString sWindows  = ssWindows.unescape();
    776     int32_t lISO = 0, lWindows = 0;
    777     char *bISO = extractBytes(sISO, "ISO-8859-1", lISO);
    778     char *bWindows = extractBytes(sWindows, "windows-1252", lWindows);
    779 
    780     // First do a plain vanilla detect of 1252 text
    781 
    782     UCharsetDetector *csd1 = ucsdet_open(&status);
    783     ucsdet_setText(csd1, bWindows, lWindows, &status);
    784     const UCharsetMatch *match1 = ucsdet_detect(csd1, &status);
    785     const char *name1 = ucsdet_getName(match1, &status);
    786     TEST_ASSERT_SUCCESS(status);
    787     TEST_ASSERT(strcmp(name1, "windows-1252")==0);
    788 
    789     // Next, using a completely separate detector, detect some 8859-1 text
    790 
    791     UCharsetDetector *csd2 = ucsdet_open(&status);
    792     ucsdet_setText(csd2, bISO, lISO, &status);
    793     const UCharsetMatch *match2 = ucsdet_detect(csd2, &status);
    794     const char *name2 = ucsdet_getName(match2, &status);
    795     TEST_ASSERT_SUCCESS(status);
    796     TEST_ASSERT(strcmp(name2, "ISO-8859-1")==0);
    797 
    798     // Recheck the 1252 results from the first detector, which should not have been
    799     //  altered by the use of a different detector.
    800 
    801     name1 = ucsdet_getName(match1, &status);
    802     TEST_ASSERT_SUCCESS(status);
    803     TEST_ASSERT(strcmp(name1, "windows-1252")==0);
    804 
    805     ucsdet_close(csd1);
    806     ucsdet_close(csd2);
    807     freeBytes(bISO);
    808     freeBytes(bWindows);
    809 #endif
    810 }
    811