Home | History | Annotate | Download | only in cintltst
      1 /*
      2  ****************************************************************************
      3  * Copyright (c) 2005-2009, International Business Machines Corporation and *
      4  * others. All Rights Reserved.                                             *
      5  ****************************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #include "unicode/ucsdet.h"
     11 #include "unicode/ucnv.h"
     12 #include "unicode/ustring.h"
     13 
     14 #include "cintltst.h"
     15 
     16 #include <stdlib.h>
     17 #include <string.h>
     18 
     19 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
     20 
     21 #define NEW_ARRAY(type,count) (type *) malloc((count) * sizeof(type))
     22 #define DELETE_ARRAY(array) free(array)
     23 
     24 static void TestConstruction(void);
     25 static void TestUTF8(void);
     26 static void TestUTF16(void);
     27 static void TestC1Bytes(void);
     28 static void TestInputFilter(void);
     29 static void TestChaining(void);
     30 static void TestBufferOverflow(void);
     31 static void TestIBM424(void);
     32 static void TestIBM420(void);
     33 
     34 void addUCsdetTest(TestNode** root);
     35 
     36 void addUCsdetTest(TestNode** root)
     37 {
     38     addTest(root, &TestConstruction, "ucsdetst/TestConstruction");
     39     addTest(root, &TestUTF8, "ucsdetst/TestUTF8");
     40     addTest(root, &TestUTF16, "ucsdetst/TestUTF16");
     41     addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes");
     42     addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter");
     43     addTest(root, &TestChaining, "ucsdetst/TestErrorChaining");
     44     addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow");
     45 #if !UCONFIG_NO_LEGACY_CONVERSION
     46     addTest(root, &TestIBM424, "ucsdetst/TestIBM424");
     47     addTest(root, &TestIBM420, "ucsdetst/TestIBM420");
     48 #endif
     49 }
     50 
     51 static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv)
     52 {
     53     UErrorCode status;
     54     char buffer[1024];
     55     char *dest, *destLimit = buffer + sizeof(buffer);
     56     const UChar *srcLimit = src + length;
     57     int32_t result = 0;
     58 
     59     do {
     60         dest = buffer;
     61         status = U_ZERO_ERROR;
     62         ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
     63         result += (int32_t) (dest - buffer);
     64     } while (status == U_BUFFER_OVERFLOW_ERROR);
     65 
     66     return result;
     67 }
     68 
     69 static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength)
     70 {
     71     UErrorCode status = U_ZERO_ERROR;
     72     UConverter *cnv = ucnv_open(codepage, &status);
     73     int32_t byteCount = preflight(src, length, cnv);
     74     const UChar *srcLimit = src + length;
     75     char *bytes = NEW_ARRAY(char, byteCount + 1);
     76     char *dest = bytes, *destLimit = bytes + byteCount + 1;
     77 
     78     ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
     79     ucnv_close(cnv);
     80 
     81     *byteLength = byteCount;
     82     return bytes;
     83 }
     84 
     85 static void freeBytes(char *bytes)
     86 {
     87     DELETE_ARRAY(bytes);
     88 }
     89 
     90 static void TestConstruction(void)
     91 {
     92     UErrorCode status = U_ZERO_ERROR;
     93     UCharsetDetector *csd = ucsdet_open(&status);
     94     UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
     95     const char *name;
     96     int32_t count = uenum_count(e, &status);
     97     int32_t i, length;
     98 
     99     for(i = 0; i < count; i += 1) {
    100         name = uenum_next(e, &length, &status);
    101 
    102         if(name == NULL || length <= 0) {
    103             log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
    104         }
    105     }
    106     /* one past the list of all names must return NULL */
    107     name = uenum_next(e, &length, &status);
    108     if(name != NULL || length != 0 || U_FAILURE(status)) {
    109         log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n");
    110     }
    111 
    112     uenum_close(e);
    113     ucsdet_close(csd);
    114 }
    115 
    116 static void TestUTF8(void)
    117 {
    118     UErrorCode status = U_ZERO_ERROR;
    119     static const char ss[] = "This is a string with some non-ascii characters that will "
    120                "be converted to UTF-8, then shoved through the detection process.  "
    121                "\\u0391\\u0392\\u0393\\u0394\\u0395"
    122                "Sure would be nice if our source could contain Unicode directly!";
    123     int32_t byteLength = 0, sLength = 0, dLength = 0;
    124     UChar s[sizeof(ss)];
    125     char *bytes;
    126     UCharsetDetector *csd = ucsdet_open(&status);
    127     const UCharsetMatch *match;
    128     UChar detected[sizeof(ss)];
    129 
    130     sLength = u_unescape(ss, s, sizeof(ss));
    131     bytes = extractBytes(s, sLength, "UTF-8", &byteLength);
    132 
    133     ucsdet_setText(csd, bytes, byteLength, &status);
    134     if (U_FAILURE(status)) {
    135         log_err("status is %s\n", u_errorName(status));
    136         goto bail;
    137     }
    138 
    139     match = ucsdet_detect(csd, &status);
    140 
    141     if (match == NULL) {
    142         log_err("Detection failure for UTF-8: got no matches.\n");
    143         goto bail;
    144     }
    145 
    146     dLength = ucsdet_getUChars(match, detected, sLength, &status);
    147 
    148     if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) {
    149         log_err("Round-trip test failed!\n");
    150     }
    151 
    152     ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
    153 
    154 bail:
    155     freeBytes(bytes);
    156     ucsdet_close(csd);
    157 }
    158 
    159 static void TestUTF16(void)
    160 {
    161     UErrorCode status = U_ZERO_ERROR;
    162     /* Notice the BOM on the start of this string */
    163     static const UChar chars[] = {
    164         0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
    165         0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
    166         0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
    167         0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
    168         0x064a, 0x062a, 0x0000};
    169     int32_t beLength = 0, leLength = 0, cLength = ARRAY_SIZE(chars);
    170     char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength);
    171     char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength);
    172     UCharsetDetector *csd = ucsdet_open(&status);
    173     const UCharsetMatch *match;
    174     const char *name;
    175     int32_t conf;
    176 
    177     ucsdet_setText(csd, beBytes, beLength, &status);
    178     match = ucsdet_detect(csd, &status);
    179 
    180     if (match == NULL) {
    181         log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
    182         goto try_le;
    183     }
    184 
    185     name  = ucsdet_getName(match, &status);
    186     conf  = ucsdet_getConfidence(match, &status);
    187 
    188     if (strcmp(name, "UTF-16BE") != 0) {
    189         log_err("Encoding detection failure for UTF-16BE: got %s\n", name);
    190     }
    191 
    192     if (conf != 100) {
    193         log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf);
    194     }
    195 
    196 try_le:
    197     ucsdet_setText(csd, leBytes, leLength, &status);
    198     match = ucsdet_detect(csd, &status);
    199 
    200     if (match == NULL) {
    201         log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
    202         goto bail;
    203     }
    204 
    205     name  = ucsdet_getName(match, &status);
    206     conf = ucsdet_getConfidence(match, &status);
    207 
    208 
    209     if (strcmp(name, "UTF-16LE") != 0) {
    210         log_err("Enconding detection failure for UTF-16LE: got %s\n", name);
    211     }
    212 
    213     if (conf != 100) {
    214         log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf);
    215     }
    216 
    217 bail:
    218     freeBytes(leBytes);
    219     freeBytes(beBytes);
    220     ucsdet_close(csd);
    221 }
    222 
    223 static void TestC1Bytes(void)
    224 {
    225 #if !UCONFIG_NO_LEGACY_CONVERSION
    226     UErrorCode status = U_ZERO_ERROR;
    227     static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
    228     static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
    229     int32_t sISOLength = 0, sWindowsLength = 0;
    230     UChar sISO[sizeof(ssISO)];
    231     UChar sWindows[sizeof(ssWindows)];
    232     int32_t lISO = 0, lWindows = 0;
    233     char *bISO;
    234     char *bWindows;
    235     UCharsetDetector *csd = ucsdet_open(&status);
    236     const UCharsetMatch *match;
    237     const char *name;
    238 
    239     sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO));
    240     sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows));
    241     bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO);
    242     bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows);
    243 
    244     ucsdet_setText(csd, bWindows, lWindows, &status);
    245     match = ucsdet_detect(csd, &status);
    246 
    247     if (match == NULL) {
    248         log_err("English test with C1 bytes got no matches.\n");
    249         goto bail;
    250     }
    251 
    252     name  = ucsdet_getName(match, &status);
    253 
    254     if (strcmp(name, "windows-1252") != 0) {
    255         log_data_err("English text with C1 bytes does not detect as windows-1252, but as %s. (Are you missing data?)\n", name);
    256     }
    257 
    258     ucsdet_setText(csd, bISO, lISO, &status);
    259     match = ucsdet_detect(csd, &status);
    260 
    261     if (match == NULL) {
    262         log_err("English text without C1 bytes got no matches.\n");
    263         goto bail;
    264     }
    265 
    266     name  = ucsdet_getName(match, &status);
    267 
    268     if (strcmp(name, "ISO-8859-1") != 0) {
    269         log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name);
    270     }
    271 
    272 bail:
    273     freeBytes(bWindows);
    274     freeBytes(bISO);
    275 
    276     ucsdet_close(csd);
    277 #endif
    278 }
    279 
    280 static void TestInputFilter(void)
    281 {
    282     UErrorCode status = U_ZERO_ERROR;
    283     static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
    284     int32_t sLength = 0;
    285     UChar s[sizeof(ss)];
    286     int32_t byteLength = 0;
    287     char *bytes;
    288     UCharsetDetector *csd = ucsdet_open(&status);
    289     const UCharsetMatch *match;
    290     const char *lang, *name;
    291 
    292     sLength = u_unescape(ss, s, sizeof(ss));
    293     bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength);
    294 
    295     ucsdet_enableInputFilter(csd, TRUE);
    296 
    297     if (!ucsdet_isInputFilterEnabled(csd)) {
    298         log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n");
    299     }
    300 
    301 
    302     ucsdet_setText(csd, bytes, byteLength, &status);
    303     match = ucsdet_detect(csd, &status);
    304 
    305     if (match == NULL) {
    306         log_err("Turning on the input filter resulted in no matches.\n");
    307         goto turn_off;
    308     }
    309 
    310     name = ucsdet_getName(match, &status);
    311 
    312     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
    313         log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
    314     } else {
    315         lang = ucsdet_getLanguage(match, &status);
    316 
    317         if (lang == NULL || strcmp(lang, "fr") != 0) {
    318             log_err("Input filter did not strip markup!\n");
    319         }
    320     }
    321 
    322 turn_off:
    323     ucsdet_enableInputFilter(csd, FALSE);
    324     ucsdet_setText(csd, bytes, byteLength, &status);
    325     match = ucsdet_detect(csd, &status);
    326 
    327     if (match == NULL) {
    328         log_err("Turning off the input filter resulted in no matches.\n");
    329         goto bail;
    330     }
    331 
    332     name = ucsdet_getName(match, &status);
    333 
    334     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
    335         log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
    336     } else {
    337         lang = ucsdet_getLanguage(match, &status);
    338 
    339         if (lang == NULL || strcmp(lang, "en") != 0) {
    340             log_err("Unfiltered input did not detect as English!\n");
    341         }
    342     }
    343 
    344 bail:
    345     freeBytes(bytes);
    346     ucsdet_close(csd);
    347 }
    348 
    349 static void TestChaining(void) {
    350     UErrorCode status = U_USELESS_COLLATOR_ERROR;
    351 
    352     ucsdet_open(&status);
    353     ucsdet_setText(NULL, NULL, 0, &status);
    354     ucsdet_getName(NULL, &status);
    355     ucsdet_getConfidence(NULL, &status);
    356     ucsdet_getLanguage(NULL, &status);
    357     ucsdet_detect(NULL, &status);
    358     ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status);
    359     ucsdet_detectAll(NULL, NULL, &status);
    360     ucsdet_getUChars(NULL, NULL, 0, &status);
    361     ucsdet_getUChars(NULL, NULL, 0, &status);
    362     ucsdet_close(NULL);
    363 
    364     /* All of this code should have done nothing. */
    365     if (status != U_USELESS_COLLATOR_ERROR) {
    366         log_err("Status got changed to %s\n", u_errorName(status));
    367     }
    368 }
    369 
    370 static void TestBufferOverflow(void) {
    371     UErrorCode status = U_ZERO_ERROR;
    372     static const char *testStrings[] = {
    373         "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */
    374         "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */
    375         "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */
    376         "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */
    377         "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */
    378         "\xa1", /* Could be a single byte shift-jis at the end */
    379         "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */
    380         "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
    381     };
    382     static const char *testResults[] = {
    383         "windows-1252",
    384         "windows-1252",
    385         "windows-1252",
    386         "windows-1252",
    387         "ISO-2022-JP",
    388         NULL,
    389         NULL,
    390         "ISO-8859-1"
    391     };
    392     int32_t idx = 0;
    393     UCharsetDetector *csd = ucsdet_open(&status);
    394     const UCharsetMatch *match;
    395 
    396     ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status);
    397 
    398     if (U_FAILURE(status)) {
    399         log_err("Couldn't open detector. %s\n", u_errorName(status));
    400         goto bail;
    401     }
    402 
    403     for (idx = 0; idx < ARRAY_SIZE(testStrings); idx++) {
    404         ucsdet_setText(csd, testStrings[idx], -1, &status);
    405         match = ucsdet_detect(csd, &status);
    406 
    407         if (match == NULL) {
    408             if (testResults[idx] != NULL) {
    409                 log_err("Unexpectedly got no results at index %d.\n", idx);
    410             }
    411             else {
    412                 log_verbose("Got no result as expected at index %d.\n", idx);
    413             }
    414             continue;
    415         }
    416 
    417         if (testResults[idx] == NULL || strcmp(ucsdet_getName(match, &status), testResults[idx]) != 0) {
    418             log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n",
    419                 ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_getConfidence(match, &status));
    420             goto bail;
    421         }
    422     }
    423 
    424 bail:
    425     ucsdet_close(csd);
    426 }
    427 
    428 static void TestIBM424(void)
    429 {
    430     UErrorCode status = U_ZERO_ERROR;
    431 
    432     static const UChar chars[] = {
    433             0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
    434             0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
    435             0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
    436             0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
    437             0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
    438             0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
    439             0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
    440             0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
    441             0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
    442             0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
    443             0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
    444             0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
    445             0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
    446             0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
    447             0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
    448             0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
    449             0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
    450     };
    451 
    452     static const UChar chars_reverse[] = {
    453             0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
    454             0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
    455             0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
    456             0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
    457             0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
    458             0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
    459             0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
    460             0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
    461             0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
    462             0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
    463             0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
    464             0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
    465             0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
    466             0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
    467             0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
    468             0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
    469             0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
    470             0x0000
    471     };
    472 
    473     int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = ARRAY_SIZE(chars_reverse);
    474 
    475     char *bytes = extractBytes(chars, cLength, "IBM424", &bLength);
    476     char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength);
    477 
    478     UCharsetDetector *csd = ucsdet_open(&status);
    479     const UCharsetMatch *match;
    480     const char *name;
    481 
    482     ucsdet_setText(csd, bytes, bLength, &status);
    483     match = ucsdet_detect(csd, &status);
    484 
    485     if (match == NULL) {
    486         log_err("Encoding detection failure for IBM424_rtl: got no matches.\n");
    487         goto bail;
    488     }
    489 
    490     name  = ucsdet_getName(match, &status);
    491     if (strcmp(name, "IBM424_rtl") != 0) {
    492         log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are you missing data?)\n", name);
    493     }
    494 
    495     ucsdet_setText(csd, bytes_r, brLength, &status);
    496     match = ucsdet_detect(csd, &status);
    497 
    498     if (match == NULL) {
    499         log_err("Encoding detection failure for IBM424_ltr: got no matches.\n");
    500         goto bail;
    501     }
    502 
    503     name  = ucsdet_getName(match, &status);
    504     if (strcmp(name, "IBM424_ltr") != 0) {
    505         log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are you missing data?)\n", name);
    506     }
    507 
    508 bail:
    509     freeBytes(bytes);
    510     freeBytes(bytes_r);
    511     ucsdet_close(csd);
    512 }
    513 
    514 static void TestIBM420(void)
    515 {
    516     UErrorCode status = U_ZERO_ERROR;
    517 
    518     static const UChar chars[] = {
    519         0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
    520         0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
    521         0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
    522         0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
    523         0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
    524         0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
    525         0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
    526         0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
    527         0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
    528         0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
    529         0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
    530         0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
    531         0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
    532         0x0000
    533     };
    534     static const UChar chars_reverse[] = {
    535         0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
    536         0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
    537         0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
    538         0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
    539         0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
    540         0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
    541         0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
    542         0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
    543         0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
    544         0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
    545         0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
    546         0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
    547         0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
    548         0x0000,
    549     };
    550 
    551     int32_t bLength = 0, brLength = 0, cLength = ARRAY_SIZE(chars), crLength = ARRAY_SIZE(chars_reverse);
    552 
    553     char *bytes = extractBytes(chars, cLength, "IBM420", &bLength);
    554     char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength);
    555 
    556     UCharsetDetector *csd = ucsdet_open(&status);
    557     const UCharsetMatch *match;
    558     const char *name;
    559 
    560     ucsdet_setText(csd, bytes, bLength, &status);
    561     match = ucsdet_detect(csd, &status);
    562 
    563     if (match == NULL) {
    564         log_err("Encoding detection failure for IBM420_rtl: got no matches.\n");
    565         goto bail;
    566     }
    567 
    568     name  = ucsdet_getName(match, &status);
    569     if (strcmp(name, "IBM420_rtl") != 0) {
    570         log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are you missing data?)\n", name);
    571     }
    572 
    573     ucsdet_setText(csd, bytes_r, brLength, &status);
    574     match = ucsdet_detect(csd, &status);
    575 
    576     if (match == NULL) {
    577         log_err("Encoding detection failure for IBM420_ltr: got no matches.\n");
    578         goto bail;
    579     }
    580 
    581     name  = ucsdet_getName(match, &status);
    582     if (strcmp(name, "IBM420_ltr") != 0) {
    583         log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are you missing data?)\n", name);
    584     }
    585 
    586 bail:
    587     freeBytes(bytes);
    588     freeBytes(bytes_r);
    589     ucsdet_close(csd);
    590 }
    591