Home | History | Annotate | Download | only in cintltst
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4  ****************************************************************************
      5  * Copyright (c) 2005-2016, International Business Machines Corporation and *
      6  * others. All Rights Reserved.                                             *
      7  ****************************************************************************
      8  */
      9 
     10 #include "unicode/utypes.h"
     11 
     12 #include "unicode/ucsdet.h"
     13 #include "unicode/ucnv.h"
     14 #include "unicode/ustring.h"
     15 
     16 #include "cintltst.h"
     17 #include "cmemory.h"
     18 
     19 #include <stdlib.h>
     20 #include <string.h>
     21 
     22 #define NEW_ARRAY(type,count) (type *) malloc((count) * sizeof(type))
     23 #define DELETE_ARRAY(array) free(array)
     24 
     25 static void TestConstruction(void);
     26 static void TestUTF8(void);
     27 static void TestUTF16(void);
     28 static void TestC1Bytes(void);
     29 static void TestInputFilter(void);
     30 static void TestChaining(void);
     31 static void TestBufferOverflow(void);
     32 static void TestIBM424(void);
     33 static void TestIBM420(void);
     34 
     35 void addUCsdetTest(TestNode** root);
     36 
     37 void addUCsdetTest(TestNode** root)
     38 {
     39     addTest(root, &TestConstruction, "ucsdetst/TestConstruction");
     40     addTest(root, &TestUTF8, "ucsdetst/TestUTF8");
     41     addTest(root, &TestUTF16, "ucsdetst/TestUTF16");
     42     addTest(root, &TestC1Bytes, "ucsdetst/TestC1Bytes");
     43     addTest(root, &TestInputFilter, "ucsdetst/TestInputFilter");
     44     addTest(root, &TestChaining, "ucsdetst/TestErrorChaining");
     45     addTest(root, &TestBufferOverflow, "ucsdetst/TestBufferOverflow");
     46 #if !UCONFIG_NO_LEGACY_CONVERSION
     47     addTest(root, &TestIBM424, "ucsdetst/TestIBM424");
     48     addTest(root, &TestIBM420, "ucsdetst/TestIBM420");
     49 #endif
     50 }
     51 
     52 static int32_t preflight(const UChar *src, int32_t length, UConverter *cnv)
     53 {
     54     UErrorCode status;
     55     char buffer[1024];
     56     char *dest, *destLimit = buffer + sizeof(buffer);
     57     const UChar *srcLimit = src + length;
     58     int32_t result = 0;
     59 
     60     do {
     61         dest = buffer;
     62         status = U_ZERO_ERROR;
     63         ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
     64         result += (int32_t) (dest - buffer);
     65     } while (status == U_BUFFER_OVERFLOW_ERROR);
     66 
     67     return result;
     68 }
     69 
     70 static char *extractBytes(const UChar *src, int32_t length, const char *codepage, int32_t *byteLength)
     71 {
     72     UErrorCode status = U_ZERO_ERROR;
     73     UConverter *cnv = ucnv_open(codepage, &status);
     74     int32_t byteCount = preflight(src, length, cnv);
     75     const UChar *srcLimit = src + length;
     76     char *bytes = NEW_ARRAY(char, byteCount + 1);
     77     char *dest = bytes, *destLimit = bytes + byteCount + 1;
     78 
     79     ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &status);
     80     ucnv_close(cnv);
     81 
     82     *byteLength = byteCount;
     83     return bytes;
     84 }
     85 
     86 static void freeBytes(char *bytes)
     87 {
     88     DELETE_ARRAY(bytes);
     89 }
     90 
     91 static void TestConstruction(void)
     92 {
     93     UErrorCode status = U_ZERO_ERROR;
     94     UCharsetDetector *csd = ucsdet_open(&status);
     95     UEnumeration *e = ucsdet_getAllDetectableCharsets(csd, &status);
     96     const char *name;
     97     int32_t count = uenum_count(e, &status);
     98     int32_t i, length;
     99 
    100     for(i = 0; i < count; i += 1) {
    101         name = uenum_next(e, &length, &status);
    102 
    103         if(name == NULL || length <= 0) {
    104             log_err("ucsdet_getAllDetectableCharsets() returned a null or empty name!\n");
    105         }
    106     }
    107     /* one past the list of all names must return NULL */
    108     name = uenum_next(e, &length, &status);
    109     if(name != NULL || length != 0 || U_FAILURE(status)) {
    110         log_err("ucsdet_getAllDetectableCharsets(past the list) returned a non-null name!\n");
    111     }
    112 
    113     uenum_close(e);
    114     ucsdet_close(csd);
    115 }
    116 
    117 static void TestUTF8(void)
    118 {
    119     UErrorCode status = U_ZERO_ERROR;
    120     static const char ss[] = "This is a string with some non-ascii characters that will "
    121                "be converted to UTF-8, then shoved through the detection process.  "
    122                "\\u0391\\u0392\\u0393\\u0394\\u0395"
    123                "Sure would be nice if our source could contain Unicode directly!";
    124     int32_t byteLength = 0, sLength = 0, dLength = 0;
    125     UChar s[sizeof(ss)];
    126     char *bytes;
    127     UCharsetDetector *csd = ucsdet_open(&status);
    128     const UCharsetMatch *match;
    129     UChar detected[sizeof(ss)];
    130 
    131     sLength = u_unescape(ss, s, sizeof(ss));
    132     bytes = extractBytes(s, sLength, "UTF-8", &byteLength);
    133 
    134     ucsdet_setText(csd, bytes, byteLength, &status);
    135     if (U_FAILURE(status)) {
    136         log_err("status is %s\n", u_errorName(status));
    137         goto bail;
    138     }
    139 
    140     match = ucsdet_detect(csd, &status);
    141 
    142     if (match == NULL) {
    143         log_err("Detection failure for UTF-8: got no matches.\n");
    144         goto bail;
    145     }
    146 
    147     dLength = ucsdet_getUChars(match, detected, sLength, &status);
    148 
    149     if (u_strCompare(detected, dLength, s, sLength, FALSE) != 0) {
    150         log_err("Round-trip test failed!\n");
    151     }
    152 
    153     ucsdet_setDeclaredEncoding(csd, "UTF-8", 5, &status); /* for coverage */
    154 
    155 bail:
    156     freeBytes(bytes);
    157     ucsdet_close(csd);
    158 }
    159 
    160 static void TestUTF16(void)
    161 {
    162     UErrorCode status = U_ZERO_ERROR;
    163     /* Notice the BOM on the start of this string */
    164     static const UChar chars[] = {
    165         0xFEFF, 0x0623, 0x0648, 0x0631, 0x0648, 0x0628, 0x0627, 0x002C,
    166         0x0020, 0x0628, 0x0631, 0x0645, 0x062c, 0x064a, 0x0627, 0x062a,
    167         0x0020, 0x0627, 0x0644, 0x062d, 0x0627, 0x0633, 0x0648, 0x0628,
    168         0x0020, 0x002b, 0x0020, 0x0627, 0x0646, 0x062a, 0x0631, 0x0646,
    169         0x064a, 0x062a, 0x0000};
    170     int32_t beLength = 0, leLength = 0, cLength = UPRV_LENGTHOF(chars);
    171     char *beBytes = extractBytes(chars, cLength, "UTF-16BE", &beLength);
    172     char *leBytes = extractBytes(chars, cLength, "UTF-16LE", &leLength);
    173     UCharsetDetector *csd = ucsdet_open(&status);
    174     const UCharsetMatch *match;
    175     const char *name;
    176     int32_t conf;
    177 
    178     ucsdet_setText(csd, beBytes, beLength, &status);
    179     match = ucsdet_detect(csd, &status);
    180 
    181     if (match == NULL) {
    182         log_err("Encoding detection failure for UTF-16BE: got no matches.\n");
    183         goto try_le;
    184     }
    185 
    186     name  = ucsdet_getName(match, &status);
    187     conf  = ucsdet_getConfidence(match, &status);
    188 
    189     if (strcmp(name, "UTF-16BE") != 0) {
    190         log_err("Encoding detection failure for UTF-16BE: got %s\n", name);
    191     }
    192 
    193     if (conf != 100) {
    194         log_err("Did not get 100%% confidence for UTF-16BE: got %d\n", conf);
    195     }
    196 
    197 try_le:
    198     ucsdet_setText(csd, leBytes, leLength, &status);
    199     match = ucsdet_detect(csd, &status);
    200 
    201     if (match == NULL) {
    202         log_err("Encoding detection failure for UTF-16LE: got no matches.\n");
    203         goto bail;
    204     }
    205 
    206     name  = ucsdet_getName(match, &status);
    207     conf = ucsdet_getConfidence(match, &status);
    208 
    209 
    210     if (strcmp(name, "UTF-16LE") != 0) {
    211         log_err("Enconding detection failure for UTF-16LE: got %s\n", name);
    212     }
    213 
    214     if (conf != 100) {
    215         log_err("Did not get 100%% confidence for UTF-16LE: got %d\n", conf);
    216     }
    217 
    218 bail:
    219     freeBytes(leBytes);
    220     freeBytes(beBytes);
    221     ucsdet_close(csd);
    222 }
    223 
    224 static void TestC1Bytes(void)
    225 {
    226 #if !UCONFIG_NO_LEGACY_CONVERSION
    227     UErrorCode status = U_ZERO_ERROR;
    228     static const char ssISO[] = "This is a small sample of some English text. Just enough to be sure that it detects correctly.";
    229     static const char ssWindows[] = "This is another small sample of some English text. Just enough to be sure that it detects correctly. It also includes some \\u201CC1\\u201D bytes.";
    230     int32_t sISOLength = 0, sWindowsLength = 0;
    231     UChar sISO[sizeof(ssISO)];
    232     UChar sWindows[sizeof(ssWindows)];
    233     int32_t lISO = 0, lWindows = 0;
    234     char *bISO;
    235     char *bWindows;
    236     UCharsetDetector *csd = ucsdet_open(&status);
    237     const UCharsetMatch *match;
    238     const char *name;
    239 
    240     sISOLength = u_unescape(ssISO, sISO, sizeof(ssISO));
    241     sWindowsLength = u_unescape(ssWindows, sWindows, sizeof(ssWindows));
    242     bISO = extractBytes(sISO, sISOLength, "ISO-8859-1", &lISO);
    243     bWindows = extractBytes(sWindows, sWindowsLength, "windows-1252", &lWindows);
    244 
    245     ucsdet_setText(csd, bWindows, lWindows, &status);
    246     match = ucsdet_detect(csd, &status);
    247 
    248     if (match == NULL) {
    249         log_err("English test with C1 bytes got no matches.\n");
    250         goto bail;
    251     }
    252 
    253     name  = ucsdet_getName(match, &status);
    254 
    255     if (strcmp(name, "windows-1252") != 0) {
    256         log_data_err("English text with C1 bytes does not detect as windows-1252, but as %s. (Are you missing data?)\n", name);
    257     }
    258 
    259     ucsdet_setText(csd, bISO, lISO, &status);
    260     match = ucsdet_detect(csd, &status);
    261 
    262     if (match == NULL) {
    263         log_err("English text without C1 bytes got no matches.\n");
    264         goto bail;
    265     }
    266 
    267     name  = ucsdet_getName(match, &status);
    268 
    269     if (strcmp(name, "ISO-8859-1") != 0) {
    270         log_err("English text without C1 bytes does not detect as ISO-8859-1, but as %s\n", name);
    271     }
    272 
    273 bail:
    274     freeBytes(bWindows);
    275     freeBytes(bISO);
    276 
    277     ucsdet_close(csd);
    278 #endif
    279 }
    280 
    281 static void TestInputFilter(void)
    282 {
    283     UErrorCode status = U_ZERO_ERROR;
    284     static const char ss[] = "<a> <lot> <of> <English> <inside> <the> <markup> Un tr\\u00E8s petit peu de Fran\\u00E7ais. <to> <confuse> <the> <detector>";
    285     int32_t sLength = 0;
    286     UChar s[sizeof(ss)];
    287     int32_t byteLength = 0;
    288     char *bytes;
    289     UCharsetDetector *csd = ucsdet_open(&status);
    290     const UCharsetMatch *match;
    291     const char *lang, *name;
    292 
    293     sLength = u_unescape(ss, s, sizeof(ss));
    294     bytes = extractBytes(s, sLength, "ISO-8859-1", &byteLength);
    295 
    296     ucsdet_enableInputFilter(csd, TRUE);
    297 
    298     if (!ucsdet_isInputFilterEnabled(csd)) {
    299         log_err("ucsdet_enableInputFilter(csd, TRUE) did not enable input filter!\n");
    300     }
    301 
    302 
    303     ucsdet_setText(csd, bytes, byteLength, &status);
    304     match = ucsdet_detect(csd, &status);
    305 
    306     if (match == NULL) {
    307         log_err("Turning on the input filter resulted in no matches.\n");
    308         goto turn_off;
    309     }
    310 
    311     name = ucsdet_getName(match, &status);
    312 
    313     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
    314         log_err("Turning on the input filter resulted in %s rather than ISO-8859-1\n", name);
    315     } else {
    316         lang = ucsdet_getLanguage(match, &status);
    317 
    318         if (lang == NULL || strcmp(lang, "fr") != 0) {
    319             log_err("Input filter did not strip markup!\n");
    320         }
    321     }
    322 
    323 turn_off:
    324     ucsdet_enableInputFilter(csd, FALSE);
    325     ucsdet_setText(csd, bytes, byteLength, &status);
    326     match = ucsdet_detect(csd, &status);
    327 
    328     if (match == NULL) {
    329         log_err("Turning off the input filter resulted in no matches.\n");
    330         goto bail;
    331     }
    332 
    333     name = ucsdet_getName(match, &status);
    334 
    335     if (name == NULL || strcmp(name, "ISO-8859-1") != 0) {
    336         log_err("Turning off the input filter resulted in %s rather than ISO-8859-1\n", name);
    337     } else {
    338         lang = ucsdet_getLanguage(match, &status);
    339 
    340         if (lang == NULL || strcmp(lang, "en") != 0) {
    341             log_err("Unfiltered input did not detect as English!\n");
    342         }
    343     }
    344 
    345 bail:
    346     freeBytes(bytes);
    347     ucsdet_close(csd);
    348 }
    349 
    350 static void TestChaining(void) {
    351     UErrorCode status = U_USELESS_COLLATOR_ERROR;
    352 
    353     ucsdet_open(&status);
    354     ucsdet_setText(NULL, NULL, 0, &status);
    355     ucsdet_getName(NULL, &status);
    356     ucsdet_getConfidence(NULL, &status);
    357     ucsdet_getLanguage(NULL, &status);
    358     ucsdet_detect(NULL, &status);
    359     ucsdet_setDeclaredEncoding(NULL, NULL, 0, &status);
    360     ucsdet_detectAll(NULL, NULL, &status);
    361     ucsdet_getUChars(NULL, NULL, 0, &status);
    362     ucsdet_getUChars(NULL, NULL, 0, &status);
    363     ucsdet_close(NULL);
    364 
    365     /* All of this code should have done nothing. */
    366     if (status != U_USELESS_COLLATOR_ERROR) {
    367         log_err("Status got changed to %s\n", u_errorName(status));
    368     }
    369 }
    370 
    371 static void TestBufferOverflow(void) {
    372     UErrorCode status = U_ZERO_ERROR;
    373     static const char *testStrings[] = {
    374         "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b", /* A partial ISO-2022 shift state at the end */
    375         "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24", /* A partial ISO-2022 shift state at the end */
    376         "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28", /* A partial ISO-2022 shift state at the end */
    377         "\x80\x20\x54\x68\x69\x73\x20\x69\x73\x20\x45\x6E\x67\x6C\x69\x73\x68\x20\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end with a bad one at the start */
    378         "\x1b\x24\x28\x44", /* A complete ISO-2022 shift state at the end */
    379         "\xa1", /* Could be a single byte shift-jis at the end */
    380         "\x74\x68\xa1", /* Could be a single byte shift-jis at the end */
    381         "\x74\x68\x65\xa1" /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
    382     };
    383     static const char *testResults[] = {
    384         "windows-1252",
    385         "windows-1252",
    386         "windows-1252",
    387         "windows-1252",
    388         "ISO-2022-JP",
    389         NULL,
    390         NULL,
    391         "ISO-8859-1"
    392     };
    393     int32_t idx = 0;
    394     UCharsetDetector *csd = ucsdet_open(&status);
    395     const UCharsetMatch *match;
    396 
    397     ucsdet_setDeclaredEncoding(csd, "ISO-2022-JP", -1, &status);
    398 
    399     if (U_FAILURE(status)) {
    400         log_err("Couldn't open detector. %s\n", u_errorName(status));
    401         goto bail;
    402     }
    403 
    404     for (idx = 0; idx < UPRV_LENGTHOF(testStrings); idx++) {
    405         ucsdet_setText(csd, testStrings[idx], -1, &status);
    406         match = ucsdet_detect(csd, &status);
    407 
    408         if (match == NULL) {
    409             if (testResults[idx] != NULL) {
    410                 log_err("Unexpectedly got no results at index %d.\n", idx);
    411             }
    412             else {
    413                 log_verbose("Got no result as expected at index %d.\n", idx);
    414             }
    415             continue;
    416         }
    417 
    418         if (testResults[idx] == NULL || strcmp(ucsdet_getName(match, &status), testResults[idx]) != 0) {
    419             log_err("Unexpectedly got %s instead of %s at index %d with confidence %d.\n",
    420                 ucsdet_getName(match, &status), testResults[idx], idx, ucsdet_getConfidence(match, &status));
    421             goto bail;
    422         }
    423     }
    424 
    425 bail:
    426     ucsdet_close(csd);
    427 }
    428 
    429 static void TestIBM424(void)
    430 {
    431     UErrorCode status = U_ZERO_ERROR;
    432 
    433     static const UChar chars[] = {
    434             0x05D4, 0x05E4, 0x05E8, 0x05E7, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05D4, 0x05E6, 0x05D1, 0x05D0, 0x05D9, 0x0020, 0x05D4, 0x05E8,
    435             0x05D0, 0x05E9, 0x05D9, 0x002C, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x05D0, 0x05DC, 0x05D5, 0x05E3, 0x0020, 0x05D0, 0x05D1, 0x05D9,
    436             0x05D7, 0x05D9, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC, 0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x002C, 0x0020, 0x05D4, 0x05D5, 0x05E8,
    437             0x05D4, 0x0020, 0x05E2, 0x05DC, 0x0020, 0x05E4, 0x05EA, 0x05D9, 0x05D7, 0x05EA, 0x0020, 0x05D7, 0x05E7, 0x05D9, 0x05E8, 0x05EA,
    438             0x0020, 0x05DE, 0x05E6, 0x0022, 0x05D7, 0x0020, 0x05D1, 0x05E2, 0x05E7, 0x05D1, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D3, 0x05D5,
    439             0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC, 0x05D9, 0x0020, 0x05E6, 0x05D4, 0x0022, 0x05DC, 0x0020, 0x05DE,
    440             0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5, 0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4,
    441             0x0020, 0x05D1, 0x002B, 0x0020, 0x05E8, 0x05E6, 0x05D5, 0x05E2, 0x05EA, 0x0020, 0x05E2, 0x05D6, 0x05D4, 0x002E, 0x0020, 0x05DC,
    442             0x05D3, 0x05D1, 0x05E8, 0x05D9, 0x0020, 0x05D4, 0x05E4, 0x05E6, 0x0022, 0x05E8, 0x002C, 0x0020, 0x05DE, 0x05D4, 0x05E2, 0x05D3,
    443             0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0020, 0x05E2, 0x05D5, 0x05DC, 0x05D4, 0x0020, 0x05EA, 0x05DE, 0x05D5, 0x05E0, 0x05D4, 0x0020,
    444             0x05E9, 0x05DC, 0x0020, 0x0022, 0x05D4, 0x05EA, 0x05E0, 0x05D4, 0x05D2, 0x05D5, 0x05EA, 0x0020, 0x05E4, 0x05E1, 0x05D5, 0x05DC,
    445             0x05D4, 0x0020, 0x05DC, 0x05DB, 0x05D0, 0x05D5, 0x05E8, 0x05D4, 0x0020, 0x05E9, 0x05DC, 0x0020, 0x05D7, 0x05D9, 0x05D9, 0x05DC,
    446             0x05D9, 0x05DD, 0x0020, 0x05D1, 0x05DE, 0x05D4, 0x05DC, 0x05DA, 0x0020, 0x05DE, 0x05D1, 0x05E6, 0x05E2, 0x0020, 0x05E2, 0x05D5,
    447             0x05E4, 0x05E8, 0x05EA, 0x0020, 0x05D9, 0x05E6, 0x05D5, 0x05E7, 0x05D4, 0x0022, 0x002E, 0x0020, 0x05DE, 0x05E0, 0x05D3, 0x05DC,
    448             0x05D1, 0x05DC, 0x05D9, 0x05D8, 0x0020, 0x05E7, 0x05D9, 0x05D1, 0x05DC, 0x0020, 0x05D0, 0x05EA, 0x0020, 0x05D4, 0x05D7, 0x05DC,
    449             0x05D8, 0x05EA, 0x05D5, 0x0020, 0x05DC, 0x05D0, 0x05D7, 0x05E8, 0x0020, 0x05E9, 0x05E2, 0x05D9, 0x05D9, 0x05DF, 0x0020, 0x05D1,
    450             0x05EA, 0x05DE, 0x05DC, 0x05D9, 0x05DC, 0x0020, 0x05D4, 0x05E2, 0x05D3, 0x05D5, 0x05D9, 0x05D5, 0x05EA, 0x0000
    451     };
    452 
    453     static const UChar chars_reverse[] = {
    454             0x05EA, 0x05D5, 0x05D9, 0x05D5, 0x05D3, 0x05E2, 0x05D4, 0x0020, 0x05DC, 0x05D9, 0x05DC, 0x05DE, 0x05EA,
    455             0x05D1, 0x0020, 0x05DF, 0x05D9, 0x05D9, 0x05E2, 0x05E9, 0x0020, 0x05E8, 0x05D7, 0x05D0, 0x05DC, 0x0020, 0x05D5, 0x05EA, 0x05D8,
    456             0x05DC, 0x05D7, 0x05D4, 0x0020, 0x05EA, 0x05D0, 0x0020, 0x05DC, 0x05D1, 0x05D9, 0x05E7, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05D1,
    457             0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x002E, 0x0022, 0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4,
    458             0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE, 0x0020, 0x05DA, 0x05DC, 0x05D4, 0x05DE, 0x05D1, 0x0020, 0x05DD, 0x05D9,
    459             0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05DC, 0x05E9, 0x0020, 0x05D4, 0x05E8, 0x05D5, 0x05D0, 0x05DB, 0x05DC, 0x0020, 0x05D4,
    460             0x05DC, 0x05D5, 0x05E1, 0x05E4, 0x0020, 0x05EA, 0x05D5, 0x05D2, 0x05D4, 0x05E0, 0x05EA, 0x05D4, 0x0022, 0x0020, 0x05DC, 0x05E9,
    461             0x0020, 0x05D4, 0x05E0, 0x05D5, 0x05DE, 0x05EA, 0x0020, 0x05D4, 0x05DC, 0x05D5, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D9, 0x05D5,
    462             0x05D3, 0x05E2, 0x05D4, 0x05DE, 0x0020, 0x002C, 0x05E8, 0x0022, 0x05E6, 0x05E4, 0x05D4, 0x0020, 0x05D9, 0x05E8, 0x05D1, 0x05D3,
    463             0x05DC, 0x0020, 0x002E, 0x05D4, 0x05D6, 0x05E2, 0x0020, 0x05EA, 0x05E2, 0x05D5, 0x05E6, 0x05E8, 0x0020, 0x002B, 0x05D1, 0x0020,
    464             0x05D4, 0x05E7, 0x05D5, 0x05E6, 0x05D9, 0x0020, 0x05EA, 0x05E8, 0x05E4, 0x05D5, 0x05E2, 0x0020, 0x05E2, 0x05E6, 0x05D1, 0x05DE,
    465             0x05DE, 0x0020, 0x05DC, 0x0022, 0x05D4, 0x05E6, 0x0020, 0x05D9, 0x05DC, 0x05D9, 0x05D9, 0x05D7, 0x0020, 0x05EA, 0x05D5, 0x05D9,
    466             0x05D5, 0x05D3, 0x05E2, 0x0020, 0x05EA, 0x05D5, 0x05D1, 0x05E7, 0x05E2, 0x05D1, 0x0020, 0x05D7, 0x0022, 0x05E6, 0x05DE, 0x0020,
    467             0x05EA, 0x05E8, 0x05D9, 0x05E7, 0x05D7, 0x0020, 0x05EA, 0x05D7, 0x05D9, 0x05EA, 0x05E4, 0x0020, 0x05DC, 0x05E2, 0x0020, 0x05D4,
    468             0x05E8, 0x05D5, 0x05D4, 0x0020, 0x002C, 0x05D8, 0x05D9, 0x05DC, 0x05D1, 0x05DC, 0x05D3, 0x05E0, 0x05DE, 0x0020, 0x05D9, 0x05D7,
    469             0x05D9, 0x05D1, 0x05D0, 0x0020, 0x05E3, 0x05D5, 0x05DC, 0x05D0, 0x0020, 0x05EA, 0x05EA, 0x0020, 0x002C, 0x05D9, 0x05E9, 0x05D0,
    470             0x05E8, 0x05D4, 0x0020, 0x05D9, 0x05D0, 0x05D1, 0x05E6, 0x05D4, 0x0020, 0x05D8, 0x05D9, 0x05DC, 0x05E7, 0x05E8, 0x05E4, 0x05D4,
    471             0x0000
    472     };
    473 
    474     int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
    475 
    476     char *bytes = extractBytes(chars, cLength, "IBM424", &bLength);
    477     char *bytes_r = extractBytes(chars_reverse, crLength, "IBM424", &brLength);
    478 
    479     UCharsetDetector *csd = ucsdet_open(&status);
    480     const UCharsetMatch *match;
    481     const char *name;
    482 
    483     ucsdet_setText(csd, bytes, bLength, &status);
    484     match = ucsdet_detect(csd, &status);
    485 
    486     if (match == NULL) {
    487         log_err("Encoding detection failure for IBM424_rtl: got no matches.\n");
    488         goto bail;
    489     }
    490 
    491     name  = ucsdet_getName(match, &status);
    492     if (strcmp(name, "IBM424_rtl") != 0) {
    493         log_data_err("Encoding detection failure for IBM424_rtl: got %s. (Are you missing data?)\n", name);
    494     }
    495 
    496     ucsdet_setText(csd, bytes_r, brLength, &status);
    497     match = ucsdet_detect(csd, &status);
    498 
    499     if (match == NULL) {
    500         log_err("Encoding detection failure for IBM424_ltr: got no matches.\n");
    501         goto bail;
    502     }
    503 
    504     name  = ucsdet_getName(match, &status);
    505     if (strcmp(name, "IBM424_ltr") != 0) {
    506         log_data_err("Encoding detection failure for IBM424_ltr: got %s. (Are you missing data?)\n", name);
    507     }
    508 
    509 bail:
    510     freeBytes(bytes);
    511     freeBytes(bytes_r);
    512     ucsdet_close(csd);
    513 }
    514 
    515 static void TestIBM420(void)
    516 {
    517     UErrorCode status = U_ZERO_ERROR;
    518 
    519     static const UChar chars[] = {
    520         0x0648, 0x064F, 0x0636, 0x0639, 0x062A, 0x0020, 0x0648, 0x0646, 0x064F, 0x0641, 0x0630, 0x062A, 0x0020, 0x0628, 0x0631, 0x0627,
    521         0x0645, 0x062C, 0x0020, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020, 0x0639, 0x062F, 0x064A, 0x062F, 0x0629, 0x0020, 0x0641,
    522         0x064A, 0x0020, 0x0645, 0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0627, 0x0644, 0x062A, 0x0623, 0x0645, 0x064A, 0x0646, 0x0020,
    523         0x0627, 0x0644, 0x0648, 0x0637, 0x0646, 0x064A, 0x002C, 0x0020, 0x0645, 0x0639, 0x0020, 0x0645, 0x0644, 0x0627, 0x0626, 0x0645,
    524         0x062A, 0x0647, 0x0627, 0x0020, 0x062F, 0x0627, 0x0626, 0x0645, 0x0627, 0x064B, 0x0020, 0x0644, 0x0644, 0x0627, 0x062D, 0x062A,
    525         0x064A, 0x0627, 0x062C, 0x0627, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645, 0x062A, 0x063A, 0x064A, 0x0631, 0x0629, 0x0020, 0x0644,
    526         0x0644, 0x0645, 0x062C, 0x062A, 0x0645, 0x0639, 0x0020, 0x0648, 0x0644, 0x0644, 0x062F, 0x0648, 0x0644, 0x0629, 0x002E, 0x0020,
    527         0x062A, 0x0648, 0x0633, 0x0639, 0x062A, 0x0020, 0x0648, 0x062A, 0x0637, 0x0648, 0x0631, 0x062A, 0x0020, 0x0627, 0x0644, 0x0645,
    528         0x0624, 0x0633, 0x0633, 0x0629, 0x0020, 0x0628, 0x0647, 0x062F, 0x0641, 0x0020, 0x0636, 0x0645, 0x0627, 0x0646, 0x0020, 0x0634,
    529         0x0628, 0x0643, 0x0629, 0x0020, 0x0623, 0x0645, 0x0627, 0x0646, 0x0020, 0x0644, 0x0633, 0x0643, 0x0627, 0x0646, 0x0020, 0x062F,
    530         0x0648, 0x0644, 0x0629, 0x0020, 0x0627, 0x0633, 0x0631, 0x0627, 0x0626, 0x064A, 0x0644, 0x0020, 0x0628, 0x0648, 0x062C, 0x0647,
    531         0x0020, 0x0627, 0x0644, 0x0645, 0x062E, 0x0627, 0x0637, 0x0631, 0x0020, 0x0627, 0x0644, 0x0627, 0x0642, 0x062A, 0x0635, 0x0627,
    532         0x062F, 0x064A, 0x0629, 0x0020, 0x0648, 0x0627, 0x0644, 0x0627, 0x062C, 0x062A, 0x0645, 0x0627, 0x0639, 0x064A, 0x0629, 0x002E,
    533         0x0000
    534     };
    535     static const UChar chars_reverse[] = {
    536         0x002E, 0x0629, 0x064A, 0x0639, 0x0627, 0x0645, 0x062A, 0x062C, 0x0627, 0x0644, 0x0627, 0x0648, 0x0020, 0x0629, 0x064A, 0x062F,
    537         0x0627, 0x0635, 0x062A, 0x0642, 0x0627, 0x0644, 0x0627, 0x0020, 0x0631, 0x0637, 0x0627, 0x062E, 0x0645, 0x0644, 0x0627, 0x0020,
    538         0x0647, 0x062C, 0x0648, 0x0628, 0x0020, 0x0644, 0x064A, 0x0626, 0x0627, 0x0631, 0x0633, 0x0627, 0x0020, 0x0629, 0x0644, 0x0648,
    539         0x062F, 0x0020, 0x0646, 0x0627, 0x0643, 0x0633, 0x0644, 0x0020, 0x0646, 0x0627, 0x0645, 0x0623, 0x0020, 0x0629, 0x0643, 0x0628,
    540         0x0634, 0x0020, 0x0646, 0x0627, 0x0645, 0x0636, 0x0020, 0x0641, 0x062F, 0x0647, 0x0628, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624,
    541         0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0631, 0x0648, 0x0637, 0x062A, 0x0648, 0x0020, 0x062A, 0x0639, 0x0633, 0x0648, 0x062A,
    542         0x0020, 0x002E, 0x0629, 0x0644, 0x0648, 0x062F, 0x0644, 0x0644, 0x0648, 0x0020, 0x0639, 0x0645, 0x062A, 0x062C, 0x0645, 0x0644,
    543         0x0644, 0x0020, 0x0629, 0x0631, 0x064A, 0x063A, 0x062A, 0x0645, 0x0644, 0x0627, 0x0020, 0x062A, 0x0627, 0x062C, 0x0627, 0x064A,
    544         0x062A, 0x062D, 0x0627, 0x0644, 0x0644, 0x0020, 0x064B, 0x0627, 0x0645, 0x0626, 0x0627, 0x062F, 0x0020, 0x0627, 0x0647, 0x062A,
    545         0x0645, 0x0626, 0x0627, 0x0644, 0x0645, 0x0020, 0x0639, 0x0645, 0x0020, 0x002C, 0x064A, 0x0646, 0x0637, 0x0648, 0x0644, 0x0627,
    546         0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0644, 0x0627, 0x0020, 0x0629, 0x0633, 0x0633, 0x0624, 0x0645, 0x0020, 0x064A,
    547         0x0641, 0x0020, 0x0629, 0x062F, 0x064A, 0x062F, 0x0639, 0x0020, 0x0646, 0x064A, 0x0645, 0x0623, 0x062A, 0x0020, 0x062C, 0x0645,
    548         0x0627, 0x0631, 0x0628, 0x0020, 0x062A, 0x0630, 0x0641, 0x064F, 0x0646, 0x0648, 0x0020, 0x062A, 0x0639, 0x0636, 0x064F, 0x0648,
    549         0x0000,
    550     };
    551 
    552     int32_t bLength = 0, brLength = 0, cLength = UPRV_LENGTHOF(chars), crLength = UPRV_LENGTHOF(chars_reverse);
    553 
    554     char *bytes = extractBytes(chars, cLength, "IBM420", &bLength);
    555     char *bytes_r = extractBytes(chars_reverse, crLength, "IBM420", &brLength);
    556 
    557     UCharsetDetector *csd = ucsdet_open(&status);
    558     const UCharsetMatch *match;
    559     const char *name;
    560 
    561     ucsdet_setText(csd, bytes, bLength, &status);
    562     match = ucsdet_detect(csd, &status);
    563 
    564     if (match == NULL) {
    565         log_err("Encoding detection failure for IBM420_rtl: got no matches.\n");
    566         goto bail;
    567     }
    568 
    569     name  = ucsdet_getName(match, &status);
    570     if (strcmp(name, "IBM420_rtl") != 0) {
    571         log_data_err("Encoding detection failure for IBM420_rtl: got %s. (Are you missing data?)\n", name);
    572     }
    573 
    574     ucsdet_setText(csd, bytes_r, brLength, &status);
    575     match = ucsdet_detect(csd, &status);
    576 
    577     if (match == NULL) {
    578         log_err("Encoding detection failure for IBM420_ltr: got no matches.\n");
    579         goto bail;
    580     }
    581 
    582     name  = ucsdet_getName(match, &status);
    583     if (strcmp(name, "IBM420_ltr") != 0) {
    584         log_data_err("Encoding detection failure for IBM420_ltr: got %s. (Are you missing data?)\n", name);
    585     }
    586 
    587 bail:
    588     freeBytes(bytes);
    589     freeBytes(bytes_r);
    590     ucsdet_close(csd);
    591 }
    592