Home | History | Annotate | Download | only in intltest
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ************************************************************************
      5 * Copyright (c) 1997-2016, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 ************************************************************************
      8 */
      9 
     10 #include "unicode/utypes.h"
     11 
     12 #if !UCONFIG_NO_NORMALIZATION
     13 
     14 #include <string>
     15 #include "unicode/bytestream.h"
     16 #include "unicode/edits.h"
     17 #include "unicode/uchar.h"
     18 #include "unicode/normalizer2.h"
     19 #include "unicode/normlzr.h"
     20 #include "unicode/uniset.h"
     21 #include "unicode/putil.h"
     22 #include "cmemory.h"
     23 #include "cstring.h"
     24 #include "filestrm.h"
     25 #include "normconf.h"
     26 #include "uassert.h"
     27 #include <stdio.h>
     28 
// Test-framework entry point: lists this class's test methods (TestConformance,
// TestConformance32, TestCase6) through the TESTCASE_AUTO macros.
// NOTE(review): the macros are defined by the test framework and are not visible
// here; presumably they use index/exec/name to select and run one test - confirm.
void NormalizerConformanceTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* /*par*/) {
    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(TestConformance);
    TESTCASE_AUTO(TestConformance32);
    TESTCASE_AUTO(TestCase6);
    TESTCASE_AUTO_END;
}
     36 
// Number of ';'-separated columns (c1..c5) parsed from each conformance test line.
#define FIELD_COUNT 5
     38 
     39 NormalizerConformanceTest::NormalizerConformanceTest() :
     40         normalizer(UnicodeString(), UNORM_NFC) {
     41     UErrorCode errorCode = U_ZERO_ERROR;
     42     nfc = Normalizer2::getNFCInstance(errorCode);
     43     nfd = Normalizer2::getNFDInstance(errorCode);
     44     nfkc = Normalizer2::getNFKCInstance(errorCode);
     45     nfkd = Normalizer2::getNFKDInstance(errorCode);
     46     assertSuccess("", errorCode, true, __FILE__, __LINE__);
     47 }
     48 
     49 NormalizerConformanceTest::~NormalizerConformanceTest() {}
     50 
     51 // more interesting conformance test cases, not in the unicode.org NormalizationTest.txt
     52 static const char *moreCases[]={
     53     // Markus 2001aug30
     54     "0061 0332 0308;00E4 0332;0061 0332 0308;00E4 0332;0061 0332 0308; # Markus 0",
     55 
     56     // Markus 2001oct26 - test edge case for iteration: U+0f73.cc==0 but decomposition.lead.cc==129
     57     "0061 0301 0F73;00E1 0F71 0F72;0061 0F71 0F72 0301;00E1 0F71 0F72;0061 0F71 0F72 0301; # Markus 1"
     58 };
     59 
     60 void NormalizerConformanceTest::compare(const UnicodeString& s1, const UnicodeString& s2){
     61     UErrorCode status=U_ZERO_ERROR;
     62      // TODO: Re-enable this tests after UTC fixes UAX 21
     63     if(s1.indexOf((UChar32)0x0345)>=0)return;
     64     if(Normalizer::compare(s1,s2,U_FOLD_CASE_DEFAULT,status)!=0){
     65         errln("Normalizer::compare() failed for s1: " + prettify(s1) + " s2: " +prettify(s2));
     66     }
     67 }
     68 
     69 FileStream *
     70 NormalizerConformanceTest::openNormalizationTestFile(const char *filename) {
     71     char unidataPath[2000];
     72     const char *folder;
     73     FileStream *input;
     74     UErrorCode errorCode;
     75 
     76     // look inside ICU_DATA first
     77     folder=pathToDataDirectory();
     78     if(folder!=NULL) {
     79         strcpy(unidataPath, folder);
     80         strcat(unidataPath, "unidata" U_FILE_SEP_STRING);
     81         strcat(unidataPath, filename);
     82         input=T_FileStream_open(unidataPath, "rb");
     83         if(input!=NULL) {
     84             return input;
     85         }
     86     }
     87 
     88     // find icu/source/data/unidata relative to the test data
     89     errorCode=U_ZERO_ERROR;
     90     folder=loadTestData(errorCode);
     91     if(U_SUCCESS(errorCode)) {
     92         strcpy(unidataPath, folder);
     93         strcat(unidataPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".."
     94                      U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".."
     95                      U_FILE_SEP_STRING "data" U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
     96         strcat(unidataPath, filename);
     97         input=T_FileStream_open(unidataPath, "rb");
     98         if(input!=NULL) {
     99             return input;
    100         }
    101     }
    102 
    103     // look in icu/source/test/testdata/out/build
    104     errorCode=U_ZERO_ERROR;
    105     folder=loadTestData(errorCode);
    106     if(U_SUCCESS(errorCode)) {
    107         strcpy(unidataPath, folder);
    108         strcat(unidataPath, U_FILE_SEP_STRING);
    109         strcat(unidataPath, filename);
    110         input=T_FileStream_open(unidataPath, "rb");
    111         if(input!=NULL) {
    112             return input;
    113         }
    114     }
    115 
    116     // look in icu/source/test/testdata
    117     errorCode=U_ZERO_ERROR;
    118     folder=loadTestData(errorCode);
    119     if(U_SUCCESS(errorCode)) {
    120         strcpy(unidataPath, folder);
    121         strcat(unidataPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING);
    122         strcat(unidataPath, filename);
    123         input=T_FileStream_open(unidataPath, "rb");
    124         if(input!=NULL) {
    125             return input;
    126         }
    127     }
    128 
    129     // find icu/source/data/unidata relative to U_TOPSRCDIR
    130 #if defined(U_TOPSRCDIR)
    131     strcpy(unidataPath, U_TOPSRCDIR U_FILE_SEP_STRING "data" U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING);
    132     strcat(unidataPath, filename);
    133     input=T_FileStream_open(unidataPath, "rb");
    134     if(input!=NULL) {
    135         return input;
    136     }
    137 
    138     strcpy(unidataPath, U_TOPSRCDIR U_FILE_SEP_STRING "test" U_FILE_SEP_STRING "testdata" U_FILE_SEP_STRING);
    139     strcat(unidataPath, filename);
    140     input=T_FileStream_open(unidataPath, "rb");
    141     if(input!=NULL) {
    142         return input;
    143     }
    144 #endif
    145 
    146     dataerrln("Failed to open %s", filename);
    147     return NULL;
    148 }
    149 
    150 /**
    151  * Test the conformance of Normalizer to
    152  * http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
    153  */
    154 void NormalizerConformanceTest::TestConformance() {
    155     TestConformance(openNormalizationTestFile("NormalizationTest.txt"), 0);
    156 }
    157 
    158 void NormalizerConformanceTest::TestConformance32() {
    159     TestConformance(openNormalizationTestFile("NormalizationTest-3.2.0.txt"), UNORM_UNICODE_3_2);
    160 }
    161 
    162 void NormalizerConformanceTest::TestConformance(FileStream *input, int32_t options) {
    163     enum { BUF_SIZE = 1024 };
    164     char lineBuf[BUF_SIZE];
    165     UnicodeString fields[FIELD_COUNT];
    166     UErrorCode status = U_ZERO_ERROR;
    167     int32_t passCount = 0;
    168     int32_t failCount = 0;
    169     UChar32 c;
    170 
    171     if(input==NULL) {
    172         return;
    173     }
    174 
    175     // UnicodeSet for all code points that are not mentioned in NormalizationTest.txt
    176     UnicodeSet other(0, 0x10ffff);
    177 
    178     int32_t count, countMoreCases = UPRV_LENGTHOF(moreCases);
    179     for (count = 1;;++count) {
    180         if (!T_FileStream_eof(input)) {
    181             T_FileStream_readLine(input, lineBuf, (int32_t)sizeof(lineBuf));
    182         } else {
    183             // once NormalizationTest.txt is finished, use moreCases[]
    184             if(count > countMoreCases) {
    185                 count = 0;
    186             } else if(count == countMoreCases) {
    187                 // all done
    188                 break;
    189             }
    190             uprv_strcpy(lineBuf, moreCases[count]);
    191         }
    192         if (lineBuf[0] == 0 || lineBuf[0] == '\n' || lineBuf[0] == '\r') continue;
    193 
    194         // Expect 5 columns of this format:
    195         // 1E0C;1E0C;0044 0323;1E0C;0044 0323; # <comments>
    196 
    197         // Parse out the comment.
    198         if (lineBuf[0] == '#') continue;
    199 
    200         // Read separator lines starting with '@'
    201         if (lineBuf[0] == '@') {
    202             logln(lineBuf);
    203             continue;
    204         }
    205 
    206         // Parse out the fields
    207         if (!hexsplit(lineBuf, ';', fields, FIELD_COUNT)) {
    208             errln((UnicodeString)"Unable to parse line " + count);
    209             break; // Syntax error
    210         }
    211 
    212         // Remove a single code point from the "other" UnicodeSet
    213         if(fields[0].length()==fields[0].moveIndex32(0, 1)) {
    214             c=fields[0].char32At(0);
    215             if(0xac20<=c && c<=0xd73f && quick) {
    216                 // not an exhaustive test run: skip most Hangul syllables
    217                 if(c==0xac20) {
    218                     other.remove(0xac20, 0xd73f);
    219                 }
    220                 continue;
    221             }
    222             other.remove(c);
    223         }
    224 
    225         if (checkConformance(fields, lineBuf, options, status)) {
    226             ++passCount;
    227         } else {
    228             ++failCount;
    229             if(status == U_FILE_ACCESS_ERROR) {
    230               dataerrln("Something is wrong with the normalizer, skipping the rest of the test.");
    231               break;
    232             }
    233         }
    234         if ((count % 1000) == 0) {
    235             logln("Line %d", count);
    236         }
    237     }
    238 
    239     T_FileStream_close(input);
    240 
    241     /*
    242      * Test that all characters that are not mentioned
    243      * as single code points in column 1
    244      * do not change under any normalization.
    245      */
    246 
    247     // remove U+ffff because that is the end-of-iteration sentinel value
    248     other.remove(0xffff);
    249 
    250     for(c=0; c<=0x10ffff; quick ? c+=113 : ++c) {
    251         if(0x30000<=c && c<0xe0000) {
    252             c=0xe0000;
    253         }
    254         if(!other.contains(c)) {
    255             continue;
    256         }
    257 
    258         fields[0]=fields[1]=fields[2]=fields[3]=fields[4].setTo(c);
    259         sprintf(lineBuf, "not mentioned code point U+%04lx", (long)c);
    260 
    261         if (checkConformance(fields, lineBuf, options, status)) {
    262             ++passCount;
    263         } else {
    264             ++failCount;
    265             if(status == U_FILE_ACCESS_ERROR) {
    266               dataerrln("Something is wrong with the normalizer, skipping the rest of the test.: %s", u_errorName(status));
    267               break;
    268             }
    269         }
    270         if ((c % 0x1000) == 0) {
    271             logln("Code point U+%04lx", c);
    272         }
    273     }
    274 
    275     if (failCount != 0) {
    276         dataerrln((UnicodeString)"Total: " + failCount + " lines/code points failed, " +
    277               passCount + " lines/code points passed");
    278     } else {
    279         logln((UnicodeString)"Total: " + passCount + " lines/code points passed");
    280     }
    281 }
    282 
    283 namespace {
    284 
    285 UBool isNormalizedUTF8(const Normalizer2 &norm2, const UnicodeString &s, UErrorCode &errorCode) {
    286     std::string s8;
    287     return norm2.isNormalizedUTF8(s.toUTF8String(s8), errorCode);
    288 }
    289 
    290 }  // namespace
    291 
    292 /**
    293  * Verify the conformance of the given line of the Unicode
    294  * normalization (UTR 15) test suite file.  For each line,
    295  * there are five columns, corresponding to field[0]..field[4].
    296  *
    297  * The following invariants must be true for all conformant implementations
    298  *  c2 == NFC(c1) == NFC(c2) == NFC(c3)
    299  *  c3 == NFD(c1) == NFD(c2) == NFD(c3)
    300  *  c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
    301  *  c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
    302  *
    303  * @param field the 5 columns
    304  * @param line the source line from the test suite file
    305  * @return true if the test passes
    306  */
    307 UBool NormalizerConformanceTest::checkConformance(const UnicodeString* field,
    308                                                   const char *line,
    309                                                   int32_t options,
    310                                                   UErrorCode &status) {
    311     UBool pass = TRUE, result;
    312     UnicodeString out, fcd;
    313     int32_t fieldNum;
    314 
    315     for (int32_t i=0; i<FIELD_COUNT; ++i) {
    316         fieldNum = i+1;
    317         if (i<3) {
    318             pass &= checkNorm(UNORM_NFC, options, nfc, field[i], field[1], fieldNum);
    319             pass &= checkNorm(UNORM_NFD, options, nfd, field[i], field[2], fieldNum);
    320         }
    321         pass &= checkNorm(UNORM_NFKC, options, nfkc, field[i], field[3], fieldNum);
    322         pass &= checkNorm(UNORM_NFKD, options, nfkd, field[i], field[4], fieldNum);
    323     }
    324     compare(field[1],field[2]);
    325     compare(field[0],field[1]);
    326     // test quick checks
    327     if(UNORM_NO == Normalizer::quickCheck(field[1], UNORM_NFC, options, status)) {
    328         errln("Normalizer error: quickCheck(NFC(s), UNORM_NFC) is UNORM_NO");
    329         pass = FALSE;
    330     }
    331     if(UNORM_NO == Normalizer::quickCheck(field[2], UNORM_NFD, options, status)) {
    332         errln("Normalizer error: quickCheck(NFD(s), UNORM_NFD) is UNORM_NO");
    333         pass = FALSE;
    334     }
    335     if(UNORM_NO == Normalizer::quickCheck(field[3], UNORM_NFKC, options, status)) {
    336         errln("Normalizer error: quickCheck(NFKC(s), UNORM_NFKC) is UNORM_NO");
    337         pass = FALSE;
    338     }
    339     if(UNORM_NO == Normalizer::quickCheck(field[4], UNORM_NFKD, options, status)) {
    340         errln("Normalizer error: quickCheck(NFKD(s), UNORM_NFKD) is UNORM_NO");
    341         pass = FALSE;
    342     }
    343 
    344     // branch on options==0 for better code coverage
    345     if(options==0) {
    346         result = Normalizer::isNormalized(field[1], UNORM_NFC, status);
    347     } else {
    348         result = Normalizer::isNormalized(field[1], UNORM_NFC, options, status);
    349     }
    350     if(!result) {
    351         dataerrln("Normalizer error: isNormalized(NFC(s), UNORM_NFC) is FALSE");
    352         pass = FALSE;
    353     }
    354     if(options==0 && !isNormalizedUTF8(*nfc, field[1], status)) {
    355         dataerrln("Normalizer error: nfc.isNormalizedUTF8(NFC(s)) is FALSE");
    356         pass = FALSE;
    357     }
    358     if(field[0]!=field[1]) {
    359         if(Normalizer::isNormalized(field[0], UNORM_NFC, options, status)) {
    360             errln("Normalizer error: isNormalized(s, UNORM_NFC) is TRUE");
    361             pass = FALSE;
    362         }
    363         if(isNormalizedUTF8(*nfc, field[0], status)) {
    364             errln("Normalizer error: nfc.isNormalizedUTF8(s) is TRUE");
    365             pass = FALSE;
    366         }
    367     }
    368     if(!Normalizer::isNormalized(field[3], UNORM_NFKC, options, status)) {
    369         dataerrln("Normalizer error: isNormalized(NFKC(s), UNORM_NFKC) is FALSE");
    370         pass = FALSE;
    371     } else {
    372         if(options==0 && !isNormalizedUTF8(*nfkc, field[3], status)) {
    373             dataerrln("Normalizer error: nfkc.isNormalizedUTF8(NFKC(s)) is FALSE");
    374             pass = FALSE;
    375         }
    376         if(field[0]!=field[3]) {
    377             if(Normalizer::isNormalized(field[0], UNORM_NFKC, options, status)) {
    378                 errln("Normalizer error: isNormalized(s, UNORM_NFKC) is TRUE");
    379                 pass = FALSE;
    380             }
    381             if(options==0 && isNormalizedUTF8(*nfkc, field[0], status)) {
    382                 errln("Normalizer error: nfkc.isNormalizedUTF8(s) is TRUE");
    383                 pass = FALSE;
    384             }
    385         }
    386     }
    387 
    388     // test FCD quick check and "makeFCD"
    389     Normalizer::normalize(field[0], UNORM_FCD, options, fcd, status);
    390     if(UNORM_NO == Normalizer::quickCheck(fcd, UNORM_FCD, options, status)) {
    391         errln("Normalizer error: quickCheck(FCD(s), UNORM_FCD) is UNORM_NO");
    392         pass = FALSE;
    393     }
    394     if(UNORM_NO == Normalizer::quickCheck(field[2], UNORM_FCD, options, status)) {
    395         errln("Normalizer error: quickCheck(NFD(s), UNORM_FCD) is UNORM_NO");
    396         pass = FALSE;
    397     }
    398     if(UNORM_NO == Normalizer::quickCheck(field[4], UNORM_FCD, options, status)) {
    399         errln("Normalizer error: quickCheck(NFKD(s), UNORM_FCD) is UNORM_NO");
    400         pass = FALSE;
    401     }
    402 
    403     Normalizer::normalize(fcd, UNORM_NFD, options, out, status);
    404     if(out != field[2]) {
    405         dataerrln("Normalizer error: NFD(FCD(s))!=NFD(s)");
    406         pass = FALSE;
    407     }
    408 
    409     if (U_FAILURE(status)) {
    410         dataerrln("Normalizer::normalize returned error status: %s", u_errorName(status));
    411         pass = FALSE;
    412     }
    413 
    414     if(field[0]!=field[2]) {
    415         // two strings that are canonically equivalent must test
    416         // equal under a canonical caseless match
    417         // see UAX #21 Case Mappings and Jitterbug 2021 and
    418         // Unicode Technical Committee meeting consensus 92-C31
    419         int32_t rc;
    420 
    421         status=U_ZERO_ERROR;
    422         rc=Normalizer::compare(field[0], field[2], (options<<UNORM_COMPARE_NORM_OPTIONS_SHIFT)|U_COMPARE_IGNORE_CASE, status);
    423         if(U_FAILURE(status)) {
    424             dataerrln("Normalizer::compare(case-insensitive) sets %s", u_errorName(status));
    425             pass=FALSE;
    426         } else if(rc!=0) {
    427             errln("Normalizer::compare(original, NFD, case-insensitive) returned %d instead of 0 for equal", rc);
    428             pass=FALSE;
    429         }
    430     }
    431 
    432     if (!pass) {
    433         dataerrln("FAIL: %s", line);
    434     }
    435     return pass;
    436 }
    437 
// Short names of the normalization forms, indexed by UNormalizationMode value.
// checkNorm() looks the name up with kModeStrings[mode] for its failure messages.
static const char *const kModeStrings[UNORM_MODE_COUNT] = {
    "?", "none", "D", "KD", "C", "KC", "FCD"
};
    441 
    442 static const char *const kMessages[UNORM_MODE_COUNT] = {
    443     "?!=?", "?!=?", "c3!=D(c%d)", "c5!=KC(c%d)", "c2!=C(c%d)", "c4!=KC(c%d)", "FCD"
    444 };
    445 
    446 UBool NormalizerConformanceTest::checkNorm(UNormalizationMode mode, int32_t options,
    447                                            const Normalizer2 *norm2,
    448                                            const UnicodeString &s, const UnicodeString &exp,
    449                                            int32_t field) {
    450     const char *modeString = kModeStrings[mode];
    451     char msg[20];
    452     snprintf(msg, sizeof(msg), kMessages[mode], field);
    453     UnicodeString out;
    454     UErrorCode errorCode = U_ZERO_ERROR;
    455     Normalizer::normalize(s, mode, options, out, errorCode);
    456     if (U_FAILURE(errorCode)) {
    457         dataerrln("Error running normalize UNORM_NF%s: %s", modeString, u_errorName(errorCode));
    458         return FALSE;
    459     }
    460     if (!assertEqual(modeString, "", s, out, exp, msg)) {
    461         return FALSE;
    462     }
    463 
    464     iterativeNorm(s, mode, options, out, +1);
    465     if (!assertEqual(modeString, "(+1)", s, out, exp, msg)) {
    466         return FALSE;
    467     }
    468 
    469     iterativeNorm(s, mode, options, out, -1);
    470     if (!assertEqual(modeString, "(-1)", s, out, exp, msg)) {
    471         return FALSE;
    472     }
    473 
    474     if (norm2 == nullptr || options != 0) {
    475         return TRUE;
    476     }
    477 
    478     std::string s8;
    479     s.toUTF8String(s8);
    480     std::string exp8;
    481     exp.toUTF8String(exp8);
    482     std::string out8;
    483     Edits edits;
    484     Edits *editsPtr = (mode == UNORM_NFC || mode == UNORM_NFKC) ? &edits : nullptr;
    485     StringByteSink<std::string> sink(&out8, exp8.length());
    486     norm2->normalizeUTF8(0, s8, sink, editsPtr, errorCode);
    487     if (U_FAILURE(errorCode)) {
    488         errln("Normalizer2.%s.normalizeUTF8(%s) failed: %s",
    489               modeString, s8.c_str(), u_errorName(errorCode));
    490         return FALSE;
    491     }
    492     if (out8 != exp8) {
    493         errln("Normalizer2.%s.normalizeUTF8(%s)=%s != %s",
    494               modeString, s8.c_str(), out8.c_str(), exp8.c_str());
    495         return FALSE;
    496     }
    497     if (editsPtr == nullptr) {
    498         return TRUE;
    499     }
    500 
    501     // Do the Edits cover the entire input & output?
    502     UBool pass = TRUE;
    503     pass &= assertEquals("edits.hasChanges()", (UBool)(s8 != out8), edits.hasChanges());
    504     pass &= assertEquals("edits.lengthDelta()",
    505                          (int32_t)(out8.length() - s8.length()), edits.lengthDelta());
    506     Edits::Iterator iter = edits.getCoarseIterator();
    507     while (iter.next(errorCode)) {}
    508     pass &= assertEquals("edits source length", s8.length(), iter.sourceIndex());
    509     pass &= assertEquals("edits destination length", out8.length(), iter.destinationIndex());
    510     return pass;
    511 }
    512 
    513 /**
    514  * Do a normalization using the iterative API in the given direction.
    515  * @param dir either +1 or -1
    516  */
    517 void NormalizerConformanceTest::iterativeNorm(const UnicodeString& str,
    518                                               UNormalizationMode mode, int32_t options,
    519                                               UnicodeString& result,
    520                                               int8_t dir) {
    521     UErrorCode status = U_ZERO_ERROR;
    522     normalizer.setText(str, status);
    523     normalizer.setMode(mode);
    524     normalizer.setOption(-1, 0);        // reset all options
    525     normalizer.setOption(options, 1);   // set desired options
    526     result.truncate(0);
    527     if (U_FAILURE(status)) {
    528         return;
    529     }
    530     UChar32 ch;
    531     if (dir > 0) {
    532         for (ch = normalizer.first(); ch != Normalizer::DONE;
    533              ch = normalizer.next()) {
    534             result.append(ch);
    535         }
    536     } else {
    537         for (ch = normalizer.last(); ch != Normalizer::DONE;
    538              ch = normalizer.previous()) {
    539             result.insert(0, ch);
    540         }
    541     }
    542 }
    543 
    544 UBool NormalizerConformanceTest::assertEqual(const char *op, const char *op2,
    545                                              const UnicodeString& s,
    546                                              const UnicodeString& got,
    547                                              const UnicodeString& exp,
    548                                              const char *msg) {
    549     if (exp == got)
    550         return TRUE;
    551 
    552     char *sChars, *gotChars, *expChars;
    553     UnicodeString sPretty(prettify(s));
    554     UnicodeString gotPretty(prettify(got));
    555     UnicodeString expPretty(prettify(exp));
    556 
    557     sChars = new char[sPretty.length() + 1];
    558     gotChars = new char[gotPretty.length() + 1];
    559     expChars = new char[expPretty.length() + 1];
    560 
    561     sPretty.extract(0, sPretty.length(), sChars, sPretty.length() + 1);
    562     sChars[sPretty.length()] = 0;
    563     gotPretty.extract(0, gotPretty.length(), gotChars, gotPretty.length() + 1);
    564     gotChars[gotPretty.length()] = 0;
    565     expPretty.extract(0, expPretty.length(), expChars, expPretty.length() + 1);
    566     expChars[expPretty.length()] = 0;
    567 
    568     errln("    %s: %s%s(%s)=%s, exp. %s", msg, op, op2, sChars, gotChars, expChars);
    569 
    570     delete []sChars;
    571     delete []gotChars;
    572     delete []expChars;
    573     return FALSE;
    574 }
    575 
    576 /**
    577  * Split a string into pieces based on the given delimiter
    578  * character.  Then, parse the resultant fields from hex into
    579  * characters.  That is, "0040 0400;0C00;0899" -> new String[] {
    580  * "\u0040\u0400", "\u0C00", "\u0899" }.  The output is assumed to
    581  * be of the proper length already, and exactly output.length
    582  * fields are parsed.  If there are too few an exception is
    583  * thrown.  If there are too many the extras are ignored.
    584  *
    585  * @return FALSE upon failure
    586  */
    587 UBool NormalizerConformanceTest::hexsplit(const char *s, char delimiter,
    588                                           UnicodeString output[], int32_t outputLength) {
    589     const char *t = s;
    590     char *end = NULL;
    591     UChar32 c;
    592     int32_t i;
    593     for (i=0; i<outputLength; ++i) {
    594         // skip whitespace
    595         while(*t == ' ' || *t == '\t') {
    596             ++t;
    597         }
    598 
    599         // read a sequence of code points
    600         output[i].remove();
    601         for(;;) {
    602             c = (UChar32)uprv_strtoul(t, &end, 16);
    603 
    604             if( (char *)t == end ||
    605                 (uint32_t)c > 0x10ffff ||
    606                 (*end != ' ' && *end != '\t' && *end != delimiter)
    607             ) {
    608                 errln(UnicodeString("Bad field ", "") + (i + 1) + " in " + UnicodeString(s, ""));
    609                 return FALSE;
    610             }
    611 
    612             output[i].append(c);
    613 
    614             t = (const char *)end;
    615 
    616             // skip whitespace
    617             while(*t == ' ' || *t == '\t') {
    618                 ++t;
    619             }
    620 
    621             if(*t == delimiter) {
    622                 ++t;
    623                 break;
    624             }
    625             if(*t == 0) {
    626                 if((i + 1) == outputLength) {
    627                     return TRUE;
    628                 } else {
    629                     errln(UnicodeString("Missing field(s) in ", "") + s + " only " + (i + 1) + " out of " + outputLength);
    630                     return FALSE;
    631                 }
    632             }
    633         }
    634     }
    635     return TRUE;
    636 }
    637 
    638 // Specific tests for debugging.  These are generally failures taken from
    639 // the conformance file, but culled out to make debugging easier.
    640 
    641 void NormalizerConformanceTest::TestCase6(void) {
    642     _testOneLine("0385;0385;00A8 0301;0020 0308 0301;0020 0308 0301;");
    643 }
    644 
    645 void NormalizerConformanceTest::_testOneLine(const char *line) {
    646   UErrorCode status = U_ZERO_ERROR;
    647     UnicodeString fields[FIELD_COUNT];
    648     if (!hexsplit(line, ';', fields, FIELD_COUNT)) {
    649         errln((UnicodeString)"Unable to parse line " + line);
    650     } else {
    651         checkConformance(fields, line, 0, status);
    652     }
    653 }
    654 
    655 #endif /* #if !UCONFIG_NO_NORMALIZATION */
    656