Home | History | Annotate | Download | only in intltest
// © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /********************************************************************
      4  * COPYRIGHT:
      5  * Copyright (c) 2002-2016, International Business Machines Corporation and
      6  * others. All Rights Reserved.
      7  ********************************************************************
      8  *
      9  * @author Mark E. Davis
     10  * @author Vladimir Weinstein
     11  */
     12 
     13 #include "unicode/utypes.h"
     14 
     15 #if !UCONFIG_NO_NORMALIZATION
     16 
     17 #include "intltest.h"
     18 #include "cmemory.h"
     19 #include "cstring.h"
     20 #include "canittst.h"
     21 #include "unicode/caniter.h"
     22 #include "unicode/normlzr.h"
     23 #include "unicode/uchar.h"
     24 #include "hash.h"
     25 
     26 #define CASE(id,test) case id:                          \
     27                           name = #test;                 \
     28                           if (exec) {                   \
     29                               logln(#test "---");       \
     30                               logln((UnicodeString)""); \
     31                               test();                   \
     32                           }                             \
     33                           break
     34 
     35 void CanonicalIteratorTest::runIndexedTest(int32_t index, UBool exec,
     36                                          const char* &name, char* /*par*/) {
     37     switch (index) {
     38         CASE(0, TestBasic);
     39         CASE(1, TestExhaustive);
     40         CASE(2, TestAPI);
     41       default: name = ""; break;
     42     }
     43 }
     44 
     45 /**
     46  * Convert Java-style strings with \u Unicode escapes into UnicodeString objects
     47 static UnicodeString str(const char *input)
     48 {
     49     UnicodeString str(input, ""); // Invariant conversion
     50     return str.unescape();
     51 }
     52  */
     53 
     54 
     55 CanonicalIteratorTest::CanonicalIteratorTest() :
     56 nameTrans(NULL), hexTrans(NULL)
     57 {
     58 }
     59 
     60 CanonicalIteratorTest::~CanonicalIteratorTest()
     61 {
     62 #if !UCONFIG_NO_TRANSLITERATION
     63   if(nameTrans != NULL) {
     64     delete(nameTrans);
     65   }
     66   if(hexTrans != NULL) {
     67     delete(hexTrans);
     68   }
     69 #endif
     70 }
     71 
     72 void CanonicalIteratorTest::TestExhaustive() {
     73     UErrorCode status = U_ZERO_ERROR;
     74     CanonicalIterator it("", status);
     75     if (U_FAILURE(status)) {
     76         dataerrln("Error creating CanonicalIterator: %s", u_errorName(status));
     77         return;
     78     }
     79     UChar32 i = 0;
     80     UnicodeString s;
     81     // Test static and dynamic class IDs
     82     if(it.getDynamicClassID() != CanonicalIterator::getStaticClassID()){
     83         errln("CanonicalIterator::getStaticClassId ! = CanonicalIterator.getDynamicClassID");
     84     }
     85     for (i = 0; i < 0x10FFFF; quick?i+=0x10:++i) {
     86         //for (i = 0xae00; i < 0xaf00; ++i) {
     87 
     88         if ((i % 0x100) == 0) {
     89             logln("Testing U+%06X", i);
     90         }
     91 
     92         // skip characters we know don't have decomps
     93         int8_t type = u_charType(i);
     94         if (type == U_UNASSIGNED || type == U_PRIVATE_USE_CHAR
     95             || type == U_SURROGATE) continue;
     96 
     97         s = i;
     98         characterTest(s, i, it);
     99 
    100         s += (UChar32)0x0345; //"\\u0345";
    101         characterTest(s, i, it);
    102     }
    103 }
    104 
    105 void CanonicalIteratorTest::TestBasic() {
    106 
    107     UErrorCode status = U_ZERO_ERROR;
    108 
    109     static const char * const testArray[][2] = {
    110         {"\\u00C5d\\u0307\\u0327", "A\\u030Ad\\u0307\\u0327, A\\u030Ad\\u0327\\u0307, A\\u030A\\u1E0B\\u0327, "
    111             "A\\u030A\\u1E11\\u0307, \\u00C5d\\u0307\\u0327, \\u00C5d\\u0327\\u0307, "
    112             "\\u00C5\\u1E0B\\u0327, \\u00C5\\u1E11\\u0307, \\u212Bd\\u0307\\u0327, "
    113             "\\u212Bd\\u0327\\u0307, \\u212B\\u1E0B\\u0327, \\u212B\\u1E11\\u0307"},
    114         {"\\u010d\\u017E", "c\\u030Cz\\u030C, c\\u030C\\u017E, \\u010Dz\\u030C, \\u010D\\u017E"},
    115         {"x\\u0307\\u0327", "x\\u0307\\u0327, x\\u0327\\u0307, \\u1E8B\\u0327"},
    116     };
    117 
    118 #if 0
    119     // This is not interesting for C/C++ as the data is already built beforehand
    120     // check build
    121     UnicodeSet ss = CanonicalIterator.getSafeStart();
    122     logln("Safe Start: " + ss.toPattern(true));
    123     ss = CanonicalIterator.getStarts('a');
    124     expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'),
    125         new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB"
    126         + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]")
    127             );
    128 #endif
    129 
    130     // check permute
    131     // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted!
    132 
    133     Hashtable *permutations = new Hashtable(FALSE, status);
    134     permutations->setValueDeleter(uprv_deleteUObject);
    135     UnicodeString toPermute("ABC");
    136 
    137     CanonicalIterator::permute(toPermute, FALSE, permutations, status);
    138 
    139     logln("testing permutation");
    140 
    141     expectEqual("Simple permutation ", "", collectionToString(permutations), "ABC, ACB, BAC, BCA, CAB, CBA");
    142 
    143     delete permutations;
    144 
    145     // try samples
    146     logln("testing samples");
    147     Hashtable *set = new Hashtable(FALSE, status);
    148     set->setValueDeleter(uprv_deleteUObject);
    149     int32_t i = 0;
    150     CanonicalIterator it("", status);
    151     if(U_SUCCESS(status)) {
    152       for (i = 0; i < UPRV_LENGTHOF(testArray); ++i) {
    153           //logln("Results for: " + name.transliterate(testArray[i]));
    154           UnicodeString testStr = CharsToUnicodeString(testArray[i][0]);
    155           it.setSource(testStr, status);
    156           set->removeAll();
    157           for (;;) {
    158               //UnicodeString *result = new UnicodeString(it.next());
    159               UnicodeString result(it.next());
    160               if (result.isBogus()) {
    161                   break;
    162               }
    163               set->put(result, new UnicodeString(result), status); // Add result to the table
    164               //logln(++counter + ": " + hex.transliterate(result));
    165               //logln(" = " + name.transliterate(result));
    166           }
    167           expectEqual(i + UnicodeString(": "), testStr, collectionToString(set), CharsToUnicodeString(testArray[i][1]));
    168 
    169       }
    170     } else {
    171       dataerrln("Couldn't instantiate canonical iterator. Error: %s", u_errorName(status));
    172     }
    173     delete set;
    174 }
    175 
    176 void CanonicalIteratorTest::characterTest(UnicodeString &s, UChar32 ch, CanonicalIterator &it)
    177 {
    178     UErrorCode status = U_ZERO_ERROR;
    179     UnicodeString decomp, comp;
    180     UBool gotDecomp = FALSE;
    181     UBool gotComp = FALSE;
    182     UBool gotSource = FALSE;
    183 
    184     Normalizer::decompose(s, FALSE, 0, decomp, status);
    185     Normalizer::compose(s, FALSE, 0, comp, status);
    186 
    187     // skip characters that don't have either decomp.
    188     // need quick test for this!
    189     if (s == decomp && s == comp) {
    190         return;
    191     }
    192 
    193     it.setSource(s, status);
    194 
    195     for (;;) {
    196         UnicodeString item = it.next();
    197         if (item.isBogus()) break;
    198         if (item == s) gotSource = TRUE;
    199         if (item == decomp) gotDecomp = TRUE;
    200         if (item == comp) gotComp = TRUE;
    201     }
    202 
    203     if (!gotSource || !gotDecomp || !gotComp) {
    204         errln("FAIL CanonicalIterator: " + s + (int)ch);
    205     }
    206 }
    207 
    208 void CanonicalIteratorTest::expectEqual(const UnicodeString &message, const UnicodeString &item, const UnicodeString &a, const UnicodeString &b) {
    209     if (!(a==b)) {
    210         errln("FAIL: " + message + getReadable(item));
    211         errln("\t" + getReadable(a));
    212         errln("\t" + getReadable(b));
    213     } else {
    214         logln("Checked: " + message + getReadable(item));
    215         logln("\t" + getReadable(a));
    216         logln("\t" + getReadable(b));
    217     }
    218 }
    219 
    220 UnicodeString CanonicalIteratorTest::getReadable(const UnicodeString &s) {
    221   UErrorCode status = U_ZERO_ERROR;
    222   UnicodeString result = "[";
    223     if (s.length() == 0) return "";
    224     // set up for readable display
    225 #if !UCONFIG_NO_TRANSLITERATION
    226     if(verbose) {
    227       if (nameTrans == NULL)
    228           nameTrans = Transliterator::createInstance("[^\\ -\\u007F] name", UTRANS_FORWARD, status);
    229       UnicodeString sName = s;
    230       nameTrans->transliterate(sName);
    231       result += sName;
    232       result += ";";
    233     }
    234     if (hexTrans == NULL)
    235         hexTrans = Transliterator::createInstance("[^\\ -\\u007F] hex", UTRANS_FORWARD, status);
    236 #endif
    237     UnicodeString sHex = s;
    238 #if !UCONFIG_NO_TRANSLITERATION
    239     if(hexTrans) { // maybe there is no data and transliterator cannot be instantiated
    240       hexTrans->transliterate(sHex);
    241     }
    242 #endif
    243     result += sHex;
    244     result += "]";
    245     return result;
    246     //return "[" + (verbose ? name->transliterate(s) + "; " : "") + hex->transliterate(s) + "]";
    247 }
    248 
    249 U_CFUNC int U_CALLCONV
    250 compareUnicodeStrings(const void *s1, const void *s2) {
    251   UnicodeString **st1 = (UnicodeString **)s1;
    252   UnicodeString **st2 = (UnicodeString **)s2;
    253 
    254   return (*st1)->compare(**st2);
    255 }
    256 
    257 
    258 UnicodeString CanonicalIteratorTest::collectionToString(Hashtable *col) {
    259     UnicodeString result;
    260 
    261     // Iterate over the Hashtable, then qsort.
    262 
    263     UnicodeString **resArray = new UnicodeString*[col->count()];
    264     int32_t i = 0;
    265 
    266     const UHashElement *ne = NULL;
    267     int32_t el = UHASH_FIRST;
    268     //Iterator it = basic.iterator();
    269     ne = col->nextElement(el);
    270     //while (it.hasNext())
    271     while (ne != NULL) {
    272       //String item = (String) it.next();
    273       UnicodeString *item = (UnicodeString *)(ne->value.pointer);
    274       resArray[i++] = item;
    275       ne = col->nextElement(el);
    276     }
    277 
    278     for(i = 0; i<col->count(); ++i) {
    279       logln(*resArray[i]);
    280     }
    281 
    282     qsort(resArray, col->count(), sizeof(UnicodeString *), compareUnicodeStrings);
    283 
    284     result = *resArray[0];
    285 
    286     for(i = 1; i<col->count(); ++i) {
    287       result += ", ";
    288       result += *resArray[i];
    289     }
    290 
    291 /*
    292     Iterator it = col.iterator();
    293     while (it.hasNext()) {
    294         if (result.length() != 0) result.append(", ");
    295         result.append(it.next().toString());
    296     }
    297 */
    298 
    299     delete [] resArray;
    300 
    301     return result;
    302 }
    303 
    304 void CanonicalIteratorTest::TestAPI() {
    305   UErrorCode status = U_ZERO_ERROR;
    306   // Test reset and getSource
    307   UnicodeString start("ljubav");
    308   logln("Testing CanonicalIterator::getSource");
    309   logln("Instantiating canonical iterator with string "+start);
    310   CanonicalIterator can(start, status);
    311   if (U_FAILURE(status)) {
    312       dataerrln("Error creating CanonicalIterator: %s", u_errorName(status));
    313       return;
    314   }
    315   UnicodeString source = can.getSource();
    316   logln("CanonicalIterator::getSource returned "+source);
    317   if(start != source) {
    318     errln("CanonicalIterator.getSource() didn't return the starting string. Expected "+start+", got "+source);
    319   }
    320   logln("Testing CanonicalIterator::reset");
    321   UnicodeString next = can.next();
    322   logln("CanonicalIterator::next returned "+next);
    323 
    324   can.reset();
    325 
    326   UnicodeString afterReset = can.next();
    327   logln("After reset, CanonicalIterator::next returned "+afterReset);
    328 
    329   if(next != afterReset) {
    330     errln("Next after instantiation ("+next+") is different from next after reset ("+afterReset+").");
    331   }
    332 
    333   logln("Testing getStaticClassID and getDynamicClassID");
    334   if(can.getDynamicClassID() != CanonicalIterator::getStaticClassID()){
    335       errln("RTTI failed for CanonicalIterator getDynamicClassID != getStaticClassID");
    336   }
    337 }
    338 
    339 #endif /* #if !UCONFIG_NO_NORMALIZATION */
    340