1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /******************************************************************** 4 * COPYRIGHT: 5 * Copyright (c) 2002-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************** 8 * 9 * @author Mark E. Davis 10 * @author Vladimir Weinstein 11 */ 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_NORMALIZATION 16 17 #include "intltest.h" 18 #include "cmemory.h" 19 #include "cstring.h" 20 #include "canittst.h" 21 #include "unicode/caniter.h" 22 #include "unicode/normlzr.h" 23 #include "unicode/uchar.h" 24 #include "hash.h" 25 26 #define CASE(id,test) case id: \ 27 name = #test; \ 28 if (exec) { \ 29 logln(#test "---"); \ 30 logln((UnicodeString)""); \ 31 test(); \ 32 } \ 33 break 34 35 void CanonicalIteratorTest::runIndexedTest(int32_t index, UBool exec, 36 const char* &name, char* /*par*/) { 37 switch (index) { 38 CASE(0, TestBasic); 39 CASE(1, TestExhaustive); 40 CASE(2, TestAPI); 41 default: name = ""; break; 42 } 43 } 44 45 /** 46 * Convert Java-style strings with \u Unicode escapes into UnicodeString objects 47 static UnicodeString str(const char *input) 48 { 49 UnicodeString str(input, ""); // Invariant conversion 50 return str.unescape(); 51 } 52 */ 53 54 55 CanonicalIteratorTest::CanonicalIteratorTest() : 56 nameTrans(NULL), hexTrans(NULL) 57 { 58 } 59 60 CanonicalIteratorTest::~CanonicalIteratorTest() 61 { 62 #if !UCONFIG_NO_TRANSLITERATION 63 if(nameTrans != NULL) { 64 delete(nameTrans); 65 } 66 if(hexTrans != NULL) { 67 delete(hexTrans); 68 } 69 #endif 70 } 71 72 void CanonicalIteratorTest::TestExhaustive() { 73 UErrorCode status = U_ZERO_ERROR; 74 CanonicalIterator it("", status); 75 if (U_FAILURE(status)) { 76 dataerrln("Error creating CanonicalIterator: %s", u_errorName(status)); 77 return; 78 } 79 UChar32 i = 0; 80 UnicodeString s; 81 // Test static and dynamic class IDs 82 if(it.getDynamicClassID() != CanonicalIterator::getStaticClassID()){ 83 errln("CanonicalIterator::getStaticClassId ! = CanonicalIterator.getDynamicClassID"); 84 } 85 for (i = 0; i < 0x10FFFF; quick?i+=0x10:++i) { 86 //for (i = 0xae00; i < 0xaf00; ++i) { 87 88 if ((i % 0x100) == 0) { 89 logln("Testing U+%06X", i); 90 } 91 92 // skip characters we know don't have decomps 93 int8_t type = u_charType(i); 94 if (type == U_UNASSIGNED || type == U_PRIVATE_USE_CHAR 95 || type == U_SURROGATE) continue; 96 97 s = i; 98 characterTest(s, i, it); 99 100 s += (UChar32)0x0345; //"\\u0345"; 101 characterTest(s, i, it); 102 } 103 } 104 105 void CanonicalIteratorTest::TestBasic() { 106 107 UErrorCode status = U_ZERO_ERROR; 108 109 static const char * const testArray[][2] = { 110 {"\\u00C5d\\u0307\\u0327", "A\\u030Ad\\u0307\\u0327, A\\u030Ad\\u0327\\u0307, A\\u030A\\u1E0B\\u0327, " 111 "A\\u030A\\u1E11\\u0307, \\u00C5d\\u0307\\u0327, \\u00C5d\\u0327\\u0307, " 112 "\\u00C5\\u1E0B\\u0327, \\u00C5\\u1E11\\u0307, \\u212Bd\\u0307\\u0327, " 113 "\\u212Bd\\u0327\\u0307, \\u212B\\u1E0B\\u0327, \\u212B\\u1E11\\u0307"}, 114 {"\\u010d\\u017E", "c\\u030Cz\\u030C, c\\u030C\\u017E, \\u010Dz\\u030C, \\u010D\\u017E"}, 115 {"x\\u0307\\u0327", "x\\u0307\\u0327, x\\u0327\\u0307, \\u1E8B\\u0327"}, 116 }; 117 118 #if 0 119 // This is not interesting for C/C++ as the data is already built beforehand 120 // check build 121 UnicodeSet ss = CanonicalIterator.getSafeStart(); 122 logln("Safe Start: " + ss.toPattern(true)); 123 ss = CanonicalIterator.getStarts('a'); 124 expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'), 125 new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB" 126 + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]") 127 ); 128 #endif 129 130 // check permute 131 // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted! 132 133 Hashtable *permutations = new Hashtable(FALSE, status); 134 permutations->setValueDeleter(uprv_deleteUObject); 135 UnicodeString toPermute("ABC"); 136 137 CanonicalIterator::permute(toPermute, FALSE, permutations, status); 138 139 logln("testing permutation"); 140 141 expectEqual("Simple permutation ", "", collectionToString(permutations), "ABC, ACB, BAC, BCA, CAB, CBA"); 142 143 delete permutations; 144 145 // try samples 146 logln("testing samples"); 147 Hashtable *set = new Hashtable(FALSE, status); 148 set->setValueDeleter(uprv_deleteUObject); 149 int32_t i = 0; 150 CanonicalIterator it("", status); 151 if(U_SUCCESS(status)) { 152 for (i = 0; i < UPRV_LENGTHOF(testArray); ++i) { 153 //logln("Results for: " + name.transliterate(testArray[i])); 154 UnicodeString testStr = CharsToUnicodeString(testArray[i][0]); 155 it.setSource(testStr, status); 156 set->removeAll(); 157 for (;;) { 158 //UnicodeString *result = new UnicodeString(it.next()); 159 UnicodeString result(it.next()); 160 if (result.isBogus()) { 161 break; 162 } 163 set->put(result, new UnicodeString(result), status); // Add result to the table 164 //logln(++counter + ": " + hex.transliterate(result)); 165 //logln(" = " + name.transliterate(result)); 166 } 167 expectEqual(i + UnicodeString(": "), testStr, collectionToString(set), CharsToUnicodeString(testArray[i][1])); 168 169 } 170 } else { 171 dataerrln("Couldn't instantiate canonical iterator. Error: %s", u_errorName(status)); 172 } 173 delete set; 174 } 175 176 void CanonicalIteratorTest::characterTest(UnicodeString &s, UChar32 ch, CanonicalIterator &it) 177 { 178 UErrorCode status = U_ZERO_ERROR; 179 UnicodeString decomp, comp; 180 UBool gotDecomp = FALSE; 181 UBool gotComp = FALSE; 182 UBool gotSource = FALSE; 183 184 Normalizer::decompose(s, FALSE, 0, decomp, status); 185 Normalizer::compose(s, FALSE, 0, comp, status); 186 187 // skip characters that don't have either decomp. 188 // need quick test for this! 189 if (s == decomp && s == comp) { 190 return; 191 } 192 193 it.setSource(s, status); 194 195 for (;;) { 196 UnicodeString item = it.next(); 197 if (item.isBogus()) break; 198 if (item == s) gotSource = TRUE; 199 if (item == decomp) gotDecomp = TRUE; 200 if (item == comp) gotComp = TRUE; 201 } 202 203 if (!gotSource || !gotDecomp || !gotComp) { 204 errln("FAIL CanonicalIterator: " + s + (int)ch); 205 } 206 } 207 208 void CanonicalIteratorTest::expectEqual(const UnicodeString &message, const UnicodeString &item, const UnicodeString &a, const UnicodeString &b) { 209 if (!(a==b)) { 210 errln("FAIL: " + message + getReadable(item)); 211 errln("\t" + getReadable(a)); 212 errln("\t" + getReadable(b)); 213 } else { 214 logln("Checked: " + message + getReadable(item)); 215 logln("\t" + getReadable(a)); 216 logln("\t" + getReadable(b)); 217 } 218 } 219 220 UnicodeString CanonicalIteratorTest::getReadable(const UnicodeString &s) { 221 UErrorCode status = U_ZERO_ERROR; 222 UnicodeString result = "["; 223 if (s.length() == 0) return ""; 224 // set up for readable display 225 #if !UCONFIG_NO_TRANSLITERATION 226 if(verbose) { 227 if (nameTrans == NULL) 228 nameTrans = Transliterator::createInstance("[^\\ -\\u007F] name", UTRANS_FORWARD, status); 229 UnicodeString sName = s; 230 nameTrans->transliterate(sName); 231 result += sName; 232 result += ";"; 233 } 234 if (hexTrans == NULL) 235 hexTrans = Transliterator::createInstance("[^\\ -\\u007F] hex", UTRANS_FORWARD, status); 236 #endif 237 UnicodeString sHex = s; 238 #if !UCONFIG_NO_TRANSLITERATION 239 if(hexTrans) { // maybe there is no data and transliterator cannot be instantiated 240 hexTrans->transliterate(sHex); 241 } 242 #endif 243 result += sHex; 244 result += "]"; 245 return result; 246 //return "[" + (verbose ? name->transliterate(s) + "; " : "") + hex->transliterate(s) + "]"; 247 } 248 249 U_CFUNC int U_CALLCONV 250 compareUnicodeStrings(const void *s1, const void *s2) { 251 UnicodeString **st1 = (UnicodeString **)s1; 252 UnicodeString **st2 = (UnicodeString **)s2; 253 254 return (*st1)->compare(**st2); 255 } 256 257 258 UnicodeString CanonicalIteratorTest::collectionToString(Hashtable *col) { 259 UnicodeString result; 260 261 // Iterate over the Hashtable, then qsort. 262 263 UnicodeString **resArray = new UnicodeString*[col->count()]; 264 int32_t i = 0; 265 266 const UHashElement *ne = NULL; 267 int32_t el = UHASH_FIRST; 268 //Iterator it = basic.iterator(); 269 ne = col->nextElement(el); 270 //while (it.hasNext()) 271 while (ne != NULL) { 272 //String item = (String) it.next(); 273 UnicodeString *item = (UnicodeString *)(ne->value.pointer); 274 resArray[i++] = item; 275 ne = col->nextElement(el); 276 } 277 278 for(i = 0; i<col->count(); ++i) { 279 logln(*resArray[i]); 280 } 281 282 qsort(resArray, col->count(), sizeof(UnicodeString *), compareUnicodeStrings); 283 284 result = *resArray[0]; 285 286 for(i = 1; i<col->count(); ++i) { 287 result += ", "; 288 result += *resArray[i]; 289 } 290 291 /* 292 Iterator it = col.iterator(); 293 while (it.hasNext()) { 294 if (result.length() != 0) result.append(", "); 295 result.append(it.next().toString()); 296 } 297 */ 298 299 delete [] resArray; 300 301 return result; 302 } 303 304 void CanonicalIteratorTest::TestAPI() { 305 UErrorCode status = U_ZERO_ERROR; 306 // Test reset and getSource 307 UnicodeString start("ljubav"); 308 logln("Testing CanonicalIterator::getSource"); 309 logln("Instantiating canonical iterator with string "+start); 310 CanonicalIterator can(start, status); 311 if (U_FAILURE(status)) { 312 dataerrln("Error creating CanonicalIterator: %s", u_errorName(status)); 313 return; 314 } 315 UnicodeString source = can.getSource(); 316 logln("CanonicalIterator::getSource returned "+source); 317 if(start != source) { 318 errln("CanonicalIterator.getSource() didn't return the starting string. Expected "+start+", got "+source); 319 } 320 logln("Testing CanonicalIterator::reset"); 321 UnicodeString next = can.next(); 322 logln("CanonicalIterator::next returned "+next); 323 324 can.reset(); 325 326 UnicodeString afterReset = can.next(); 327 logln("After reset, CanonicalIterator::next returned "+afterReset); 328 329 if(next != afterReset) { 330 errln("Next after instantiation ("+next+") is different from next after reset ("+afterReset+")."); 331 } 332 333 logln("Testing getStaticClassID and getDynamicClassID"); 334 if(can.getDynamicClassID() != CanonicalIterator::getStaticClassID()){ 335 errln("RTTI failed for CanonicalIterator getDynamicClassID != getStaticClassID"); 336 } 337 } 338 339 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 340