1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 2002-2010, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************** 6 * 7 * @author Mark E. Davis 8 * @author Vladimir Weinstein 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_NORMALIZATION 14 15 #include "intltest.h" 16 #include "cstring.h" 17 #include "canittst.h" 18 #include "unicode/caniter.h" 19 #include "unicode/normlzr.h" 20 #include "unicode/uchar.h" 21 #include "hash.h" 22 23 #define ARRAY_LENGTH(array) ((int32_t)(sizeof (array) / sizeof (*array))) 24 25 #define CASE(id,test) case id: \ 26 name = #test; \ 27 if (exec) { \ 28 logln(#test "---"); \ 29 logln((UnicodeString)""); \ 30 test(); \ 31 } \ 32 break 33 34 void CanonicalIteratorTest::runIndexedTest(int32_t index, UBool exec, 35 const char* &name, char* /*par*/) { 36 switch (index) { 37 CASE(0, TestBasic); 38 CASE(1, TestExhaustive); 39 CASE(2, TestAPI); 40 default: name = ""; break; 41 } 42 } 43 44 /** 45 * Convert Java-style strings with \u Unicode escapes into UnicodeString objects 46 static UnicodeString str(const char *input) 47 { 48 UnicodeString str(input, ""); // Invariant conversion 49 return str.unescape(); 50 } 51 */ 52 53 54 CanonicalIteratorTest::CanonicalIteratorTest() : 55 nameTrans(NULL), hexTrans(NULL) 56 { 57 } 58 59 CanonicalIteratorTest::~CanonicalIteratorTest() 60 { 61 #if !UCONFIG_NO_TRANSLITERATION 62 if(nameTrans != NULL) { 63 delete(nameTrans); 64 } 65 if(hexTrans != NULL) { 66 delete(hexTrans); 67 } 68 #endif 69 } 70 71 void CanonicalIteratorTest::TestExhaustive() { 72 UErrorCode status = U_ZERO_ERROR; 73 CanonicalIterator it("", status); 74 if (U_FAILURE(status)) { 75 dataerrln("Error creating CanonicalIterator: %s", u_errorName(status)); 76 return; 77 } 78 UChar32 i = 0; 79 UnicodeString s; 80 // Test static and dynamic class IDs 81 if(it.getDynamicClassID() != CanonicalIterator::getStaticClassID()){ 82 errln("CanonicalIterator::getStaticClassId ! = CanonicalIterator.getDynamicClassID"); 83 } 84 for (i = 0; i < 0x10FFFF; quick?i+=0x10:++i) { 85 //for (i = 0xae00; i < 0xaf00; ++i) { 86 87 if ((i % 0x100) == 0) { 88 logln("Testing U+%06X", i); 89 } 90 91 // skip characters we know don't have decomps 92 int8_t type = u_charType(i); 93 if (type == U_UNASSIGNED || type == U_PRIVATE_USE_CHAR 94 || type == U_SURROGATE) continue; 95 96 s = i; 97 characterTest(s, i, it); 98 99 s += (UChar32)0x0345; //"\\u0345"; 100 characterTest(s, i, it); 101 } 102 } 103 104 void CanonicalIteratorTest::TestBasic() { 105 106 UErrorCode status = U_ZERO_ERROR; 107 108 static const char * const testArray[][2] = { 109 {"\\u00C5d\\u0307\\u0327", "A\\u030Ad\\u0307\\u0327, A\\u030Ad\\u0327\\u0307, A\\u030A\\u1E0B\\u0327, " 110 "A\\u030A\\u1E11\\u0307, \\u00C5d\\u0307\\u0327, \\u00C5d\\u0327\\u0307, " 111 "\\u00C5\\u1E0B\\u0327, \\u00C5\\u1E11\\u0307, \\u212Bd\\u0307\\u0327, " 112 "\\u212Bd\\u0327\\u0307, \\u212B\\u1E0B\\u0327, \\u212B\\u1E11\\u0307"}, 113 {"\\u010d\\u017E", "c\\u030Cz\\u030C, c\\u030C\\u017E, \\u010Dz\\u030C, \\u010D\\u017E"}, 114 {"x\\u0307\\u0327", "x\\u0307\\u0327, x\\u0327\\u0307, \\u1E8B\\u0327"}, 115 }; 116 117 #if 0 118 // This is not interesting for C/C++ as the data is already built beforehand 119 // check build 120 UnicodeSet ss = CanonicalIterator.getSafeStart(); 121 logln("Safe Start: " + ss.toPattern(true)); 122 ss = CanonicalIterator.getStarts('a'); 123 expectEqual("Characters with 'a' at the start of their decomposition: ", "", CanonicalIterator.getStarts('a'), 124 new UnicodeSet("[\u00E0-\u00E5\u0101\u0103\u0105\u01CE\u01DF\u01E1\u01FB" 125 + "\u0201\u0203\u0227\u1E01\u1EA1\u1EA3\u1EA5\u1EA7\u1EA9\u1EAB\u1EAD\u1EAF\u1EB1\u1EB3\u1EB5\u1EB7]") 126 ); 127 #endif 128 129 // check permute 130 // NOTE: we use a TreeSet below to sort the output, which is not guaranteed to be sorted! 131 132 Hashtable *permutations = new Hashtable(FALSE, status); 133 permutations->setValueDeleter(uhash_deleteUnicodeString); 134 UnicodeString toPermute("ABC"); 135 136 CanonicalIterator::permute(toPermute, FALSE, permutations, status); 137 138 logln("testing permutation"); 139 140 expectEqual("Simple permutation ", "", collectionToString(permutations), "ABC, ACB, BAC, BCA, CAB, CBA"); 141 142 delete permutations; 143 144 // try samples 145 logln("testing samples"); 146 Hashtable *set = new Hashtable(FALSE, status); 147 set->setValueDeleter(uhash_deleteUnicodeString); 148 int32_t i = 0; 149 CanonicalIterator it("", status); 150 if(U_SUCCESS(status)) { 151 for (i = 0; i < ARRAY_LENGTH(testArray); ++i) { 152 //logln("Results for: " + name.transliterate(testArray[i])); 153 UnicodeString testStr = CharsToUnicodeString(testArray[i][0]); 154 it.setSource(testStr, status); 155 set->removeAll(); 156 for (;;) { 157 //UnicodeString *result = new UnicodeString(it.next()); 158 UnicodeString result(it.next()); 159 if (result.isBogus()) { 160 break; 161 } 162 set->put(result, new UnicodeString(result), status); // Add result to the table 163 //logln(++counter + ": " + hex.transliterate(result)); 164 //logln(" = " + name.transliterate(result)); 165 } 166 expectEqual(i + ": ", testStr, collectionToString(set), CharsToUnicodeString(testArray[i][1])); 167 168 } 169 } else { 170 dataerrln("Couldn't instantiate canonical iterator. Error: %s", u_errorName(status)); 171 } 172 delete set; 173 } 174 175 void CanonicalIteratorTest::characterTest(UnicodeString &s, UChar32 ch, CanonicalIterator &it) 176 { 177 UErrorCode status = U_ZERO_ERROR; 178 UnicodeString decomp, comp; 179 UBool gotDecomp = FALSE; 180 UBool gotComp = FALSE; 181 UBool gotSource = FALSE; 182 183 Normalizer::decompose(s, FALSE, 0, decomp, status); 184 Normalizer::compose(s, FALSE, 0, comp, status); 185 186 // skip characters that don't have either decomp. 187 // need quick test for this! 188 if (s == decomp && s == comp) { 189 return; 190 } 191 192 it.setSource(s, status); 193 194 for (;;) { 195 UnicodeString item = it.next(); 196 if (item.isBogus()) break; 197 if (item == s) gotSource = TRUE; 198 if (item == decomp) gotDecomp = TRUE; 199 if (item == comp) gotComp = TRUE; 200 } 201 202 if (!gotSource || !gotDecomp || !gotComp) { 203 errln("FAIL CanonicalIterator: " + s + (int)ch); 204 } 205 } 206 207 void CanonicalIteratorTest::expectEqual(const UnicodeString &message, const UnicodeString &item, const UnicodeString &a, const UnicodeString &b) { 208 if (!(a==b)) { 209 errln("FAIL: " + message + getReadable(item)); 210 errln("\t" + getReadable(a)); 211 errln("\t" + getReadable(b)); 212 } else { 213 logln("Checked: " + message + getReadable(item)); 214 logln("\t" + getReadable(a)); 215 logln("\t" + getReadable(b)); 216 } 217 } 218 219 UnicodeString CanonicalIteratorTest::getReadable(const UnicodeString &s) { 220 UErrorCode status = U_ZERO_ERROR; 221 UnicodeString result = "["; 222 if (s.length() == 0) return ""; 223 // set up for readable display 224 #if !UCONFIG_NO_TRANSLITERATION 225 if(verbose) { 226 if (nameTrans == NULL) 227 nameTrans = Transliterator::createInstance("[^\\ -\\u007F] name", UTRANS_FORWARD, status); 228 UnicodeString sName = s; 229 nameTrans->transliterate(sName); 230 result += sName; 231 result += ";"; 232 } 233 if (hexTrans == NULL) 234 hexTrans = Transliterator::createInstance("[^\\ -\\u007F] hex", UTRANS_FORWARD, status); 235 #endif 236 UnicodeString sHex = s; 237 #if !UCONFIG_NO_TRANSLITERATION 238 if(hexTrans) { // maybe there is no data and transliterator cannot be instantiated 239 hexTrans->transliterate(sHex); 240 } 241 #endif 242 result += sHex; 243 result += "]"; 244 return result; 245 //return "[" + (verbose ? name->transliterate(s) + "; " : "") + hex->transliterate(s) + "]"; 246 } 247 248 U_CFUNC int U_CALLCONV 249 compareUnicodeStrings(const void *s1, const void *s2) { 250 UnicodeString **st1 = (UnicodeString **)s1; 251 UnicodeString **st2 = (UnicodeString **)s2; 252 253 return (*st1)->compare(**st2); 254 } 255 256 257 UnicodeString CanonicalIteratorTest::collectionToString(Hashtable *col) { 258 UnicodeString result; 259 260 // Iterate over the Hashtable, then qsort. 261 262 UnicodeString **resArray = new UnicodeString*[col->count()]; 263 int32_t i = 0; 264 265 const UHashElement *ne = NULL; 266 int32_t el = -1; 267 //Iterator it = basic.iterator(); 268 ne = col->nextElement(el); 269 //while (it.hasNext()) 270 while (ne != NULL) { 271 //String item = (String) it.next(); 272 UnicodeString *item = (UnicodeString *)(ne->value.pointer); 273 resArray[i++] = item; 274 ne = col->nextElement(el); 275 } 276 277 for(i = 0; i<col->count(); ++i) { 278 logln(*resArray[i]); 279 } 280 281 qsort(resArray, col->count(), sizeof(UnicodeString *), compareUnicodeStrings); 282 283 result = *resArray[0]; 284 285 for(i = 1; i<col->count(); ++i) { 286 result += ", "; 287 result += *resArray[i]; 288 } 289 290 /* 291 Iterator it = col.iterator(); 292 while (it.hasNext()) { 293 if (result.length() != 0) result.append(", "); 294 result.append(it.next().toString()); 295 } 296 */ 297 298 delete [] resArray; 299 300 return result; 301 } 302 303 void CanonicalIteratorTest::TestAPI() { 304 UErrorCode status = U_ZERO_ERROR; 305 // Test reset and getSource 306 UnicodeString start("ljubav"); 307 logln("Testing CanonicalIterator::getSource"); 308 logln("Instantiating canonical iterator with string "+start); 309 CanonicalIterator can(start, status); 310 if (U_FAILURE(status)) { 311 dataerrln("Error creating CanonicalIterator: %s", u_errorName(status)); 312 return; 313 } 314 UnicodeString source = can.getSource(); 315 logln("CanonicalIterator::getSource returned "+source); 316 if(start != source) { 317 errln("CanonicalIterator.getSource() didn't return the starting string. Expected "+start+", got "+source); 318 } 319 logln("Testing CanonicalIterator::reset"); 320 UnicodeString next = can.next(); 321 logln("CanonicalIterator::next returned "+next); 322 323 can.reset(); 324 325 UnicodeString afterReset = can.next(); 326 logln("After reset, CanonicalIterator::next returned "+afterReset); 327 328 if(next != afterReset) { 329 errln("Next after instantiation ("+next+") is different from next after reset ("+afterReset+")."); 330 } 331 332 logln("Testing getStaticClassID and getDynamicClassID"); 333 if(can.getDynamicClassID() != CanonicalIterator::getStaticClassID()){ 334 errln("RTTI failed for CanonicalIterator getDynamicClassID != getStaticClassID"); 335 } 336 } 337 338 #endif /* #if !UCONFIG_NO_NORMALIZATION */ 339