1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 1997-2014, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 7 #include "unicode/ustring.h" 8 #include "unicode/uchar.h" 9 #include "unicode/uniset.h" 10 #include "unicode/putil.h" 11 #include "unicode/uscript.h" 12 #include "cstring.h" 13 #include "hash.h" 14 #include "patternprops.h" 15 #include "normalizer2impl.h" 16 #include "uparse.h" 17 #include "ucdtest.h" 18 19 static const char *ignorePropNames[]={ 20 "FC_NFKC", 21 "NFD_QC", 22 "NFC_QC", 23 "NFKD_QC", 24 "NFKC_QC", 25 "Expands_On_NFD", 26 "Expands_On_NFC", 27 "Expands_On_NFKD", 28 "Expands_On_NFKC", 29 "NFKC_CF" 30 }; 31 32 UnicodeTest::UnicodeTest() 33 { 34 UErrorCode errorCode=U_ZERO_ERROR; 35 unknownPropertyNames=new U_NAMESPACE_QUALIFIER Hashtable(errorCode); 36 if(U_FAILURE(errorCode)) { 37 delete unknownPropertyNames; 38 unknownPropertyNames=NULL; 39 } 40 // Ignore some property names altogether. 41 for(int32_t i=0; i<UPRV_LENGTHOF(ignorePropNames); ++i) { 42 unknownPropertyNames->puti(UnicodeString(ignorePropNames[i], -1, US_INV), 1, errorCode); 43 } 44 } 45 46 UnicodeTest::~UnicodeTest() 47 { 48 delete unknownPropertyNames; 49 } 50 51 void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) 52 { 53 if(exec) { 54 logln("TestSuite UnicodeTest: "); 55 } 56 TESTCASE_AUTO_BEGIN; 57 TESTCASE_AUTO(TestAdditionalProperties); 58 TESTCASE_AUTO(TestBinaryValues); 59 TESTCASE_AUTO(TestConsistency); 60 TESTCASE_AUTO(TestPatternProperties); 61 TESTCASE_AUTO(TestScriptMetadata); 62 TESTCASE_AUTO(TestBidiPairedBracketType); 63 TESTCASE_AUTO_END; 64 } 65 66 //==================================================== 67 // private data used by the tests 68 //==================================================== 69 70 // test DerivedCoreProperties.txt ------------------------------------------- 71 72 // copied from genprops.c 73 static int32_t 74 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) { 75 const char *t, *z; 76 int32_t i, j; 77 78 s=u_skipWhitespace(s); 79 for(i=0; i<countTokens; ++i) { 80 t=tokens[i]; 81 if(t!=NULL) { 82 for(j=0;; ++j) { 83 if(t[j]!=0) { 84 if(s[j]!=t[j]) { 85 break; 86 } 87 } else { 88 z=u_skipWhitespace(s+j); 89 if(*z==';' || *z==0) { 90 return i; 91 } else { 92 break; 93 } 94 } 95 } 96 } 97 } 98 return -1; 99 } 100 101 static const char *const 102 derivedPropsNames[]={ 103 "Math", 104 "Alphabetic", 105 "Lowercase", 106 "Uppercase", 107 "ID_Start", 108 "ID_Continue", 109 "XID_Start", 110 "XID_Continue", 111 "Default_Ignorable_Code_Point", 112 "Full_Composition_Exclusion", 113 "Grapheme_Extend", 114 "Grapheme_Link", /* Unicode 5 moves this property here from PropList.txt */ 115 "Grapheme_Base", 116 "Cased", 117 "Case_Ignorable", 118 "Changes_When_Lowercased", 119 "Changes_When_Uppercased", 120 "Changes_When_Titlecased", 121 "Changes_When_Casefolded", 122 "Changes_When_Casemapped", 123 "Changes_When_NFKC_Casefolded" 124 }; 125 126 static const UProperty 127 derivedPropsIndex[]={ 128 UCHAR_MATH, 129 UCHAR_ALPHABETIC, 130 UCHAR_LOWERCASE, 131 UCHAR_UPPERCASE, 132 UCHAR_ID_START, 133 UCHAR_ID_CONTINUE, 134 UCHAR_XID_START, 135 UCHAR_XID_CONTINUE, 136 UCHAR_DEFAULT_IGNORABLE_CODE_POINT, 137 UCHAR_FULL_COMPOSITION_EXCLUSION, 138 UCHAR_GRAPHEME_EXTEND, 139 UCHAR_GRAPHEME_LINK, 140 UCHAR_GRAPHEME_BASE, 141 UCHAR_CASED, 142 UCHAR_CASE_IGNORABLE, 143 UCHAR_CHANGES_WHEN_LOWERCASED, 144 UCHAR_CHANGES_WHEN_UPPERCASED, 145 UCHAR_CHANGES_WHEN_TITLECASED, 146 UCHAR_CHANGES_WHEN_CASEFOLDED, 147 UCHAR_CHANGES_WHEN_CASEMAPPED, 148 UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED 149 }; 150 151 static int32_t numErrors[UPRV_LENGTHOF(derivedPropsIndex)]={ 0 }; 152 153 enum { MAX_ERRORS=50 }; 154 155 U_CFUNC void U_CALLCONV 156 derivedPropsLineFn(void *context, 157 char *fields[][2], int32_t /* fieldCount */, 158 UErrorCode *pErrorCode) 159 { 160 UnicodeTest *me=(UnicodeTest *)context; 161 uint32_t start, end; 162 int32_t i; 163 164 u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); 165 if(U_FAILURE(*pErrorCode)) { 166 me->errln("UnicodeTest: syntax error in DerivedCoreProperties.txt or DerivedNormalizationProps.txt field 0 at %s\n", fields[0][0]); 167 return; 168 } 169 170 /* parse derived binary property name, ignore unknown names */ 171 i=getTokenIndex(derivedPropsNames, UPRV_LENGTHOF(derivedPropsNames), fields[1][0]); 172 if(i<0) { 173 UnicodeString propName(fields[1][0], (int32_t)(fields[1][1]-fields[1][0])); 174 propName.trim(); 175 if(me->unknownPropertyNames->find(propName)==NULL) { 176 UErrorCode errorCode=U_ZERO_ERROR; 177 me->unknownPropertyNames->puti(propName, 1, errorCode); 178 me->errln("UnicodeTest warning: unknown property name '%s' in DerivedCoreProperties.txt or DerivedNormalizationProps.txt\n", fields[1][0]); 179 } 180 return; 181 } 182 183 me->derivedProps[i].add(start, end); 184 } 185 186 void UnicodeTest::TestAdditionalProperties() { 187 #if !UCONFIG_NO_NORMALIZATION 188 // test DerivedCoreProperties.txt and DerivedNormalizationProps.txt 189 if(UPRV_LENGTHOF(derivedProps)<UPRV_LENGTHOF(derivedPropsNames)) { 190 errln("error: UnicodeTest::derivedProps[] too short, need at least %d UnicodeSets\n", 191 UPRV_LENGTHOF(derivedPropsNames)); 192 return; 193 } 194 if(UPRV_LENGTHOF(derivedPropsIndex)!=UPRV_LENGTHOF(derivedPropsNames)) { 195 errln("error in ucdtest.cpp: UPRV_LENGTHOF(derivedPropsIndex)!=UPRV_LENGTHOF(derivedPropsNames)\n"); 196 return; 197 } 198 199 char path[500]; 200 if(getUnidataPath(path) == NULL) { 201 errln("unable to find path to source/data/unidata/"); 202 return; 203 } 204 char *basename=strchr(path, 0); 205 strcpy(basename, "DerivedCoreProperties.txt"); 206 207 char *fields[2][2]; 208 UErrorCode errorCode=U_ZERO_ERROR; 209 u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode); 210 if(U_FAILURE(errorCode)) { 211 errln("error parsing DerivedCoreProperties.txt: %s\n", u_errorName(errorCode)); 212 return; 213 } 214 215 strcpy(basename, "DerivedNormalizationProps.txt"); 216 u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode); 217 if(U_FAILURE(errorCode)) { 218 errln("error parsing DerivedNormalizationProps.txt: %s\n", u_errorName(errorCode)); 219 return; 220 } 221 222 // now we have all derived core properties in the UnicodeSets 223 // run them all through the API 224 int32_t rangeCount, range; 225 uint32_t i; 226 UChar32 start, end; 227 228 // test all TRUE properties 229 for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) { 230 rangeCount=derivedProps[i].getRangeCount(); 231 for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) { 232 start=derivedProps[i].getRangeStart(range); 233 end=derivedProps[i].getRangeEnd(range); 234 for(; start<=end; ++start) { 235 if(!u_hasBinaryProperty(start, derivedPropsIndex[i])) { 236 dataerrln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==FALSE is wrong", start, derivedPropsNames[i]); 237 if(++numErrors[i]>=MAX_ERRORS) { 238 dataerrln("Too many errors, moving to the next test"); 239 break; 240 } 241 } 242 } 243 } 244 } 245 246 // invert all properties 247 for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) { 248 derivedProps[i].complement(); 249 } 250 251 // test all FALSE properties 252 for(i=0; i<UPRV_LENGTHOF(derivedPropsNames); ++i) { 253 rangeCount=derivedProps[i].getRangeCount(); 254 for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) { 255 start=derivedProps[i].getRangeStart(range); 256 end=derivedProps[i].getRangeEnd(range); 257 for(; start<=end; ++start) { 258 if(u_hasBinaryProperty(start, derivedPropsIndex[i])) { 259 errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==TRUE is wrong\n", start, derivedPropsNames[i]); 260 if(++numErrors[i]>=MAX_ERRORS) { 261 errln("Too many errors, moving to the next test"); 262 break; 263 } 264 } 265 } 266 } 267 } 268 #endif /* !UCONFIG_NO_NORMALIZATION */ 269 } 270 271 void UnicodeTest::TestBinaryValues() { 272 /* 273 * Unicode 5.1 explicitly defines binary property value aliases. 274 * Verify that they are all recognized. 275 */ 276 UErrorCode errorCode=U_ZERO_ERROR; 277 UnicodeSet alpha(UNICODE_STRING_SIMPLE("[:Alphabetic:]"), errorCode); 278 if(U_FAILURE(errorCode)) { 279 dataerrln("UnicodeSet([:Alphabetic:]) failed - %s", u_errorName(errorCode)); 280 return; 281 } 282 283 static const char *const falseValues[]={ "N", "No", "F", "False" }; 284 static const char *const trueValues[]={ "Y", "Yes", "T", "True" }; 285 int32_t i; 286 for(i=0; i<UPRV_LENGTHOF(falseValues); ++i) { 287 UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]"); 288 pattern.insert(pattern.length()-2, UnicodeString(falseValues[i], -1, US_INV)); 289 errorCode=U_ZERO_ERROR; 290 UnicodeSet set(pattern, errorCode); 291 if(U_FAILURE(errorCode)) { 292 errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", falseValues[i], u_errorName(errorCode)); 293 continue; 294 } 295 set.complement(); 296 if(set!=alpha) { 297 errln("UnicodeSet([:Alphabetic=%s:]).complement()!=UnicodeSet([:Alphabetic:])\n", falseValues[i]); 298 } 299 } 300 for(i=0; i<UPRV_LENGTHOF(trueValues); ++i) { 301 UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]"); 302 pattern.insert(pattern.length()-2, UnicodeString(trueValues[i], -1, US_INV)); 303 errorCode=U_ZERO_ERROR; 304 UnicodeSet set(pattern, errorCode); 305 if(U_FAILURE(errorCode)) { 306 errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", trueValues[i], u_errorName(errorCode)); 307 continue; 308 } 309 if(set!=alpha) { 310 errln("UnicodeSet([:Alphabetic=%s:])!=UnicodeSet([:Alphabetic:])\n", trueValues[i]); 311 } 312 } 313 } 314 315 void UnicodeTest::TestConsistency() { 316 #if !UCONFIG_NO_NORMALIZATION 317 /* 318 * Test for an example that getCanonStartSet() delivers 319 * all characters that compose from the input one, 320 * even in multiple steps. 321 * For example, the set for "I" (0049) should contain both 322 * I-diaeresis (00CF) and I-diaeresis-acute (1E2E). 323 * In general, the set for the middle such character should be a subset 324 * of the set for the first. 325 */ 326 IcuTestErrorCode errorCode(*this, "TestConsistency"); 327 const Normalizer2 *nfd=Normalizer2::getNFDInstance(errorCode); 328 const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode); 329 if(!nfcImpl->ensureCanonIterData(errorCode) || errorCode.isFailure()) { 330 dataerrln("Normalizer2::getInstance(NFD) or Normalizer2Factory::getNFCImpl() failed - %s\n", 331 errorCode.errorName()); 332 errorCode.reset(); 333 return; 334 } 335 336 UnicodeSet set1, set2; 337 if (nfcImpl->getCanonStartSet(0x49, set1)) { 338 /* enumerate all characters that are plausible to be latin letters */ 339 for(UChar start=0xa0; start<0x2000; ++start) { 340 UnicodeString decomp=nfd->normalize(UnicodeString(start), errorCode); 341 if(decomp.length()>1 && decomp[0]==0x49) { 342 set2.add(start); 343 } 344 } 345 346 if (set1!=set2) { 347 errln("[canon start set of 0049] != [all c with canon decomp with 0049]"); 348 } 349 // This was available in cucdtst.c but the test had to move to intltest 350 // because the new internal normalization functions are in C++. 351 //compareUSets(set1, set2, 352 // "[canon start set of 0049]", "[all c with canon decomp with 0049]", 353 // TRUE); 354 } else { 355 errln("NFC.getCanonStartSet() returned FALSE"); 356 } 357 #endif 358 } 359 360 /** 361 * Test various implementations of Pattern_Syntax & Pattern_White_Space. 362 */ 363 void UnicodeTest::TestPatternProperties() { 364 IcuTestErrorCode errorCode(*this, "TestPatternProperties()"); 365 UnicodeSet syn_pp; 366 UnicodeSet syn_prop(UNICODE_STRING_SIMPLE("[:Pattern_Syntax:]"), errorCode); 367 UnicodeSet syn_list( 368 "[!-/\\:-@\\[-\\^`\\{-~" 369 "\\u00A1-\\u00A7\\u00A9\\u00AB\\u00AC\\u00AE\\u00B0\\u00B1\\u00B6\\u00BB\\u00BF\\u00D7\\u00F7" 370 "\\u2010-\\u2027\\u2030-\\u203E\\u2041-\\u2053\\u2055-\\u205E\\u2190-\\u245F\\u2500-\\u2775" 371 "\\u2794-\\u2BFF\\u2E00-\\u2E7F\\u3001-\\u3003\\u3008-\\u3020\\u3030\\uFD3E\\uFD3F\\uFE45\\uFE46]", errorCode); 372 UnicodeSet ws_pp; 373 UnicodeSet ws_prop(UNICODE_STRING_SIMPLE("[:Pattern_White_Space:]"), errorCode); 374 UnicodeSet ws_list(UNICODE_STRING_SIMPLE("[\\u0009-\\u000D\\ \\u0085\\u200E\\u200F\\u2028\\u2029]"), errorCode); 375 UnicodeSet syn_ws_pp; 376 UnicodeSet syn_ws_prop(syn_prop); 377 syn_ws_prop.addAll(ws_prop); 378 for(UChar32 c=0; c<=0xffff; ++c) { 379 if(PatternProps::isSyntax(c)) { 380 syn_pp.add(c); 381 } 382 if(PatternProps::isWhiteSpace(c)) { 383 ws_pp.add(c); 384 } 385 if(PatternProps::isSyntaxOrWhiteSpace(c)) { 386 syn_ws_pp.add(c); 387 } 388 } 389 compareUSets(syn_pp, syn_prop, 390 "PatternProps.isSyntax()", "[:Pattern_Syntax:]", TRUE); 391 compareUSets(syn_pp, syn_list, 392 "PatternProps.isSyntax()", "[Pattern_Syntax ranges]", TRUE); 393 compareUSets(ws_pp, ws_prop, 394 "PatternProps.isWhiteSpace()", "[:Pattern_White_Space:]", TRUE); 395 compareUSets(ws_pp, ws_list, 396 "PatternProps.isWhiteSpace()", "[Pattern_White_Space ranges]", TRUE); 397 compareUSets(syn_ws_pp, syn_ws_prop, 398 "PatternProps.isSyntaxOrWhiteSpace()", 399 "[[:Pattern_Syntax:][:Pattern_White_Space:]]", TRUE); 400 } 401 402 // So far only minimal port of Java & cucdtst.c compareUSets(). 403 UBool 404 UnicodeTest::compareUSets(const UnicodeSet &a, const UnicodeSet &b, 405 const char *a_name, const char *b_name, 406 UBool diffIsError) { 407 UBool same= a==b; 408 if(!same && diffIsError) { 409 errln("Sets are different: %s vs. %s\n", a_name, b_name); 410 } 411 return same; 412 } 413 414 namespace { 415 416 /** 417 * Maps a special script code to the most common script of its encoded characters. 418 */ 419 UScriptCode getCharScript(UScriptCode script) { 420 switch(script) { 421 case USCRIPT_SIMPLIFIED_HAN: 422 case USCRIPT_TRADITIONAL_HAN: 423 return USCRIPT_HAN; 424 case USCRIPT_JAPANESE: 425 return USCRIPT_HIRAGANA; 426 case USCRIPT_KOREAN: 427 return USCRIPT_HANGUL; 428 default: 429 return script; 430 } 431 } 432 433 } // namespace 434 435 void UnicodeTest::TestScriptMetadata() { 436 IcuTestErrorCode errorCode(*this, "TestScriptMetadata()"); 437 UnicodeSet rtl("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]", errorCode); 438 // So far, sample characters are uppercase. 439 // Georgian is special. 440 UnicodeSet cased("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]", errorCode); 441 for(int32_t sci = 0; sci < USCRIPT_CODE_LIMIT; ++sci) { 442 UScriptCode sc = (UScriptCode)sci; 443 // Run the test with -v to see which script has failures: 444 // .../intltest$ make && ./intltest utility/UnicodeTest/TestScriptMetadata -v | grep -C 3 FAIL 445 logln(uscript_getShortName(sc)); 446 UScriptUsage usage = uscript_getUsage(sc); 447 UnicodeString sample = uscript_getSampleUnicodeString(sc); 448 UnicodeSet scriptSet; 449 scriptSet.applyIntPropertyValue(UCHAR_SCRIPT, sc, errorCode); 450 if(usage == USCRIPT_USAGE_NOT_ENCODED) { 451 assertTrue("not encoded, no sample", sample.isEmpty()); 452 assertFalse("not encoded, not RTL", uscript_isRightToLeft(sc)); 453 assertFalse("not encoded, not LB letters", uscript_breaksBetweenLetters(sc)); 454 assertFalse("not encoded, not cased", uscript_isCased(sc)); 455 assertTrue("not encoded, no characters", scriptSet.isEmpty()); 456 } else { 457 assertFalse("encoded, has a sample character", sample.isEmpty()); 458 UChar32 firstChar = sample.char32At(0); 459 UScriptCode charScript = getCharScript(sc); 460 assertEquals("script(sample(script))", 461 (int32_t)charScript, (int32_t)uscript_getScript(firstChar, errorCode)); 462 assertEquals("RTL vs. set", (UBool)rtl.contains(firstChar), (UBool)uscript_isRightToLeft(sc)); 463 assertEquals("cased vs. set", (UBool)cased.contains(firstChar), (UBool)uscript_isCased(sc)); 464 assertEquals("encoded, has characters", (UBool)(sc == charScript), (UBool)(!scriptSet.isEmpty())); 465 if(uscript_isRightToLeft(sc)) { 466 rtl.removeAll(scriptSet); 467 } 468 if(uscript_isCased(sc)) { 469 cased.removeAll(scriptSet); 470 } 471 } 472 } 473 UnicodeString pattern; 474 assertEquals("no remaining RTL characters", 475 UnicodeString("[]"), rtl.toPattern(pattern)); 476 assertEquals("no remaining cased characters", 477 UnicodeString("[]"), cased.toPattern(pattern)); 478 479 assertTrue("Hani breaks between letters", uscript_breaksBetweenLetters(USCRIPT_HAN)); 480 assertTrue("Thai breaks between letters", uscript_breaksBetweenLetters(USCRIPT_THAI)); 481 assertFalse("Latn does not break between letters", uscript_breaksBetweenLetters(USCRIPT_LATIN)); 482 } 483 484 void UnicodeTest::TestBidiPairedBracketType() { 485 // BidiBrackets-6.3.0.txt says: 486 // 487 // The set of code points listed in this file was originally derived 488 // using the character properties General_Category (gc), Bidi_Class (bc), 489 // Bidi_Mirrored (Bidi_M), and Bidi_Mirroring_Glyph (bmg), as follows: 490 // two characters, A and B, form a pair if A has gc=Ps and B has gc=Pe, 491 // both have bc=ON and Bidi_M=Y, and bmg of A is B. Bidi_Paired_Bracket 492 // maps A to B and vice versa, and their Bidi_Paired_Bracket_Type 493 // property values are Open and Close, respectively. 494 IcuTestErrorCode errorCode(*this, "TestBidiPairedBracketType()"); 495 UnicodeSet bpt("[:^bpt=n:]", errorCode); 496 assertTrue("bpt!=None is not empty", !bpt.isEmpty()); 497 // The following should always be true. 498 UnicodeSet mirrored("[:Bidi_M:]", errorCode); 499 UnicodeSet other_neutral("[:bc=ON:]", errorCode); 500 assertTrue("bpt!=None is a subset of Bidi_M", mirrored.containsAll(bpt)); 501 assertTrue("bpt!=None is a subset of bc=ON", other_neutral.containsAll(bpt)); 502 // The following are true at least initially in Unicode 6.3. 503 UnicodeSet bpt_open("[:bpt=o:]", errorCode); 504 UnicodeSet bpt_close("[:bpt=c:]", errorCode); 505 UnicodeSet ps("[:Ps:]", errorCode); 506 UnicodeSet pe("[:Pe:]", errorCode); 507 assertTrue("bpt=Open is a subset of Ps", ps.containsAll(bpt_open)); 508 assertTrue("bpt=Close is a subset of Pe", pe.containsAll(bpt_close)); 509 } 510