1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 1997-2011, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 7 #include "unicode/ustring.h" 8 #include "unicode/uchar.h" 9 #include "unicode/uniset.h" 10 #include "unicode/putil.h" 11 #include "cstring.h" 12 #include "hash.h" 13 #include "patternprops.h" 14 #include "normalizer2impl.h" 15 #include "uparse.h" 16 #include "ucdtest.h" 17 18 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof(array[0])) 19 20 static const char *ignorePropNames[]={ 21 "FC_NFKC", 22 "NFD_QC", 23 "NFC_QC", 24 "NFKD_QC", 25 "NFKC_QC", 26 "Expands_On_NFD", 27 "Expands_On_NFC", 28 "Expands_On_NFKD", 29 "Expands_On_NFKC", 30 "NFKC_CF" 31 }; 32 33 UnicodeTest::UnicodeTest() 34 { 35 UErrorCode errorCode=U_ZERO_ERROR; 36 unknownPropertyNames=new U_NAMESPACE_QUALIFIER Hashtable(errorCode); 37 if(U_FAILURE(errorCode)) { 38 delete unknownPropertyNames; 39 unknownPropertyNames=NULL; 40 } 41 // Ignore some property names altogether. 42 for(int32_t i=0; i<LENGTHOF(ignorePropNames); ++i) { 43 unknownPropertyNames->puti(UnicodeString(ignorePropNames[i], -1, US_INV), 1, errorCode); 44 } 45 } 46 47 UnicodeTest::~UnicodeTest() 48 { 49 delete unknownPropertyNames; 50 } 51 52 void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ ) 53 { 54 if(exec) { 55 logln("TestSuite UnicodeTest: "); 56 } 57 TESTCASE_AUTO_BEGIN; 58 TESTCASE_AUTO(TestAdditionalProperties); 59 TESTCASE_AUTO(TestBinaryValues); 60 TESTCASE_AUTO(TestConsistency); 61 TESTCASE_AUTO(TestPatternProperties); 62 TESTCASE_AUTO_END; 63 } 64 65 //==================================================== 66 // private data used by the tests 67 //==================================================== 68 69 // test DerivedCoreProperties.txt ------------------------------------------- 70 71 // copied from genprops.c 72 static int32_t 73 getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) { 74 const char *t, *z; 75 int32_t i, j; 76 77 s=u_skipWhitespace(s); 78 for(i=0; i<countTokens; ++i) { 79 t=tokens[i]; 80 if(t!=NULL) { 81 for(j=0;; ++j) { 82 if(t[j]!=0) { 83 if(s[j]!=t[j]) { 84 break; 85 } 86 } else { 87 z=u_skipWhitespace(s+j); 88 if(*z==';' || *z==0) { 89 return i; 90 } else { 91 break; 92 } 93 } 94 } 95 } 96 } 97 return -1; 98 } 99 100 static const char *const 101 derivedPropsNames[]={ 102 "Math", 103 "Alphabetic", 104 "Lowercase", 105 "Uppercase", 106 "ID_Start", 107 "ID_Continue", 108 "XID_Start", 109 "XID_Continue", 110 "Default_Ignorable_Code_Point", 111 "Full_Composition_Exclusion", 112 "Grapheme_Extend", 113 "Grapheme_Link", /* Unicode 5 moves this property here from PropList.txt */ 114 "Grapheme_Base", 115 "Cased", 116 "Case_Ignorable", 117 "Changes_When_Lowercased", 118 "Changes_When_Uppercased", 119 "Changes_When_Titlecased", 120 "Changes_When_Casefolded", 121 "Changes_When_Casemapped", 122 "Changes_When_NFKC_Casefolded" 123 }; 124 125 static const UProperty 126 derivedPropsIndex[]={ 127 UCHAR_MATH, 128 UCHAR_ALPHABETIC, 129 UCHAR_LOWERCASE, 130 UCHAR_UPPERCASE, 131 UCHAR_ID_START, 132 UCHAR_ID_CONTINUE, 133 UCHAR_XID_START, 134 UCHAR_XID_CONTINUE, 135 UCHAR_DEFAULT_IGNORABLE_CODE_POINT, 136 UCHAR_FULL_COMPOSITION_EXCLUSION, 137 UCHAR_GRAPHEME_EXTEND, 138 UCHAR_GRAPHEME_LINK, 139 UCHAR_GRAPHEME_BASE, 140 UCHAR_CASED, 141 UCHAR_CASE_IGNORABLE, 142 UCHAR_CHANGES_WHEN_LOWERCASED, 143 UCHAR_CHANGES_WHEN_UPPERCASED, 144 UCHAR_CHANGES_WHEN_TITLECASED, 145 UCHAR_CHANGES_WHEN_CASEFOLDED, 146 UCHAR_CHANGES_WHEN_CASEMAPPED, 147 UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED 148 }; 149 150 static int32_t numErrors[LENGTHOF(derivedPropsIndex)]={ 0 }; 151 152 enum { MAX_ERRORS=50 }; 153 154 U_CFUNC void U_CALLCONV 155 derivedPropsLineFn(void *context, 156 char *fields[][2], int32_t /* fieldCount */, 157 UErrorCode *pErrorCode) 158 { 159 UnicodeTest *me=(UnicodeTest *)context; 160 uint32_t start, end; 161 int32_t i; 162 163 u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); 164 if(U_FAILURE(*pErrorCode)) { 165 me->errln("UnicodeTest: syntax error in DerivedCoreProperties.txt or DerivedNormalizationProps.txt field 0 at %s\n", fields[0][0]); 166 return; 167 } 168 169 /* parse derived binary property name, ignore unknown names */ 170 i=getTokenIndex(derivedPropsNames, LENGTHOF(derivedPropsNames), fields[1][0]); 171 if(i<0) { 172 UnicodeString propName(fields[1][0], (int32_t)(fields[1][1]-fields[1][0])); 173 propName.trim(); 174 if(me->unknownPropertyNames->find(propName)==NULL) { 175 UErrorCode errorCode=U_ZERO_ERROR; 176 me->unknownPropertyNames->puti(propName, 1, errorCode); 177 me->errln("UnicodeTest warning: unknown property name '%s' in DerivedCoreProperties.txt or DerivedNormalizationProps.txt\n", fields[1][0]); 178 } 179 return; 180 } 181 182 me->derivedProps[i].add(start, end); 183 } 184 185 void UnicodeTest::TestAdditionalProperties() { 186 #if !UCONFIG_NO_NORMALIZATION 187 // test DerivedCoreProperties.txt and DerivedNormalizationProps.txt 188 if(LENGTHOF(derivedProps)<LENGTHOF(derivedPropsNames)) { 189 errln("error: UnicodeTest::derivedProps[] too short, need at least %d UnicodeSets\n", 190 LENGTHOF(derivedPropsNames)); 191 return; 192 } 193 if(LENGTHOF(derivedPropsIndex)!=LENGTHOF(derivedPropsNames)) { 194 errln("error in ucdtest.cpp: LENGTHOF(derivedPropsIndex)!=LENGTHOF(derivedPropsNames)\n"); 195 return; 196 } 197 198 char newPath[256]; 199 char backupPath[256]; 200 char *fields[2][2]; 201 UErrorCode errorCode=U_ZERO_ERROR; 202 203 /* Look inside ICU_DATA first */ 204 strcpy(newPath, pathToDataDirectory()); 205 strcat(newPath, "unidata" U_FILE_SEP_STRING "DerivedCoreProperties.txt"); 206 207 // As a fallback, try to guess where the source data was located 208 // at the time ICU was built, and look there. 209 # ifdef U_TOPSRCDIR 210 strcpy(backupPath, U_TOPSRCDIR U_FILE_SEP_STRING "data"); 211 # else 212 strcpy(backupPath, loadTestData(errorCode)); 213 strcat(backupPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING "data"); 214 # endif 215 strcat(backupPath, U_FILE_SEP_STRING); 216 strcat(backupPath, "unidata" U_FILE_SEP_STRING "DerivedCoreProperties.txt"); 217 218 char *path=newPath; 219 u_parseDelimitedFile(newPath, ';', fields, 2, derivedPropsLineFn, this, &errorCode); 220 221 if(errorCode==U_FILE_ACCESS_ERROR) { 222 errorCode=U_ZERO_ERROR; 223 path=backupPath; 224 u_parseDelimitedFile(backupPath, ';', fields, 2, derivedPropsLineFn, this, &errorCode); 225 } 226 if(U_FAILURE(errorCode)) { 227 errln("error parsing DerivedCoreProperties.txt: %s\n", u_errorName(errorCode)); 228 return; 229 } 230 char *basename=path+strlen(path)-strlen("DerivedCoreProperties.txt"); 231 strcpy(basename, "DerivedNormalizationProps.txt"); 232 u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode); 233 if(U_FAILURE(errorCode)) { 234 errln("error parsing DerivedNormalizationProps.txt: %s\n", u_errorName(errorCode)); 235 return; 236 } 237 238 // now we have all derived core properties in the UnicodeSets 239 // run them all through the API 240 int32_t rangeCount, range; 241 uint32_t i; 242 UChar32 start, end; 243 244 // test all TRUE properties 245 for(i=0; i<LENGTHOF(derivedPropsNames); ++i) { 246 rangeCount=derivedProps[i].getRangeCount(); 247 for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) { 248 start=derivedProps[i].getRangeStart(range); 249 end=derivedProps[i].getRangeEnd(range); 250 for(; start<=end; ++start) { 251 if(!u_hasBinaryProperty(start, derivedPropsIndex[i])) { 252 dataerrln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==FALSE is wrong", start, derivedPropsNames[i]); 253 if(++numErrors[i]>=MAX_ERRORS) { 254 dataerrln("Too many errors, moving to the next test"); 255 break; 256 } 257 } 258 } 259 } 260 } 261 262 // invert all properties 263 for(i=0; i<LENGTHOF(derivedPropsNames); ++i) { 264 derivedProps[i].complement(); 265 } 266 267 // test all FALSE properties 268 for(i=0; i<LENGTHOF(derivedPropsNames); ++i) { 269 rangeCount=derivedProps[i].getRangeCount(); 270 for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) { 271 start=derivedProps[i].getRangeStart(range); 272 end=derivedProps[i].getRangeEnd(range); 273 for(; start<=end; ++start) { 274 if(u_hasBinaryProperty(start, derivedPropsIndex[i])) { 275 errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==TRUE is wrong\n", start, derivedPropsNames[i]); 276 if(++numErrors[i]>=MAX_ERRORS) { 277 errln("Too many errors, moving to the next test"); 278 break; 279 } 280 } 281 } 282 } 283 } 284 #endif /* !UCONFIG_NO_NORMALIZATION */ 285 } 286 287 void UnicodeTest::TestBinaryValues() { 288 /* 289 * Unicode 5.1 explicitly defines binary property value aliases. 290 * Verify that they are all recognized. 291 */ 292 UErrorCode errorCode=U_ZERO_ERROR; 293 UnicodeSet alpha(UNICODE_STRING_SIMPLE("[:Alphabetic:]"), errorCode); 294 if(U_FAILURE(errorCode)) { 295 dataerrln("UnicodeSet([:Alphabetic:]) failed - %s", u_errorName(errorCode)); 296 return; 297 } 298 299 static const char *const falseValues[]={ "N", "No", "F", "False" }; 300 static const char *const trueValues[]={ "Y", "Yes", "T", "True" }; 301 int32_t i; 302 for(i=0; i<LENGTHOF(falseValues); ++i) { 303 UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]"); 304 pattern.insert(pattern.length()-2, UnicodeString(falseValues[i], -1, US_INV)); 305 errorCode=U_ZERO_ERROR; 306 UnicodeSet set(pattern, errorCode); 307 if(U_FAILURE(errorCode)) { 308 errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", falseValues[i], u_errorName(errorCode)); 309 continue; 310 } 311 set.complement(); 312 if(set!=alpha) { 313 errln("UnicodeSet([:Alphabetic=%s:]).complement()!=UnicodeSet([:Alphabetic:])\n", falseValues[i]); 314 } 315 } 316 for(i=0; i<LENGTHOF(trueValues); ++i) { 317 UnicodeString pattern=UNICODE_STRING_SIMPLE("[:Alphabetic=:]"); 318 pattern.insert(pattern.length()-2, UnicodeString(trueValues[i], -1, US_INV)); 319 errorCode=U_ZERO_ERROR; 320 UnicodeSet set(pattern, errorCode); 321 if(U_FAILURE(errorCode)) { 322 errln("UnicodeSet([:Alphabetic=%s:]) failed - %s\n", trueValues[i], u_errorName(errorCode)); 323 continue; 324 } 325 if(set!=alpha) { 326 errln("UnicodeSet([:Alphabetic=%s:])!=UnicodeSet([:Alphabetic:])\n", trueValues[i]); 327 } 328 } 329 } 330 331 void UnicodeTest::TestConsistency() { 332 #if !UCONFIG_NO_NORMALIZATION 333 /* 334 * Test for an example that getCanonStartSet() delivers 335 * all characters that compose from the input one, 336 * even in multiple steps. 337 * For example, the set for "I" (0049) should contain both 338 * I-diaeresis (00CF) and I-diaeresis-acute (1E2E). 339 * In general, the set for the middle such character should be a subset 340 * of the set for the first. 341 */ 342 IcuTestErrorCode errorCode(*this, "TestConsistency"); 343 const Normalizer2 *nfd=Normalizer2::getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode); 344 const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode); 345 if(!nfcImpl->ensureCanonIterData(errorCode) || errorCode.isFailure()) { 346 dataerrln("Normalizer2::getInstance(NFD) or Normalizer2Factory::getNFCImpl() failed - %s\n", 347 errorCode.errorName()); 348 errorCode.reset(); 349 return; 350 } 351 352 UnicodeSet set1, set2; 353 if (nfcImpl->getCanonStartSet(0x49, set1)) { 354 /* enumerate all characters that are plausible to be latin letters */ 355 for(UChar start=0xa0; start<0x2000; ++start) { 356 UnicodeString decomp=nfd->normalize(UnicodeString(start), errorCode); 357 if(decomp.length()>1 && decomp[0]==0x49) { 358 set2.add(start); 359 } 360 } 361 362 if (set1!=set2) { 363 errln("[canon start set of 0049] != [all c with canon decomp with 0049]"); 364 } 365 // This was available in cucdtst.c but the test had to move to intltest 366 // because the new internal normalization functions are in C++. 367 //compareUSets(set1, set2, 368 // "[canon start set of 0049]", "[all c with canon decomp with 0049]", 369 // TRUE); 370 } else { 371 errln("NFC.getCanonStartSet() returned FALSE"); 372 } 373 #endif 374 } 375 376 /** 377 * Test various implementations of Pattern_Syntax & Pattern_White_Space. 378 */ 379 void UnicodeTest::TestPatternProperties() { 380 IcuTestErrorCode errorCode(*this, "TestPatternProperties()"); 381 UnicodeSet syn_pp; 382 UnicodeSet syn_prop(UNICODE_STRING_SIMPLE("[:Pattern_Syntax:]"), errorCode); 383 UnicodeSet syn_list( 384 "[!-/\\:-@\\[-\\^`\\{-~" 385 "\\u00A1-\\u00A7\\u00A9\\u00AB\\u00AC\\u00AE\\u00B0\\u00B1\\u00B6\\u00BB\\u00BF\\u00D7\\u00F7" 386 "\\u2010-\\u2027\\u2030-\\u203E\\u2041-\\u2053\\u2055-\\u205E\\u2190-\\u245F\\u2500-\\u2775" 387 "\\u2794-\\u2BFF\\u2E00-\\u2E7F\\u3001-\\u3003\\u3008-\\u3020\\u3030\\uFD3E\\uFD3F\\uFE45\\uFE46]", errorCode); 388 UnicodeSet ws_pp; 389 UnicodeSet ws_prop(UNICODE_STRING_SIMPLE("[:Pattern_White_Space:]"), errorCode); 390 UnicodeSet ws_list(UNICODE_STRING_SIMPLE("[\\u0009-\\u000D\\ \\u0085\\u200E\\u200F\\u2028\\u2029]"), errorCode); 391 UnicodeSet syn_ws_pp; 392 UnicodeSet syn_ws_prop(syn_prop); 393 syn_ws_prop.addAll(ws_prop); 394 for(UChar32 c=0; c<=0xffff; ++c) { 395 if(PatternProps::isSyntax(c)) { 396 syn_pp.add(c); 397 } 398 if(PatternProps::isWhiteSpace(c)) { 399 ws_pp.add(c); 400 } 401 if(PatternProps::isSyntaxOrWhiteSpace(c)) { 402 syn_ws_pp.add(c); 403 } 404 } 405 compareUSets(syn_pp, syn_prop, 406 "PatternProps.isSyntax()", "[:Pattern_Syntax:]", TRUE); 407 compareUSets(syn_pp, syn_list, 408 "PatternProps.isSyntax()", "[Pattern_Syntax ranges]", TRUE); 409 compareUSets(ws_pp, ws_prop, 410 "PatternProps.isWhiteSpace()", "[:Pattern_White_Space:]", TRUE); 411 compareUSets(ws_pp, ws_list, 412 "PatternProps.isWhiteSpace()", "[Pattern_White_Space ranges]", TRUE); 413 compareUSets(syn_ws_pp, syn_ws_prop, 414 "PatternProps.isSyntaxOrWhiteSpace()", 415 "[[:Pattern_Syntax:][:Pattern_White_Space:]]", TRUE); 416 } 417 418 // So far only minimal port of Java & cucdtst.c compareUSets(). 419 UBool 420 UnicodeTest::compareUSets(const UnicodeSet &a, const UnicodeSet &b, 421 const char *a_name, const char *b_name, 422 UBool diffIsError) { 423 UBool same= a==b; 424 if(!same && diffIsError) { 425 errln("Sets are different: %s vs. %s\n", a_name, b_name); 426 } 427 return same; 428 } 429