1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 1997-2012, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 /******************************************************************************* 7 * 8 * File CUCDTST.C 9 * 10 * Modification History: 11 * Name Description 12 * Madhu Katragadda Ported for C API, added tests for string functions 13 ******************************************************************************** 14 */ 15 16 #include <string.h> 17 #include <math.h> 18 #include <stdlib.h> 19 20 #include "unicode/utypes.h" 21 #include "unicode/uchar.h" 22 #include "unicode/putil.h" 23 #include "unicode/ustring.h" 24 #include "unicode/uloc.h" 25 #include "unicode/unorm2.h" 26 27 #include "cintltst.h" 28 #include "putilimp.h" 29 #include "uparse.h" 30 #include "ucase.h" 31 #include "ubidi_props.h" 32 #include "uprops.h" 33 #include "uset_imp.h" 34 #include "usc_impl.h" 35 #include "udatamem.h" /* for testing ucase_openBinary() */ 36 #include "cucdapi.h" 37 38 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 39 40 /* prototypes --------------------------------------------------------------- */ 41 42 static void TestUpperLower(void); 43 static void TestLetterNumber(void); 44 static void TestMisc(void); 45 static void TestPOSIX(void); 46 static void TestControlPrint(void); 47 static void TestIdentifier(void); 48 static void TestUnicodeData(void); 49 static void TestCodeUnit(void); 50 static void TestCodePoint(void); 51 static void TestCharLength(void); 52 static void TestCharNames(void); 53 static void TestMirroring(void); 54 static void TestUScriptRunAPI(void); 55 static void TestAdditionalProperties(void); 56 static void TestNumericProperties(void); 57 static void TestPropertyNames(void); 58 static void TestPropertyValues(void); 59 static void TestConsistency(void); 60 static void TestUCase(void); 61 static void TestUBiDiProps(void); 62 static void TestCaseFolding(void); 63 64 /* internal methods used */ 65 static int32_t MakeProp(char* str); 66 static int32_t MakeDir(char* str); 67 68 /* helpers ------------------------------------------------------------------ */ 69 70 static void 71 parseUCDFile(const char *filename, 72 char *fields[][2], int32_t fieldCount, 73 UParseLineFn *lineFn, void *context, 74 UErrorCode *pErrorCode) { 75 char path[256]; 76 char backupPath[256]; 77 78 if(U_FAILURE(*pErrorCode)) { 79 return; 80 } 81 82 /* Look inside ICU_DATA first */ 83 strcpy(path, u_getDataDirectory()); 84 strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING); 85 strcat(path, filename); 86 87 /* As a fallback, try to guess where the source data was located 88 * at the time ICU was built, and look there. 89 */ 90 strcpy(backupPath, ctest_dataSrcDir()); 91 strcat(backupPath, U_FILE_SEP_STRING); 92 strcat(backupPath, "unidata" U_FILE_SEP_STRING); 93 strcat(backupPath, filename); 94 95 u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode); 96 if(*pErrorCode==U_FILE_ACCESS_ERROR) { 97 *pErrorCode=U_ZERO_ERROR; 98 u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode); 99 } 100 if(U_FAILURE(*pErrorCode)) { 101 log_err_status(*pErrorCode, "error parsing %s: %s\n", filename, u_errorName(*pErrorCode)); 102 } 103 } 104 105 /* test data ---------------------------------------------------------------- */ 106 107 static const UChar LAST_CHAR_CODE_IN_FILE = 0xFFFD; 108 static const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf"; 109 static const int32_t tagValues[] = 110 { 111 /* Mn */ U_NON_SPACING_MARK, 112 /* Mc */ U_COMBINING_SPACING_MARK, 113 /* Me */ U_ENCLOSING_MARK, 114 /* Nd */ U_DECIMAL_DIGIT_NUMBER, 115 /* Nl */ U_LETTER_NUMBER, 116 /* No */ U_OTHER_NUMBER, 117 /* Zs */ U_SPACE_SEPARATOR, 118 /* Zl */ U_LINE_SEPARATOR, 119 /* Zp */ U_PARAGRAPH_SEPARATOR, 120 /* Cc */ U_CONTROL_CHAR, 121 /* Cf */ U_FORMAT_CHAR, 122 /* Cs */ U_SURROGATE, 123 /* Co */ U_PRIVATE_USE_CHAR, 124 /* Cn */ U_UNASSIGNED, 125 /* Lu */ U_UPPERCASE_LETTER, 126 /* Ll */ U_LOWERCASE_LETTER, 127 /* Lt */ U_TITLECASE_LETTER, 128 /* Lm */ U_MODIFIER_LETTER, 129 /* Lo */ U_OTHER_LETTER, 130 /* Pc */ U_CONNECTOR_PUNCTUATION, 131 /* Pd */ U_DASH_PUNCTUATION, 132 /* Ps */ U_START_PUNCTUATION, 133 /* Pe */ U_END_PUNCTUATION, 134 /* Po */ U_OTHER_PUNCTUATION, 135 /* Sm */ U_MATH_SYMBOL, 136 /* Sc */ U_CURRENCY_SYMBOL, 137 /* Sk */ U_MODIFIER_SYMBOL, 138 /* So */ U_OTHER_SYMBOL, 139 /* Pi */ U_INITIAL_PUNCTUATION, 140 /* Pf */ U_FINAL_PUNCTUATION 141 }; 142 143 static const char dirStrings[][5] = { 144 "L", 145 "R", 146 "EN", 147 "ES", 148 "ET", 149 "AN", 150 "CS", 151 "B", 152 "S", 153 "WS", 154 "ON", 155 "LRE", 156 "LRO", 157 "AL", 158 "RLE", 159 "RLO", 160 "PDF", 161 "NSM", 162 "BN" 163 }; 164 165 void addUnicodeTest(TestNode** root); 166 167 void addUnicodeTest(TestNode** root) 168 { 169 addTest(root, &TestCodeUnit, "tsutil/cucdtst/TestCodeUnit"); 170 addTest(root, &TestCodePoint, "tsutil/cucdtst/TestCodePoint"); 171 addTest(root, &TestCharLength, "tsutil/cucdtst/TestCharLength"); 172 addTest(root, &TestBinaryValues, "tsutil/cucdtst/TestBinaryValues"); 173 addTest(root, &TestUnicodeData, "tsutil/cucdtst/TestUnicodeData"); 174 addTest(root, &TestAdditionalProperties, "tsutil/cucdtst/TestAdditionalProperties"); 175 addTest(root, &TestNumericProperties, "tsutil/cucdtst/TestNumericProperties"); 176 addTest(root, &TestUpperLower, "tsutil/cucdtst/TestUpperLower"); 177 addTest(root, &TestLetterNumber, "tsutil/cucdtst/TestLetterNumber"); 178 addTest(root, &TestMisc, "tsutil/cucdtst/TestMisc"); 179 addTest(root, &TestPOSIX, "tsutil/cucdtst/TestPOSIX"); 180 addTest(root, &TestControlPrint, "tsutil/cucdtst/TestControlPrint"); 181 addTest(root, &TestIdentifier, "tsutil/cucdtst/TestIdentifier"); 182 addTest(root, &TestCharNames, "tsutil/cucdtst/TestCharNames"); 183 addTest(root, &TestMirroring, "tsutil/cucdtst/TestMirroring"); 184 addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI"); 185 addTest(root, &TestHasScript, "tsutil/cucdtst/TestHasScript"); 186 addTest(root, &TestGetScriptExtensions, "tsutil/cucdtst/TestGetScriptExtensions"); 187 addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI"); 188 addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames"); 189 addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues"); 190 addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency"); 191 addTest(root, &TestUCase, "tsutil/cucdtst/TestUCase"); 192 addTest(root, &TestUBiDiProps, "tsutil/cucdtst/TestUBiDiProps"); 193 addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding"); 194 } 195 196 /*==================================================== */ 197 /* test u_toupper() and u_tolower() */ 198 /*==================================================== */ 199 static void TestUpperLower() 200 { 201 const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000}; 202 const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000}; 203 U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21); 204 U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21); 205 int32_t i; 206 207 U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21); 208 U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21); 209 210 /* 211 Checks LetterLike Symbols which were previously a source of confusion 212 [Bertrand A. D. 02/04/98] 213 */ 214 for (i=0x2100;i<0x2138;i++) 215 { 216 /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */ 217 if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132) 218 { 219 if (i != (int)u_tolower(i)) /* itself */ 220 log_err("Failed case conversion with itself: U+%04x\n", i); 221 if (i != (int)u_toupper(i)) 222 log_err("Failed case conversion with itself: U+%04x\n", i); 223 } 224 } 225 226 for(i=0; i < u_strlen(upper); i++){ 227 if(u_tolower(upper[i]) != lower[i]){ 228 log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i])); 229 } 230 } 231 232 log_verbose("testing upper lower\n"); 233 for (i = 0; i < 21; i++) { 234 235 if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i])) 236 { 237 log_err("Failed isLowerCase test at %c\n", upperTest[i]); 238 } 239 else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i])) 240 { 241 log_err("Failed isUpperCase test at %c\n", lowerTest[i]); 242 } 243 else if (upperTest[i] != u_tolower(lowerTest[i])) 244 { 245 log_err("Failed case conversion from %c To %c :\n", lowerTest[i], upperTest[i]); 246 } 247 else if (lowerTest[i] != u_toupper(upperTest[i])) 248 { 249 log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]); 250 } 251 else if (upperTest[i] != u_tolower(upperTest[i])) 252 { 253 log_err("Failed case conversion with itself: %c\n", upperTest[i]); 254 } 255 else if (lowerTest[i] != u_toupper(lowerTest[i])) 256 { 257 log_err("Failed case conversion with itself: %c\n", lowerTest[i]); 258 } 259 } 260 log_verbose("done testing upper lower\n"); 261 262 log_verbose("testing u_istitle\n"); 263 { 264 static const UChar expected[] = { 265 0x1F88, 266 0x1F89, 267 0x1F8A, 268 0x1F8B, 269 0x1F8C, 270 0x1F8D, 271 0x1F8E, 272 0x1F8F, 273 0x1F88, 274 0x1F89, 275 0x1F8A, 276 0x1F8B, 277 0x1F8C, 278 0x1F8D, 279 0x1F8E, 280 0x1F8F, 281 0x1F98, 282 0x1F99, 283 0x1F9A, 284 0x1F9B, 285 0x1F9C, 286 0x1F9D, 287 0x1F9E, 288 0x1F9F, 289 0x1F98, 290 0x1F99, 291 0x1F9A, 292 0x1F9B, 293 0x1F9C, 294 0x1F9D, 295 0x1F9E, 296 0x1F9F, 297 0x1FA8, 298 0x1FA9, 299 0x1FAA, 300 0x1FAB, 301 0x1FAC, 302 0x1FAD, 303 0x1FAE, 304 0x1FAF, 305 0x1FA8, 306 0x1FA9, 307 0x1FAA, 308 0x1FAB, 309 0x1FAC, 310 0x1FAD, 311 0x1FAE, 312 0x1FAF, 313 0x1FBC, 314 0x1FBC, 315 0x1FCC, 316 0x1FCC, 317 0x1FFC, 318 0x1FFC, 319 }; 320 int32_t num = sizeof(expected)/sizeof(expected[0]); 321 for(i=0; i<num; i++){ 322 if(!u_istitle(expected[i])){ 323 log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]); 324 } 325 } 326 327 } 328 } 329 330 /* compare two sets and verify that their difference or intersection is empty */ 331 static UBool 332 showADiffB(const USet *a, const USet *b, 333 const char *a_name, const char *b_name, 334 UBool expect, UBool diffIsError) { 335 USet *aa; 336 int32_t i, start, end, length; 337 UErrorCode errorCode; 338 339 /* 340 * expect: 341 * TRUE -> a-b should be empty, that is, b should contain all of a 342 * FALSE -> a&b should be empty, that is, a should contain none of b (and vice versa) 343 */ 344 if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) { 345 return TRUE; 346 } 347 348 /* clone a to aa because a is const */ 349 aa=uset_open(1, 0); 350 if(aa==NULL) { 351 /* unusual problem - out of memory? */ 352 return FALSE; 353 } 354 uset_addAll(aa, a); 355 356 /* compute the set in question */ 357 if(expect) { 358 /* a-b */ 359 uset_removeAll(aa, b); 360 } else { 361 /* a&b */ 362 uset_retainAll(aa, b); 363 } 364 365 /* aa is not empty because of the initial tests above; show its contents */ 366 errorCode=U_ZERO_ERROR; 367 i=0; 368 for(;;) { 369 length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode); 370 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) { 371 break; /* done */ 372 } 373 if(U_FAILURE(errorCode)) { 374 log_err("error comparing %s with %s at difference item %d: %s\n", 375 a_name, b_name, i, u_errorName(errorCode)); 376 break; 377 } 378 if(length!=0) { 379 break; /* done with code points, got a string or -1 */ 380 } 381 382 if(diffIsError) { 383 if(expect) { 384 log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name); 385 } else { 386 log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end); 387 } 388 } else { 389 if(expect) { 390 log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name); 391 } else { 392 log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end); 393 } 394 } 395 396 ++i; 397 } 398 399 uset_close(aa); 400 return FALSE; 401 } 402 403 static UBool 404 showAMinusB(const USet *a, const USet *b, 405 const char *a_name, const char *b_name, 406 UBool diffIsError) { 407 return showADiffB(a, b, a_name, b_name, TRUE, diffIsError); 408 } 409 410 static UBool 411 showAIntersectB(const USet *a, const USet *b, 412 const char *a_name, const char *b_name, 413 UBool diffIsError) { 414 return showADiffB(a, b, a_name, b_name, FALSE, diffIsError); 415 } 416 417 static UBool 418 compareUSets(const USet *a, const USet *b, 419 const char *a_name, const char *b_name, 420 UBool diffIsError) { 421 /* 422 * Use an arithmetic & not a logical && so that both branches 423 * are always taken and all differences are shown. 424 */ 425 return 426 showAMinusB(a, b, a_name, b_name, diffIsError) & 427 showAMinusB(b, a, b_name, a_name, diffIsError); 428 } 429 430 /* test isLetter(u_isapha()) and isDigit(u_isdigit()) */ 431 static void TestLetterNumber() 432 { 433 UChar i = 0x0000; 434 435 log_verbose("Testing for isalpha\n"); 436 for (i = 0x0041; i < 0x005B; i++) { 437 if (!u_isalpha(i)) 438 { 439 log_err("Failed isLetter test at %.4X\n", i); 440 } 441 } 442 for (i = 0x0660; i < 0x066A; i++) { 443 if (u_isalpha(i)) 444 { 445 log_err("Failed isLetter test with numbers at %.4X\n", i); 446 } 447 } 448 449 log_verbose("Testing for isdigit\n"); 450 for (i = 0x0660; i < 0x066A; i++) { 451 if (!u_isdigit(i)) 452 { 453 log_verbose("Failed isNumber test at %.4X\n", i); 454 } 455 } 456 457 log_verbose("Testing for isalnum\n"); 458 for (i = 0x0041; i < 0x005B; i++) { 459 if (!u_isalnum(i)) 460 { 461 log_err("Failed isAlNum test at %.4X\n", i); 462 } 463 } 464 for (i = 0x0660; i < 0x066A; i++) { 465 if (!u_isalnum(i)) 466 { 467 log_err("Failed isAlNum test at %.4X\n", i); 468 } 469 } 470 471 { 472 /* 473 * The following checks work only starting from Unicode 4.0. 474 * Check the version number here. 475 */ 476 static UVersionInfo u401={ 4, 0, 1, 0 }; 477 UVersionInfo version; 478 u_getUnicodeVersion(version); 479 if(version[0]<4 || 0==memcmp(version, u401, 4)) { 480 return; 481 } 482 } 483 484 { 485 /* 486 * Sanity check: 487 * Verify that exactly the digit characters have decimal digit values. 488 * This assumption is used in the implementation of u_digit() 489 * (which checks nt=de) 490 * compared with the parallel java.lang.Character.digit() 491 * (which checks Nd). 492 * 493 * This was not true in Unicode 3.2 and earlier. 494 * Unicode 4.0 fixed discrepancies. 495 * Unicode 4.0.1 re-introduced problems in this area due to an 496 * unintentionally incomplete last-minute change. 497 */ 498 U_STRING_DECL(digitsPattern, "[:Nd:]", 6); 499 U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24); 500 501 USet *digits, *decimalValues; 502 UErrorCode errorCode; 503 504 U_STRING_INIT(digitsPattern, "[:Nd:]", 6); 505 U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24); 506 errorCode=U_ZERO_ERROR; 507 digits=uset_openPattern(digitsPattern, 6, &errorCode); 508 decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode); 509 510 if(U_SUCCESS(errorCode)) { 511 compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE); 512 } 513 514 uset_close(digits); 515 uset_close(decimalValues); 516 } 517 } 518 519 static void testSampleCharProps(UBool propFn(UChar32), const char *propName, 520 const UChar32 *sampleChars, int32_t sampleCharsLength, 521 UBool expected) { 522 int32_t i; 523 for (i = 0; i < sampleCharsLength; ++i) { 524 UBool result = propFn(sampleChars[i]); 525 if (result != expected) { 526 log_err("error: character property function %s(U+%04x)=%d is wrong\n", 527 propName, sampleChars[i], result); 528 } 529 } 530 } 531 532 /* Tests for isDefined(u_isdefined)(, isBaseForm(u_isbase()), isSpaceChar(u_isspace()), isWhiteSpace(), u_CharDigitValue() */ 533 static void TestMisc() 534 { 535 static const UChar32 sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005}; 536 static const UChar32 sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74}; 537 static const UChar32 sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6e}; 538 static const UChar32 sampleDefined[] = {0x523E, 0x4f88, 0xfffd}; 539 static const UChar32 sampleBase[] = {0x0061, 0x0031, 0x03d2}; 540 static const UChar32 sampleNonBase[] = {0x002B, 0x0020, 0x203B}; 541 /* static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/ 542 static const UChar32 sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5}; 543 static const UChar32 sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE}; 544 static const UChar32 sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c}; 545 static const UChar32 sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f, 0x85, 0x2007, 0xffef}; 546 547 static const int32_t sampleDigitValues[] = {0, 2, 3, 5}; 548 549 uint32_t mask; 550 551 int32_t i; 552 char icuVersion[U_MAX_VERSION_STRING_LENGTH]; 553 UVersionInfo realVersion; 554 555 memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH); 556 557 testSampleCharProps(u_isspace, "u_isspace", sampleSpaces, LENGTHOF(sampleSpaces), TRUE); 558 testSampleCharProps(u_isspace, "u_isspace", sampleNonSpaces, LENGTHOF(sampleNonSpaces), FALSE); 559 560 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar", 561 sampleSpaces, LENGTHOF(sampleSpaces), TRUE); 562 testSampleCharProps(u_isJavaSpaceChar, "u_isJavaSpaceChar", 563 sampleNonSpaces, LENGTHOF(sampleNonSpaces), FALSE); 564 565 testSampleCharProps(u_isWhitespace, "u_isWhitespace", 566 sampleWhiteSpaces, LENGTHOF(sampleWhiteSpaces), TRUE); 567 testSampleCharProps(u_isWhitespace, "u_isWhitespace", 568 sampleNonWhiteSpaces, LENGTHOF(sampleNonWhiteSpaces), FALSE); 569 570 testSampleCharProps(u_isdefined, "u_isdefined", 571 sampleDefined, LENGTHOF(sampleDefined), TRUE); 572 testSampleCharProps(u_isdefined, "u_isdefined", 573 sampleUndefined, LENGTHOF(sampleUndefined), FALSE); 574 575 testSampleCharProps(u_isbase, "u_isbase", sampleBase, LENGTHOF(sampleBase), TRUE); 576 testSampleCharProps(u_isbase, "u_isbase", sampleNonBase, LENGTHOF(sampleNonBase), FALSE); 577 578 testSampleCharProps(u_isdigit, "u_isdigit", sampleDigits, LENGTHOF(sampleDigits), TRUE); 579 testSampleCharProps(u_isdigit, "u_isdigit", sampleNonDigits, LENGTHOF(sampleNonDigits), FALSE); 580 581 for (i = 0; i < LENGTHOF(sampleDigits); i++) { 582 if (u_charDigitValue(sampleDigits[i]) != sampleDigitValues[i]) { 583 log_err("error: u_charDigitValue(U+04x)=%d != %d\n", 584 sampleDigits[i], u_charDigitValue(sampleDigits[i]), sampleDigitValues[i]); 585 } 586 } 587 588 /* Tests the ICU version #*/ 589 u_getVersion(realVersion); 590 u_versionToString(realVersion, icuVersion); 591 if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0) 592 { 593 log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion); 594 } 595 #if defined(ICU_VERSION) 596 /* test only happens where we have configure.in with VERSION - sanity check. */ 597 if(strcmp(U_ICU_VERSION, ICU_VERSION)) 598 { 599 log_err("ICU version mismatch: Header says %s, build environment says %s.\n", U_ICU_VERSION, ICU_VERSION); 600 } 601 #endif 602 603 /* test U_GC_... */ 604 if( 605 U_GET_GC_MASK(0x41)!=U_GC_LU_MASK || 606 U_GET_GC_MASK(0x662)!=U_GC_ND_MASK || 607 U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK || 608 U_GET_GC_MASK(0x28)!=U_GC_PS_MASK || 609 U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK || 610 U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK 611 ) { 612 log_err("error: U_GET_GC_MASK does not work properly\n"); 613 } 614 615 mask=0; 616 mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK; 617 618 mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK; 619 mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK; 620 mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK; 621 mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK; 622 mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK; 623 624 mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK; 625 mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK; 626 mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK; 627 628 mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK; 629 mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK; 630 mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK; 631 632 mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK; 633 mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK; 634 mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK; 635 636 mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK; 637 mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK; 638 mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK; 639 mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK; 640 641 mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK; 642 mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK; 643 mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK; 644 mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK; 645 mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK; 646 647 mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK; 648 mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK; 649 mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK; 650 mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK; 651 652 mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK; 653 mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK; 654 655 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) { 656 log_err("error: problems with U_GC_XX_MASK constants\n"); 657 } 658 659 mask=0; 660 mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK; 661 mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK; 662 mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK; 663 mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK; 664 mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK; 665 mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK; 666 mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK; 667 668 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) { 669 log_err("error: problems with U_GC_Y_MASK constants\n"); 670 } 671 { 672 static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 }; 673 for(i=0; i<10; i++){ 674 if(digit[i]!=u_forDigit(i,10)){ 675 log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10)); 676 } 677 } 678 } 679 680 /* test u_digit() */ 681 { 682 static const struct { 683 UChar32 c; 684 int8_t radix, value; 685 } data[]={ 686 /* base 16 */ 687 { 0x0031, 16, 1 }, 688 { 0x0038, 16, 8 }, 689 { 0x0043, 16, 12 }, 690 { 0x0066, 16, 15 }, 691 { 0x00e4, 16, -1 }, 692 { 0x0662, 16, 2 }, 693 { 0x06f5, 16, 5 }, 694 { 0xff13, 16, 3 }, 695 { 0xff41, 16, 10 }, 696 697 /* base 8 */ 698 { 0x0031, 8, 1 }, 699 { 0x0038, 8, -1 }, 700 { 0x0043, 8, -1 }, 701 { 0x0066, 8, -1 }, 702 { 0x00e4, 8, -1 }, 703 { 0x0662, 8, 2 }, 704 { 0x06f5, 8, 5 }, 705 { 0xff13, 8, 3 }, 706 { 0xff41, 8, -1 }, 707 708 /* base 36 */ 709 { 0x5a, 36, 35 }, 710 { 0x7a, 36, 35 }, 711 { 0xff3a, 36, 35 }, 712 { 0xff5a, 36, 35 }, 713 714 /* wrong radix values */ 715 { 0x0031, 1, -1 }, 716 { 0xff3a, 37, -1 } 717 }; 718 719 for(i=0; i<LENGTHOF(data); ++i) { 720 if(u_digit(data[i].c, data[i].radix)!=data[i].value) { 721 log_err("u_digit(U+%04x, %d)=%d expected %d\n", 722 data[i].c, 723 data[i].radix, 724 u_digit(data[i].c, data[i].radix), 725 data[i].value); 726 } 727 } 728 } 729 } 730 731 /* test C/POSIX-style functions --------------------------------------------- */ 732 733 /* bit flags */ 734 #define ISAL 1 735 #define ISLO 2 736 #define ISUP 4 737 738 #define ISDI 8 739 #define ISXD 0x10 740 741 #define ISAN 0x20 742 743 #define ISPU 0x40 744 #define ISGR 0x80 745 #define ISPR 0x100 746 747 #define ISSP 0x200 748 #define ISBL 0x400 749 #define ISCN 0x800 750 751 /* C/POSIX-style functions, in the same order as the bit flags */ 752 typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c); 753 754 static const struct { 755 IsPOSIXClass *fn; 756 const char *name; 757 } posixClasses[]={ 758 { u_isalpha, "isalpha" }, 759 { u_islower, "islower" }, 760 { u_isupper, "isupper" }, 761 { u_isdigit, "isdigit" }, 762 { u_isxdigit, "isxdigit" }, 763 { u_isalnum, "isalnum" }, 764 { u_ispunct, "ispunct" }, 765 { u_isgraph, "isgraph" }, 766 { u_isprint, "isprint" }, 767 { u_isspace, "isspace" }, 768 { u_isblank, "isblank" }, 769 { u_iscntrl, "iscntrl" } 770 }; 771 772 static const struct { 773 UChar32 c; 774 uint32_t posixResults; 775 } posixData[]={ 776 { 0x0008, ISCN }, /* backspace */ 777 { 0x0009, ISSP|ISBL|ISCN }, /* TAB */ 778 { 0x000a, ISSP| ISCN }, /* LF */ 779 { 0x000c, ISSP| ISCN }, /* FF */ 780 { 0x000d, ISSP| ISCN }, /* CR */ 781 { 0x0020, ISPR|ISSP|ISBL }, /* space */ 782 { 0x0021, ISPU|ISGR|ISPR }, /* ! */ 783 { 0x0033, ISDI|ISXD|ISAN| ISGR|ISPR }, /* 3 */ 784 { 0x0040, ISPU|ISGR|ISPR }, /* @ */ 785 { 0x0041, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* A */ 786 { 0x007a, ISAL|ISLO| ISAN| ISGR|ISPR }, /* z */ 787 { 0x007b, ISPU|ISGR|ISPR }, /* { */ 788 { 0x0085, ISSP| ISCN }, /* NEL */ 789 { 0x00a0, ISPR|ISSP|ISBL }, /* NBSP */ 790 { 0x00a4, ISGR|ISPR }, /* currency sign */ 791 { 0x00e4, ISAL|ISLO| ISAN| ISGR|ISPR }, /* a-umlaut */ 792 { 0x0300, ISGR|ISPR }, /* combining grave */ 793 { 0x0600, ISCN }, /* arabic number sign */ 794 { 0x0627, ISAL| ISAN| ISGR|ISPR }, /* alef */ 795 { 0x0663, ISDI|ISXD|ISAN| ISGR|ISPR }, /* arabic 3 */ 796 { 0x2002, ISPR|ISSP|ISBL }, /* en space */ 797 { 0x2007, ISPR|ISSP|ISBL }, /* figure space */ 798 { 0x2009, ISPR|ISSP|ISBL }, /* thin space */ 799 { 0x200b, ISCN }, /* ZWSP */ 800 /*{ 0x200b, ISPR|ISSP },*/ /* ZWSP */ /* ZWSP became a control char in 4.0.1*/ 801 { 0x200e, ISCN }, /* LRM */ 802 { 0x2028, ISPR|ISSP| ISCN }, /* LS */ 803 { 0x2029, ISPR|ISSP| ISCN }, /* PS */ 804 { 0x20ac, ISGR|ISPR }, /* Euro */ 805 { 0xff15, ISDI|ISXD|ISAN| ISGR|ISPR }, /* fullwidth 5 */ 806 { 0xff25, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* fullwidth E */ 807 { 0xff35, ISAL| ISUP| ISAN| ISGR|ISPR }, /* fullwidth U */ 808 { 0xff45, ISAL|ISLO| ISXD|ISAN| ISGR|ISPR }, /* fullwidth e */ 809 { 0xff55, ISAL|ISLO| ISAN| ISGR|ISPR } /* fullwidth u */ 810 }; 811 812 static void 813 TestPOSIX() { 814 uint32_t mask; 815 int32_t cl, i; 816 UBool expect; 817 818 mask=1; 819 for(cl=0; cl<12; ++cl) { 820 for(i=0; i<LENGTHOF(posixData); ++i) { 821 expect=(UBool)((posixData[i].posixResults&mask)!=0); 822 if(posixClasses[cl].fn(posixData[i].c)!=expect) { 823 log_err("u_%s(U+%04x)=%s is wrong\n", 824 posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE"); 825 } 826 } 827 mask<<=1; 828 } 829 } 830 831 /* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */ 832 static void TestControlPrint() 833 { 834 const UChar32 sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b}; 835 const UChar32 sampleNonControl[] = {0x61, 0x0031, 0x00e2}; 836 const UChar32 samplePrintable[] = {0x0042, 0x005f, 0x2014}; 837 const UChar32 sampleNonPrintable[] = {0x200c, 0x009f, 0x001b}; 838 UChar32 c; 839 840 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleControl, LENGTHOF(sampleControl), TRUE); 841 testSampleCharProps(u_iscntrl, "u_iscntrl", sampleNonControl, LENGTHOF(sampleNonControl), FALSE); 842 843 testSampleCharProps(u_isprint, "u_isprint", 844 samplePrintable, LENGTHOF(samplePrintable), TRUE); 845 testSampleCharProps(u_isprint, "u_isprint", 846 sampleNonPrintable, LENGTHOF(sampleNonPrintable), FALSE); 847 848 /* test all ISO 8 controls */ 849 for(c=0; c<=0x9f; ++c) { 850 if(c==0x20) { 851 /* skip ASCII graphic characters and continue with DEL */ 852 c=0x7f; 853 } 854 if(!u_iscntrl(c)) { 855 log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c); 856 } 857 if(!u_isISOControl(c)) { 858 log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c); 859 } 860 if(u_isprint(c)) { 861 log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c); 862 } 863 } 864 865 /* test all Latin-1 graphic characters */ 866 for(c=0x20; c<=0xff; ++c) { 867 if(c==0x7f) { 868 c=0xa0; 869 } else if(c==0xad) { 870 /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */ 871 ++c; 872 } 873 if(!u_isprint(c)) { 874 log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c); 875 } 876 } 877 } 878 879 /* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/ 880 static void TestIdentifier() 881 { 882 const UChar32 sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f}; 883 const UChar32 sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082}; 884 const UChar32 sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045}; 885 const UChar32 sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020}; 886 const UChar32 sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061}; 887 const UChar32 sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019}; 888 const UChar32 sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045}; 889 const UChar32 sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020}; 890 const UChar32 sampleIDIgnore[] = {0x0006, 0x0010, 0x206b, 0x85}; 891 const UChar32 sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061}; 892 893 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart", 894 sampleJavaIDStart, LENGTHOF(sampleJavaIDStart), TRUE); 895 testSampleCharProps(u_isJavaIDStart, "u_isJavaIDStart", 896 sampleNonJavaIDStart, LENGTHOF(sampleNonJavaIDStart), FALSE); 897 898 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart", 899 sampleJavaIDPart, LENGTHOF(sampleJavaIDPart), TRUE); 900 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart", 901 sampleNonJavaIDPart, LENGTHOF(sampleNonJavaIDPart), FALSE); 902 903 /* IDPart should imply IDStart */ 904 testSampleCharProps(u_isJavaIDPart, "u_isJavaIDPart", 905 sampleJavaIDStart, LENGTHOF(sampleJavaIDStart), TRUE); 906 907 testSampleCharProps(u_isIDStart, "u_isIDStart", 908 sampleUnicodeIDStart, LENGTHOF(sampleUnicodeIDStart), TRUE); 909 testSampleCharProps(u_isIDStart, "u_isIDStart", 910 sampleNonUnicodeIDStart, LENGTHOF(sampleNonUnicodeIDStart), FALSE); 911 912 testSampleCharProps(u_isIDPart, "u_isIDPart", 913 sampleUnicodeIDPart, LENGTHOF(sampleUnicodeIDPart), TRUE); 914 testSampleCharProps(u_isIDPart, "u_isIDPart", 915 sampleNonUnicodeIDPart, LENGTHOF(sampleNonUnicodeIDPart), FALSE); 916 917 /* IDPart should imply IDStart */ 918 testSampleCharProps(u_isIDPart, "u_isIDPart", 919 sampleUnicodeIDStart, LENGTHOF(sampleUnicodeIDStart), TRUE); 920 921 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable", 922 sampleIDIgnore, LENGTHOF(sampleIDIgnore), TRUE); 923 testSampleCharProps(u_isIDIgnorable, "u_isIDIgnorable", 924 sampleNonIDIgnore, LENGTHOF(sampleNonIDIgnore), FALSE); 925 } 926 927 /* for each line of UnicodeData.txt, check some of the properties */ 928 typedef struct UnicodeDataContext { 929 #if UCONFIG_NO_NORMALIZATION 930 const void *dummy; 931 #else 932 const UNormalizer2 *nfc; 933 const UNormalizer2 *nfkc; 934 #endif 935 } UnicodeDataContext; 936 937 /* 938 * ### TODO 939 * This test fails incorrectly if the First or Last code point of a repetitive area 940 * is overridden, which is allowed and is encouraged for the PUAs. 941 * Currently, this means that both area First/Last and override lines are 942 * tested against the properties from the API, 943 * and the area boundary will not match and cause an error. 944 * 945 * This function should detect area boundaries and skip them for the test of individual 946 * code points' properties. 947 * Then it should check that the areas contain all the same properties except where overridden. 948 * For this, it would have had to set a flag for which code points were listed explicitly. 949 */ 950 static void U_CALLCONV 951 unicodeDataLineFn(void *context, 952 char *fields[][2], int32_t fieldCount, 953 UErrorCode *pErrorCode) 954 { 955 char buffer[100]; 956 const char *d; 957 char *end; 958 uint32_t value; 959 UChar32 c; 960 int32_t i; 961 int8_t type; 962 int32_t dt; 963 UChar dm[32], s[32]; 964 int32_t dmLength, length; 965 966 #if !UCONFIG_NO_NORMALIZATION 967 const UNormalizer2 *nfc, *nfkc; 968 #endif 969 970 /* get the character code, field 0 */ 971 c=strtoul(fields[0][0], &end, 16); 972 if(end<=fields[0][0] || end!=fields[0][1]) { 973 log_err("error: syntax error in field 0 at %s\n", fields[0][0]); 974 return; 975 } 976 if((uint32_t)c>=UCHAR_MAX_VALUE + 1) { 977 log_err("error in UnicodeData.txt: code point %lu out of range\n", c); 978 return; 979 } 980 981 /* get general category, field 2 */ 982 *fields[2][1]=0; 983 type = (int8_t)tagValues[MakeProp(fields[2][0])]; 984 if(u_charType(c)!=type) { 985 log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type); 986 } 987 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) { 988 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c); 989 } 990 991 /* get canonical combining class, field 3 */ 992 value=strtoul(fields[3][0], &end, 10); 993 if(end<=fields[3][0] || end!=fields[3][1]) { 994 log_err("error: syntax error in field 3 at code 0x%lx\n", c); 995 return; 996 } 997 if(value>255) { 998 log_err("error in UnicodeData.txt: combining class %lu out of range\n", value); 999 return; 1000 } 1001 #if !UCONFIG_NO_NORMALIZATION 1002 if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) { 1003 log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value); 1004 } 1005 nfkc=((UnicodeDataContext *)context)->nfkc; 1006 if(value!=unorm2_getCombiningClass(nfkc, c)) { 1007 log_err("error: unorm2_getCombiningClass(nfkc, U+%04lx)==%hu instead of %lu\n", c, unorm2_getCombiningClass(nfkc, c), value); 1008 } 1009 #endif 1010 1011 /* get BiDi category, field 4 */ 1012 *fields[4][1]=0; 1013 i=MakeDir(fields[4][0]); 1014 if(i!=u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) { 1015 log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]); 1016 } 1017 1018 /* get Decomposition_Type & Decomposition_Mapping, field 5 */ 1019 d=NULL; 1020 if(fields[5][0]==fields[5][1]) { 1021 /* no decomposition, except UnicodeData.txt omits Hangul syllable decompositions */ 1022 if(c==0xac00 || c==0xd7a3) { 1023 dt=U_DT_CANONICAL; 1024 } else { 1025 dt=U_DT_NONE; 1026 } 1027 } else { 1028 d=fields[5][0]; 1029 *fields[5][1]=0; 1030 dt=UCHAR_INVALID_CODE; 1031 if(*d=='<') { 1032 end=strchr(++d, '>'); 1033 if(end!=NULL) { 1034 *end=0; 1035 dt=u_getPropertyValueEnum(UCHAR_DECOMPOSITION_TYPE, d); 1036 d=u_skipWhitespace(end+1); 1037 } 1038 } else { 1039 dt=U_DT_CANONICAL; 1040 } 1041 } 1042 if(dt>U_DT_NONE) { 1043 if(c==0xac00) { 1044 dm[0]=0x1100; 1045 dm[1]=0x1161; 1046 dm[2]=0; 1047 dmLength=2; 1048 } else if(c==0xd7a3) { 1049 dm[0]=0xd788; 1050 dm[1]=0x11c2; 1051 dm[2]=0; 1052 dmLength=2; 1053 } else { 1054 dmLength=u_parseString(d, dm, 32, NULL, pErrorCode); 1055 } 1056 } else { 1057 dmLength=-1; 1058 } 1059 if(dt<0 || U_FAILURE(*pErrorCode)) { 1060 log_err("error in UnicodeData.txt: syntax error in U+%04lX decomposition field\n", (long)c); 1061 return; 1062 } 1063 #if !UCONFIG_NO_NORMALIZATION 1064 i=u_getIntPropertyValue(c, UCHAR_DECOMPOSITION_TYPE); 1065 if(i!=dt) { 1066 log_err("error: u_getIntPropertyValue(U+%04lx, UCHAR_DECOMPOSITION_TYPE)==%d instead of %d\n", c, i, dt); 1067 } 1068 /* Expect Decomposition_Mapping=nfkc.getRawDecomposition(c). */ 1069 length=unorm2_getRawDecomposition(nfkc, c, s, 32, pErrorCode); 1070 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) { 1071 log_err("error: unorm2_getRawDecomposition(nfkc, U+%04lx)==%d instead of %d " 1072 "or the Decomposition_Mapping is different (%s)\n", 1073 c, length, dmLength, u_errorName(*pErrorCode)); 1074 return; 1075 } 1076 /* For canonical decompositions only, expect Decomposition_Mapping=nfc.getRawDecomposition(c). */ 1077 if(dt!=U_DT_CANONICAL) { 1078 dmLength=-1; 1079 } 1080 nfc=((UnicodeDataContext *)context)->nfc; 1081 length=unorm2_getRawDecomposition(nfc, c, s, 32, pErrorCode); 1082 if(U_FAILURE(*pErrorCode) || length!=dmLength || (length>0 && 0!=u_strcmp(s, dm))) { 1083 log_err("error: unorm2_getRawDecomposition(nfc, U+%04lx)==%d instead of %d " 1084 "or the Decomposition_Mapping is different (%s)\n", 1085 c, length, dmLength, u_errorName(*pErrorCode)); 1086 return; 1087 } 1088 /* recompose */ 1089 if(dt==U_DT_CANONICAL && !u_hasBinaryProperty(c, UCHAR_FULL_COMPOSITION_EXCLUSION)) { 1090 UChar32 a, b, composite; 1091 i=0; 1092 U16_NEXT(dm, i, dmLength, a); 1093 U16_NEXT(dm, i, dmLength, b); 1094 /* i==dmLength */ 1095 composite=unorm2_composePair(nfc, a, b); 1096 if(composite!=c) { 1097 log_err("error: nfc U+%04lX decomposes to U+%04lX+U+%04lX but does not compose back (instead U+%04lX)\n", 1098 (long)c, (long)a, (long)b, (long)composite); 1099 } 1100 /* 1101 * Note: NFKC has fewer round-trip mappings than NFC, 1102 * so we can't just test unorm2_composePair(nfkc, a, b) here without further data. 1103 */ 1104 } 1105 #endif 1106 1107 /* get ISO Comment, field 11 */ 1108 *fields[11][1]=0; 1109 i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode); 1110 if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) { 1111 log_err_status(*pErrorCode, "error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n", 1112 c, u_errorName(*pErrorCode), 1113 U_FAILURE(*pErrorCode) ? buffer : "[error]", 1114 fields[11][0]); 1115 } 1116 1117 /* get uppercase mapping, field 12 */ 1118 if(fields[12][0]!=fields[12][1]) { 1119 value=strtoul(fields[12][0], &end, 16); 1120 if(end!=fields[12][1]) { 1121 log_err("error: syntax error in field 12 at code 0x%lx\n", c); 1122 return; 1123 } 1124 if((UChar32)value!=u_toupper(c)) { 1125 log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value); 1126 } 1127 } else { 1128 /* no case mapping: the API must map the code point to itself */ 1129 if(c!=u_toupper(c)) { 1130 log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c)); 1131 } 1132 } 1133 1134 /* get lowercase mapping, field 13 */ 1135 if(fields[13][0]!=fields[13][1]) { 1136 value=strtoul(fields[13][0], &end, 16); 1137 if(end!=fields[13][1]) { 1138 log_err("error: syntax error in field 13 at code 0x%lx\n", c); 1139 return; 1140 } 1141 if((UChar32)value!=u_tolower(c)) { 1142 log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value); 1143 } 1144 } else { 1145 /* no case mapping: the API must map the code point to itself */ 1146 if(c!=u_tolower(c)) { 1147 log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c)); 1148 } 1149 } 1150 1151 /* get titlecase mapping, field 14 */ 1152 if(fields[14][0]!=fields[14][1]) { 1153 value=strtoul(fields[14][0], &end, 16); 1154 if(end!=fields[14][1]) { 1155 log_err("error: syntax error in field 14 at code 0x%lx\n", c); 1156 return; 1157 } 1158 if((UChar32)value!=u_totitle(c)) { 1159 log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value); 1160 } 1161 } else { 1162 /* no case mapping: the API must map the code point to itself */ 1163 if(c!=u_totitle(c)) { 1164 log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c)); 1165 } 1166 } 1167 } 1168 1169 static UBool U_CALLCONV 1170 enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) { 1171 static const UChar32 test[][2]={ 1172 {0x41, U_UPPERCASE_LETTER}, 1173 {0x308, U_NON_SPACING_MARK}, 1174 {0xfffe, U_GENERAL_OTHER_TYPES}, 1175 {0xe0041, U_FORMAT_CHAR}, 1176 {0xeffff, U_UNASSIGNED} 1177 }; 1178 1179 int32_t i, count; 1180 1181 if(0!=strcmp((const char *)context, "a1")) { 1182 log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n"); 1183 return FALSE; 1184 } 1185 1186 count=LENGTHOF(test); 1187 for(i=0; i<count; ++i) { 1188 if(start<=test[i][0] && test[i][0]<limit) { 1189 if(type!=(UCharCategory)test[i][1]) { 1190 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n", 1191 start, limit, (long)type, test[i][0], test[i][1]); 1192 } 1193 /* stop at the range that includes the last test code point (increases code coverage for enumeration) */ 1194 return i==(count-1) ? FALSE : TRUE; 1195 } 1196 } 1197 1198 if(start>test[count-1][0]) { 1199 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n", 1200 start, limit, (long)type); 1201 return FALSE; 1202 } 1203 1204 return TRUE; 1205 } 1206 1207 static UBool U_CALLCONV 1208 enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) { 1209 /* default Bidi classes for unassigned code points, from the DerivedBidiClass.txt header */ 1210 static const int32_t defaultBidi[][2]={ /* { limit, class } */ 1211 { 0x0590, U_LEFT_TO_RIGHT }, 1212 { 0x0600, U_RIGHT_TO_LEFT }, 1213 { 0x07C0, U_RIGHT_TO_LEFT_ARABIC }, 1214 { 0x08A0, U_RIGHT_TO_LEFT }, 1215 { 0x0900, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+08A0..U+08FF from R to AL */ 1216 { 0xFB1D, U_LEFT_TO_RIGHT }, 1217 { 0xFB50, U_RIGHT_TO_LEFT }, 1218 { 0xFE00, U_RIGHT_TO_LEFT_ARABIC }, 1219 { 0xFE70, U_LEFT_TO_RIGHT }, 1220 { 0xFF00, U_RIGHT_TO_LEFT_ARABIC }, 1221 { 0x10800, U_LEFT_TO_RIGHT }, 1222 { 0x11000, U_RIGHT_TO_LEFT }, 1223 { 0x1E800, U_LEFT_TO_RIGHT }, /* new default-R range in Unicode 5.2: U+1E800 - U+1EFFF */ 1224 { 0x1EE00, U_RIGHT_TO_LEFT }, 1225 { 0x1EF00, U_RIGHT_TO_LEFT_ARABIC }, /* Unicode 6.1 changes U+1EE00..U+1EEFF from R to AL */ 1226 { 0x1F000, U_RIGHT_TO_LEFT }, 1227 { 0x110000, U_LEFT_TO_RIGHT } 1228 }; 1229 1230 UChar32 c; 1231 int32_t i; 1232 UCharDirection shouldBeDir; 1233 1234 /* 1235 * LineBreak.txt specifies: 1236 * # - Assigned characters that are not listed explicitly are given the value 1237 * # "AL". 1238 * # - Unassigned characters are given the value "XX". 1239 * 1240 * PUA characters are listed explicitly with "XX". 1241 * Verify that no assigned character has "XX". 1242 */ 1243 if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) { 1244 c=start; 1245 while(c<limit) { 1246 if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) { 1247 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c); 1248 } 1249 ++c; 1250 } 1251 } 1252 1253 /* 1254 * Verify default Bidi classes. 1255 * For recent Unicode versions, see UCD.html. 1256 * 1257 * For older Unicode versions: 1258 * See table 3-7 "Bidirectional Character Types" in UAX #9. 1259 * http://www.unicode.org/reports/tr9/ 1260 * 1261 * See also DerivedBidiClass.txt for Cn code points! 1262 * 1263 * Unicode 4.0.1/Public Review Issue #28 (http://www.unicode.org/review/resolved-pri.html) 1264 * changed some default values. 1265 * In particular, non-characters and unassigned Default Ignorable Code Points 1266 * change from L to BN. 1267 * 1268 * UCD.html version 4.0.1 does not yet reflect these changes. 1269 */ 1270 if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) { 1271 /* enumerate the intersections of defaultBidi ranges with [start..limit[ */ 1272 c=start; 1273 for(i=0; i<LENGTHOF(defaultBidi) && c<limit; ++i) { 1274 if((int32_t)c<defaultBidi[i][0]) { 1275 while(c<limit && (int32_t)c<defaultBidi[i][0]) { 1276 if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) { 1277 shouldBeDir=U_BOUNDARY_NEUTRAL; 1278 } else { 1279 shouldBeDir=(UCharDirection)defaultBidi[i][1]; 1280 } 1281 1282 if( u_charDirection(c)!=shouldBeDir || 1283 u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir 1284 ) { 1285 log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n", 1286 c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]); 1287 } 1288 ++c; 1289 } 1290 } 1291 } 1292 } 1293 1294 return TRUE; 1295 } 1296 1297 /* tests for several properties */ 1298 static void TestUnicodeData() 1299 { 1300 UVersionInfo expectVersionArray; 1301 UVersionInfo versionArray; 1302 char *fields[15][2]; 1303 UErrorCode errorCode; 1304 UChar32 c; 1305 int8_t type; 1306 1307 UnicodeDataContext context; 1308 1309 u_versionFromString(expectVersionArray, U_UNICODE_VERSION); 1310 u_getUnicodeVersion(versionArray); 1311 if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0) 1312 { 1313 log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n", 1314 versionArray[0], versionArray[1], versionArray[2], versionArray[3]); 1315 } 1316 1317 #if defined(ICU_UNICODE_VERSION) 1318 /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */ 1319 if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION)) 1320 { 1321 log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n"); 1322 } 1323 #endif 1324 1325 if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) { 1326 log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041)); 1327 } 1328 1329 errorCode=U_ZERO_ERROR; 1330 #if !UCONFIG_NO_NORMALIZATION 1331 context.nfc=unorm2_getNFCInstance(&errorCode); 1332 context.nfkc=unorm2_getNFKCInstance(&errorCode); 1333 if(U_FAILURE(errorCode)) { 1334 log_data_err("error: unable to open an NFC or NFKC UNormalizer2 - %s\n", u_errorName(errorCode)); 1335 return; 1336 } 1337 #endif 1338 parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, &context, &errorCode); 1339 if(U_FAILURE(errorCode)) { 1340 return; /* if we couldn't parse UnicodeData.txt, we should return */ 1341 } 1342 1343 /* sanity check on repeated properties */ 1344 for(c=0xfffe; c<=0x10ffff;) { 1345 type=u_charType(c); 1346 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) { 1347 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c); 1348 } 1349 if(type!=U_UNASSIGNED) { 1350 log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c)); 1351 } 1352 if((c&0xffff)==0xfffe) { 1353 ++c; 1354 } else { 1355 c+=0xffff; 1356 } 1357 } 1358 1359 /* test that PUA is not "unassigned" */ 1360 for(c=0xe000; c<=0x10fffd;) { 1361 type=u_charType(c); 1362 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) { 1363 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c); 1364 } 1365 if(type==U_UNASSIGNED) { 1366 log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c); 1367 } else if(type!=U_PRIVATE_USE_CHAR) { 1368 log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type); 1369 } 1370 if(c==0xf8ff) { 1371 c=0xf0000; 1372 } else if(c==0xffffd) { 1373 c=0x100000; 1374 } else { 1375 ++c; 1376 } 1377 } 1378 1379 /* test u_enumCharTypes() */ 1380 u_enumCharTypes(enumTypeRange, "a1"); 1381 1382 /* check default properties */ 1383 u_enumCharTypes(enumDefaultsRange, NULL); 1384 } 1385 1386 static void TestCodeUnit(){ 1387 const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0}; 1388 1389 int32_t i; 1390 1391 for(i=0; i<(int32_t)(sizeof(codeunit)/sizeof(codeunit[0])); i++){ 1392 UChar c=codeunit[i]; 1393 if(i<4){ 1394 if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){ 1395 log_err("ERROR: U+%04x is a single", c); 1396 } 1397 1398 } 1399 if(i >= 4 && i< 8){ 1400 if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){ 1401 log_err("ERROR: U+%04x is a first surrogate", c); 1402 } 1403 } 1404 if(i >= 8 && i< 12){ 1405 if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){ 1406 log_err("ERROR: U+%04x is a second surrogate", c); 1407 } 1408 } 1409 } 1410 1411 } 1412 1413 static void TestCodePoint(){ 1414 const UChar32 codePoint[]={ 1415 /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */ 1416 0xd800, 1417 0xdbff, 1418 0xdc00, 1419 0xdfff, 1420 0xdc04, 1421 0xd821, 1422 /*not a surrogate, valid, isUnicodeChar , not Error*/ 1423 0x20ac, 1424 0xd7ff, 1425 0xe000, 1426 0xe123, 1427 0x0061, 1428 0xe065, 1429 0x20402, 1430 0x24506, 1431 0x23456, 1432 0x20402, 1433 0x10402, 1434 0x23456, 1435 /*not a surrogate, not valid, isUnicodeChar, isError */ 1436 0x0015, 1437 0x009f, 1438 /*not a surrogate, not valid, not isUnicodeChar, isError */ 1439 0xffff, 1440 0xfffe, 1441 }; 1442 int32_t i; 1443 for(i=0; i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0])); i++){ 1444 UChar32 c=codePoint[i]; 1445 if(i<6){ 1446 if(!UTF_IS_SURROGATE(c) || !U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)){ 1447 log_err("ERROR: isSurrogate() failed for U+%04x\n", c); 1448 } 1449 if(UTF_IS_VALID(c)){ 1450 log_err("ERROR: isValid() failed for U+%04x\n", c); 1451 } 1452 if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){ 1453 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c); 1454 } 1455 if(UTF_IS_ERROR(c)){ 1456 log_err("ERROR: isError() failed for U+%04x\n", c); 1457 } 1458 }else if(i >=6 && i<18){ 1459 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){ 1460 log_err("ERROR: isSurrogate() failed for U+%04x\n", c); 1461 } 1462 if(!UTF_IS_VALID(c)){ 1463 log_err("ERROR: isValid() failed for U+%04x\n", c); 1464 } 1465 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){ 1466 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c); 1467 } 1468 if(UTF_IS_ERROR(c)){ 1469 log_err("ERROR: isError() failed for U+%04x\n", c); 1470 } 1471 }else if(i >=18 && i<20){ 1472 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){ 1473 log_err("ERROR: isSurrogate() failed for U+%04x\n", c); 1474 } 1475 if(UTF_IS_VALID(c)){ 1476 log_err("ERROR: isValid() failed for U+%04x\n", c); 1477 } 1478 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){ 1479 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c); 1480 } 1481 if(!UTF_IS_ERROR(c)){ 1482 log_err("ERROR: isError() failed for U+%04x\n", c); 1483 } 1484 } 1485 else if(i >=18 && i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0]))){ 1486 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){ 1487 log_err("ERROR: isSurrogate() failed for U+%04x\n", c); 1488 } 1489 if(UTF_IS_VALID(c)){ 1490 log_err("ERROR: isValid() failed for U+%04x\n", c); 1491 } 1492 if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){ 1493 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c); 1494 } 1495 if(!UTF_IS_ERROR(c)){ 1496 log_err("ERROR: isError() failed for U+%04x\n", c); 1497 } 1498 } 1499 } 1500 1501 if( 1502 !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) || 1503 !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) || 1504 U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) || 1505 U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || U_IS_BMP(0x7fffffff) 1506 ) { 1507 log_err("error with U_IS_BMP()\n"); 1508 } 1509 1510 if( 1511 U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) || 1512 U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) || 1513 U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) || 1514 !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff) 1515 ) { 1516 log_err("error with U_IS_SUPPLEMENTARY()\n"); 1517 } 1518 } 1519 1520 static void TestCharLength() 1521 { 1522 const int32_t codepoint[]={ 1523 1, 0x0061, 1524 1, 0xe065, 1525 1, 0x20ac, 1526 2, 0x20402, 1527 2, 0x23456, 1528 2, 0x24506, 1529 2, 0x20402, 1530 2, 0x10402, 1531 1, 0xd7ff, 1532 1, 0xe000 1533 }; 1534 1535 int32_t i; 1536 UBool multiple; 1537 for(i=0; i<(int32_t)(sizeof(codepoint)/sizeof(codepoint[0])); i=(int16_t)(i+2)){ 1538 UChar32 c=codepoint[i+1]; 1539 if(UTF_CHAR_LENGTH(c) != codepoint[i] || U16_LENGTH(c) != codepoint[i]){ 1540 log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], U16_LENGTH(c)); 1541 } 1542 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE); 1543 if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){ 1544 log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c); 1545 } 1546 } 1547 } 1548 1549 /*internal functions ----*/ 1550 static int32_t MakeProp(char* str) 1551 { 1552 int32_t result = 0; 1553 char* matchPosition =0; 1554 1555 matchPosition = strstr(tagStrings, str); 1556 if (matchPosition == 0) 1557 { 1558 log_err("unrecognized type letter "); 1559 log_err(str); 1560 } 1561 else 1562 result = (int32_t)((matchPosition - tagStrings) / 2); 1563 return result; 1564 } 1565 1566 static int32_t MakeDir(char* str) 1567 { 1568 int32_t pos = 0; 1569 for (pos = 0; pos < 19; pos++) { 1570 if (strcmp(str, dirStrings[pos]) == 0) { 1571 return pos; 1572 } 1573 } 1574 return -1; 1575 } 1576 1577 /* test u_charName() -------------------------------------------------------- */ 1578 1579 static const struct { 1580 uint32_t code; 1581 const char *name, *oldName, *extName, *alias; 1582 } names[]={ 1583 {0x0061, "LATIN SMALL LETTER A", "", "LATIN SMALL LETTER A"}, 1584 {0x01a2, "LATIN CAPITAL LETTER OI", "", 1585 "LATIN CAPITAL LETTER OI", 1586 "LATIN CAPITAL LETTER GHA"}, 1587 {0x0284, "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", "", 1588 "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK" }, 1589 {0x0fd0, "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", "", 1590 "TIBETAN MARK BSKA- SHOG GI MGO RGYAN", 1591 "TIBETAN MARK BKA- SHOG GI MGO RGYAN"}, 1592 {0x3401, "CJK UNIFIED IDEOGRAPH-3401", "", "CJK UNIFIED IDEOGRAPH-3401" }, 1593 {0x7fed, "CJK UNIFIED IDEOGRAPH-7FED", "", "CJK UNIFIED IDEOGRAPH-7FED" }, 1594 {0xac00, "HANGUL SYLLABLE GA", "", "HANGUL SYLLABLE GA" }, 1595 {0xd7a3, "HANGUL SYLLABLE HIH", "", "HANGUL SYLLABLE HIH" }, 1596 {0xd800, "", "", "<lead surrogate-D800>" }, 1597 {0xdc00, "", "", "<trail surrogate-DC00>" }, 1598 {0xff08, "FULLWIDTH LEFT PARENTHESIS", "", "FULLWIDTH LEFT PARENTHESIS" }, 1599 {0xffe5, "FULLWIDTH YEN SIGN", "", "FULLWIDTH YEN SIGN" }, 1600 {0xffff, "", "", "<noncharacter-FFFF>" }, 1601 {0x1d0c5, "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", "", 1602 "BYZANTINE MUSICAL SYMBOL FHTORA SKLIRON CHROMA VASIS", 1603 "BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS"}, 1604 {0x23456, "CJK UNIFIED IDEOGRAPH-23456", "", "CJK UNIFIED IDEOGRAPH-23456" } 1605 }; 1606 1607 static UBool 1608 enumCharNamesFn(void *context, 1609 UChar32 code, UCharNameChoice nameChoice, 1610 const char *name, int32_t length) { 1611 int32_t *pCount=(int32_t *)context; 1612 const char *expected; 1613 int i; 1614 1615 if(length<=0 || length!=(int32_t)strlen(name)) { 1616 /* should not be called with an empty string or invalid length */ 1617 log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length); 1618 return TRUE; 1619 } 1620 1621 ++*pCount; 1622 for(i=0; i<sizeof(names)/sizeof(names[0]); ++i) { 1623 if(code==(UChar32)names[i].code) { 1624 switch (nameChoice) { 1625 case U_EXTENDED_CHAR_NAME: 1626 if(0!=strcmp(name, names[i].extName)) { 1627 log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName); 1628 } 1629 break; 1630 case U_UNICODE_CHAR_NAME: 1631 if(0!=strcmp(name, names[i].name)) { 1632 log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name); 1633 } 1634 break; 1635 case U_UNICODE_10_CHAR_NAME: 1636 expected=names[i].oldName; 1637 if(expected[0]==0 || 0!=strcmp(name, expected)) { 1638 log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, expected); 1639 } 1640 break; 1641 case U_CHAR_NAME_ALIAS: 1642 expected=names[i].alias; 1643 if(expected==NULL || expected[0]==0 || 0!=strcmp(name, expected)) { 1644 log_err("u_enumCharName(0x%lx - alias)=%s instead of %s\n", code, name, expected); 1645 } 1646 break; 1647 case U_CHAR_NAME_CHOICE_COUNT: 1648 break; 1649 } 1650 break; 1651 } 1652 } 1653 return TRUE; 1654 } 1655 1656 struct enumExtCharNamesContext { 1657 uint32_t length; 1658 int32_t last; 1659 }; 1660 1661 static UBool 1662 enumExtCharNamesFn(void *context, 1663 UChar32 code, UCharNameChoice nameChoice, 1664 const char *name, int32_t length) { 1665 struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context; 1666 1667 if (ecncp->last != (int32_t) code - 1) { 1668 if (ecncp->last < 0) { 1669 log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1); 1670 } else { 1671 log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code); 1672 } 1673 } 1674 ecncp->last = (int32_t) code; 1675 1676 if (!*name) { 1677 log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code); 1678 } 1679 1680 return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length); 1681 } 1682 1683 /** 1684 * This can be made more efficient by moving it into putil.c and having 1685 * it directly access the ebcdic translation tables. 1686 * TODO: If we get this method in putil.c, then delete it from here. 1687 */ 1688 static UChar 1689 u_charToUChar(char c) { 1690 UChar uc; 1691 u_charsToUChars(&c, &uc, 1); 1692 return uc; 1693 } 1694 1695 static void 1696 TestCharNames() { 1697 static char name[80]; 1698 UErrorCode errorCode=U_ZERO_ERROR; 1699 struct enumExtCharNamesContext extContext; 1700 const char *expected; 1701 int32_t length; 1702 UChar32 c; 1703 int32_t i; 1704 1705 log_verbose("Testing uprv_getMaxCharNameLength()\n"); 1706 length=uprv_getMaxCharNameLength(); 1707 if(length==0) { 1708 /* no names data available */ 1709 return; 1710 } 1711 if(length<83) { /* Unicode 3.2 max char name length */ 1712 log_err("uprv_getMaxCharNameLength()=%d is too short"); 1713 } 1714 /* ### TODO same tests for max ISO comment length as for max name length */ 1715 1716 log_verbose("Testing u_charName()\n"); 1717 for(i=0; i<(int32_t)(sizeof(names)/sizeof(names[0])); ++i) { 1718 /* modern Unicode character name */ 1719 length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode); 1720 if(U_FAILURE(errorCode)) { 1721 log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode)); 1722 return; 1723 } 1724 if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) { 1725 log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name); 1726 } 1727 1728 /* find the modern name */ 1729 if (*names[i].name) { 1730 c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode); 1731 if(U_FAILURE(errorCode)) { 1732 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode)); 1733 return; 1734 } 1735 if(c!=(UChar32)names[i].code) { 1736 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code); 1737 } 1738 } 1739 1740 /* Unicode 1.0 character name */ 1741 length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode); 1742 if(U_FAILURE(errorCode)) { 1743 log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode)); 1744 return; 1745 } 1746 if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) { 1747 log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName); 1748 } 1749 1750 /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */ 1751 if(names[i].oldName[0]!=0 /* && length>0 */) { 1752 c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode); 1753 if(U_FAILURE(errorCode)) { 1754 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode)); 1755 return; 1756 } 1757 if(c!=(UChar32)names[i].code) { 1758 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code); 1759 } 1760 } 1761 1762 /* Unicode character name alias */ 1763 length=u_charName(names[i].code, U_CHAR_NAME_ALIAS, name, sizeof(name), &errorCode); 1764 if(U_FAILURE(errorCode)) { 1765 log_err("u_charName(0x%lx - alias) error %s\n", names[i].code, u_errorName(errorCode)); 1766 return; 1767 } 1768 expected=names[i].alias; 1769 if(expected==NULL) { 1770 expected=""; 1771 } 1772 if(length<0 || (length>0 && 0!=strcmp(name, expected)) || length!=(uint16_t)strlen(name)) { 1773 log_err("u_charName(0x%lx - alias) gets %s length %ld instead of nothing or %s\n", 1774 names[i].code, name, length, expected); 1775 } 1776 1777 /* find the Unicode character name alias if it is stored (length>0 means that we could read it) */ 1778 if(expected[0]!=0 /* && length>0 */) { 1779 c=u_charFromName(U_CHAR_NAME_ALIAS, expected, &errorCode); 1780 if(U_FAILURE(errorCode)) { 1781 log_err("u_charFromName(%s - alias) error %s\n", 1782 expected, u_errorName(errorCode)); 1783 return; 1784 } 1785 if(c!=(UChar32)names[i].code) { 1786 log_err("u_charFromName(%s - alias) gets 0x%lx instead of 0x%lx\n", 1787 expected, c, names[i].code); 1788 } 1789 } 1790 } 1791 1792 /* test u_enumCharNames() */ 1793 length=0; 1794 errorCode=U_ZERO_ERROR; 1795 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode); 1796 if(U_FAILURE(errorCode) || length<94140) { 1797 log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length); 1798 } 1799 1800 extContext.length = 0; 1801 extContext.last = -1; 1802 errorCode=U_ZERO_ERROR; 1803 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode); 1804 if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) { 1805 log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length); 1806 } 1807 1808 /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */ 1809 if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) { 1810 log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode)); 1811 } 1812 1813 /* Test getCharNameCharacters */ 1814 if(!getTestOption(QUICK_OPTION)) { 1815 enum { BUFSIZE = 256 }; 1816 UErrorCode ec = U_ZERO_ERROR; 1817 char buf[BUFSIZE]; 1818 int32_t maxLength; 1819 UChar32 cp; 1820 UChar pat[BUFSIZE], dumbPat[BUFSIZE]; 1821 int32_t l1, l2; 1822 UBool map[256]; 1823 UBool ok; 1824 1825 USet* set = uset_open(1, 0); /* empty set */ 1826 USet* dumb = uset_open(1, 0); /* empty set */ 1827 1828 /* 1829 * uprv_getCharNameCharacters() will likely return more lowercase 1830 * letters than actual character names contain because 1831 * it includes all the characters in lowercased names of 1832 * general categories, for the full possible set of extended names. 1833 */ 1834 { 1835 USetAdder sa={ 1836 NULL, 1837 uset_add, 1838 uset_addRange, 1839 uset_addString, 1840 NULL /* don't need remove() */ 1841 }; 1842 sa.set=set; 1843 uprv_getCharNameCharacters(&sa); 1844 } 1845 1846 /* build set the dumb (but sure-fire) way */ 1847 for (i=0; i<256; ++i) { 1848 map[i] = FALSE; 1849 } 1850 1851 maxLength=0; 1852 for (cp=0; cp<0x110000; ++cp) { 1853 int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME, 1854 buf, BUFSIZE, &ec); 1855 if (U_FAILURE(ec)) { 1856 log_err("FAIL: u_charName failed when it shouldn't\n"); 1857 uset_close(set); 1858 uset_close(dumb); 1859 return; 1860 } 1861 if(len>maxLength) { 1862 maxLength=len; 1863 } 1864 1865 for (i=0; i<len; ++i) { 1866 if (!map[(uint8_t) buf[i]]) { 1867 uset_add(dumb, (UChar32)u_charToUChar(buf[i])); 1868 map[(uint8_t) buf[i]] = TRUE; 1869 } 1870 } 1871 1872 /* test for leading/trailing whitespace */ 1873 if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') { 1874 log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp); 1875 } 1876 } 1877 1878 if(map[(uint8_t)'\t']) { 1879 log_err("u_charName() returned a name with a TAB for some code point\n", cp); 1880 } 1881 1882 length=uprv_getMaxCharNameLength(); 1883 if(length!=maxLength) { 1884 log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n", 1885 length, maxLength); 1886 } 1887 1888 /* compare the sets. Where is my uset_equals?!! */ 1889 ok=TRUE; 1890 for(i=0; i<256; ++i) { 1891 if(uset_contains(set, i)!=uset_contains(dumb, i)) { 1892 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) { 1893 /* ignore lowercase a-z that are in set but not in dumb */ 1894 ok=TRUE; 1895 } else { 1896 ok=FALSE; 1897 break; 1898 } 1899 } 1900 } 1901 1902 l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec); 1903 l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec); 1904 if (U_FAILURE(ec)) { 1905 log_err("FAIL: uset_toPattern failed when it shouldn't\n"); 1906 uset_close(set); 1907 uset_close(dumb); 1908 return; 1909 } 1910 1911 if (l1 >= BUFSIZE) { 1912 l1 = BUFSIZE-1; 1913 pat[l1] = 0; 1914 } 1915 if (l2 >= BUFSIZE) { 1916 l2 = BUFSIZE-1; 1917 dumbPat[l2] = 0; 1918 } 1919 1920 if (!ok) { 1921 log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n", 1922 aescstrdup(pat, l1), aescstrdup(dumbPat, l2)); 1923 } else if(getTestOption(VERBOSITY_OPTION)) { 1924 log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1)); 1925 } 1926 1927 uset_close(set); 1928 uset_close(dumb); 1929 } 1930 1931 /* ### TODO: test error cases and other interesting things */ 1932 } 1933 1934 /* test u_isMirrored() and u_charMirror() ----------------------------------- */ 1935 1936 static void 1937 TestMirroring() { 1938 USet *set; 1939 UErrorCode errorCode; 1940 1941 UChar32 start, end, c2, c3; 1942 int32_t i; 1943 1944 U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17); 1945 1946 U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17); 1947 1948 log_verbose("Testing u_isMirrored()\n"); 1949 if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) && 1950 !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400) 1951 ) 1952 ) { 1953 log_err("u_isMirrored() does not work correctly\n"); 1954 } 1955 1956 log_verbose("Testing u_charMirror()\n"); 1957 if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 && 1958 u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */ 1959 u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab && 1960 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */ 1961 u_charMirror(0x2018)==0x2018 && u_charMirror(0x201b)==0x201b && u_charMirror(0x301d)==0x301d 1962 ) 1963 ) { 1964 log_err("u_charMirror() does not work correctly\n"); 1965 } 1966 1967 /* verify that Bidi_Mirroring_Glyph roundtrips */ 1968 errorCode=U_ZERO_ERROR; 1969 set=uset_openPattern(mirroredPattern, 17, &errorCode); 1970 1971 if (U_FAILURE(errorCode)) { 1972 log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!\n"); 1973 } else { 1974 for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) { 1975 do { 1976 c2=u_charMirror(start); 1977 c3=u_charMirror(c2); 1978 if(c3!=start) { 1979 log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3); 1980 } 1981 } while(++start<=end); 1982 } 1983 } 1984 1985 uset_close(set); 1986 } 1987 1988 1989 struct RunTestData 1990 { 1991 const char *runText; 1992 UScriptCode runCode; 1993 }; 1994 1995 typedef struct RunTestData RunTestData; 1996 1997 static void 1998 CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns, 1999 const char *prefix) 2000 { 2001 int32_t run, runStart, runLimit; 2002 UScriptCode runCode; 2003 2004 /* iterate over all the runs */ 2005 run = 0; 2006 while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) { 2007 if (runStart != runStarts[run]) { 2008 log_err("%s: incorrect start offset for run %d: expected %d, got %d\n", 2009 prefix, run, runStarts[run], runStart); 2010 } 2011 2012 if (runLimit != runStarts[run + 1]) { 2013 log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n", 2014 prefix, run, runStarts[run + 1], runLimit); 2015 } 2016 2017 if (runCode != testData[run].runCode) { 2018 log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n", 2019 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode)); 2020 } 2021 2022 run += 1; 2023 2024 /* stop when we've seen all the runs we expect to see */ 2025 if (run >= nRuns) { 2026 break; 2027 } 2028 } 2029 2030 /* Complain if we didn't see then number of runs we expected */ 2031 if (run != nRuns) { 2032 log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns); 2033 } 2034 } 2035 2036 static void 2037 TestUScriptRunAPI() 2038 { 2039 static const RunTestData testData1[] = { 2040 {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI}, 2041 {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC}, 2042 {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC}, 2043 {"English (", USCRIPT_LATIN}, 2044 {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI}, 2045 {") ", USCRIPT_LATIN}, 2046 {"\\u6F22\\u5B75", USCRIPT_HAN}, 2047 {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA}, 2048 {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA}, 2049 {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET} 2050 }; 2051 2052 static const RunTestData testData2[] = { 2053 {"((((((((((abc))))))))))", USCRIPT_LATIN} 2054 }; 2055 2056 static const struct { 2057 const RunTestData *testData; 2058 int32_t nRuns; 2059 } testDataEntries[] = { 2060 {testData1, LENGTHOF(testData1)}, 2061 {testData2, LENGTHOF(testData2)} 2062 }; 2063 2064 static const int32_t nTestEntries = LENGTHOF(testDataEntries); 2065 int32_t testEntry; 2066 2067 for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) { 2068 UChar testString[1024]; 2069 int32_t runStarts[256]; 2070 int32_t nTestRuns = testDataEntries[testEntry].nRuns; 2071 const RunTestData *testData = testDataEntries[testEntry].testData; 2072 2073 int32_t run, stringLimit; 2074 UScriptRun *scriptRun = NULL; 2075 UErrorCode err; 2076 2077 /* 2078 * Fill in the test string and the runStarts array. 2079 */ 2080 stringLimit = 0; 2081 for (run = 0; run < nTestRuns; run += 1) { 2082 runStarts[run] = stringLimit; 2083 stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit); 2084 /*stringLimit -= 1;*/ 2085 } 2086 2087 /* The limit of the last run */ 2088 runStarts[nTestRuns] = stringLimit; 2089 2090 /* 2091 * Make sure that calling uscript_OpenRun with a NULL text pointer 2092 * and a non-zero text length returns the correct error. 2093 */ 2094 err = U_ZERO_ERROR; 2095 scriptRun = uscript_openRun(NULL, stringLimit, &err); 2096 2097 if (err != U_ILLEGAL_ARGUMENT_ERROR) { 2098 log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err)); 2099 } 2100 2101 if (scriptRun != NULL) { 2102 log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n"); 2103 uscript_closeRun(scriptRun); 2104 } 2105 2106 /* 2107 * Make sure that calling uscript_OpenRun with a non-NULL text pointer 2108 * and a zero text length returns the correct error. 2109 */ 2110 err = U_ZERO_ERROR; 2111 scriptRun = uscript_openRun(testString, 0, &err); 2112 2113 if (err != U_ILLEGAL_ARGUMENT_ERROR) { 2114 log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err)); 2115 } 2116 2117 if (scriptRun != NULL) { 2118 log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n"); 2119 uscript_closeRun(scriptRun); 2120 } 2121 2122 /* 2123 * Make sure that calling uscript_openRun with a NULL text pointer 2124 * and a zero text length doesn't return an error. 2125 */ 2126 err = U_ZERO_ERROR; 2127 scriptRun = uscript_openRun(NULL, 0, &err); 2128 2129 if (U_FAILURE(err)) { 2130 log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err)); 2131 } 2132 2133 /* Make sure that the empty iterator doesn't find any runs */ 2134 if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) { 2135 log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n"); 2136 } 2137 2138 /* 2139 * Make sure that calling uscript_setRunText with a NULL text pointer 2140 * and a non-zero text length returns the correct error. 2141 */ 2142 err = U_ZERO_ERROR; 2143 uscript_setRunText(scriptRun, NULL, stringLimit, &err); 2144 2145 if (err != U_ILLEGAL_ARGUMENT_ERROR) { 2146 log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err)); 2147 } 2148 2149 /* 2150 * Make sure that calling uscript_OpenRun with a non-NULL text pointer 2151 * and a zero text length returns the correct error. 2152 */ 2153 err = U_ZERO_ERROR; 2154 uscript_setRunText(scriptRun, testString, 0, &err); 2155 2156 if (err != U_ILLEGAL_ARGUMENT_ERROR) { 2157 log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err)); 2158 } 2159 2160 /* 2161 * Now call uscript_setRunText on the empty iterator 2162 * and make sure that it works. 2163 */ 2164 err = U_ZERO_ERROR; 2165 uscript_setRunText(scriptRun, testString, stringLimit, &err); 2166 2167 if (U_FAILURE(err)) { 2168 log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err)); 2169 } else { 2170 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText"); 2171 } 2172 2173 uscript_closeRun(scriptRun); 2174 2175 /* 2176 * Now open an interator over the testString 2177 * using uscript_openRun and make sure that it works 2178 */ 2179 scriptRun = uscript_openRun(testString, stringLimit, &err); 2180 2181 if (U_FAILURE(err)) { 2182 log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err)); 2183 } else { 2184 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun"); 2185 } 2186 2187 /* Now reset the iterator, and make sure 2188 * that it still works. 2189 */ 2190 uscript_resetRun(scriptRun); 2191 2192 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun"); 2193 2194 /* Close the iterator */ 2195 uscript_closeRun(scriptRun); 2196 } 2197 } 2198 2199 /* test additional, non-core properties */ 2200 static void 2201 TestAdditionalProperties() { 2202 /* test data for u_charAge() */ 2203 static const struct { 2204 UChar32 c; 2205 UVersionInfo version; 2206 } charAges[]={ 2207 {0x41, { 1, 1, 0, 0 }}, 2208 {0xffff, { 1, 1, 0, 0 }}, 2209 {0x20ab, { 2, 0, 0, 0 }}, 2210 {0x2fffe, { 2, 0, 0, 0 }}, 2211 {0x20ac, { 2, 1, 0, 0 }}, 2212 {0xfb1d, { 3, 0, 0, 0 }}, 2213 {0x3f4, { 3, 1, 0, 0 }}, 2214 {0x10300, { 3, 1, 0, 0 }}, 2215 {0x220, { 3, 2, 0, 0 }}, 2216 {0xff60, { 3, 2, 0, 0 }} 2217 }; 2218 2219 /* test data for u_hasBinaryProperty() */ 2220 static const int32_t 2221 props[][3]={ /* code point, property, value */ 2222 { 0x0627, UCHAR_ALPHABETIC, TRUE }, 2223 { 0x1034a, UCHAR_ALPHABETIC, TRUE }, 2224 { 0x2028, UCHAR_ALPHABETIC, FALSE }, 2225 2226 { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE }, 2227 { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE }, 2228 2229 { 0x202c, UCHAR_BIDI_CONTROL, TRUE }, 2230 { 0x202f, UCHAR_BIDI_CONTROL, FALSE }, 2231 2232 { 0x003c, UCHAR_BIDI_MIRRORED, TRUE }, 2233 { 0x003d, UCHAR_BIDI_MIRRORED, FALSE }, 2234 2235 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */ 2236 { 0x2018, UCHAR_BIDI_MIRRORED, FALSE }, 2237 { 0x201d, UCHAR_BIDI_MIRRORED, FALSE }, 2238 { 0x201f, UCHAR_BIDI_MIRRORED, FALSE }, 2239 { 0x301e, UCHAR_BIDI_MIRRORED, FALSE }, 2240 2241 { 0x058a, UCHAR_DASH, TRUE }, 2242 { 0x007e, UCHAR_DASH, FALSE }, 2243 2244 { 0x0c4d, UCHAR_DIACRITIC, TRUE }, 2245 { 0x3000, UCHAR_DIACRITIC, FALSE }, 2246 2247 { 0x0e46, UCHAR_EXTENDER, TRUE }, 2248 { 0x0020, UCHAR_EXTENDER, FALSE }, 2249 2250 #if !UCONFIG_NO_NORMALIZATION 2251 { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE }, 2252 { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE }, 2253 { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE }, 2254 2255 { 0x110a, UCHAR_NFD_INERT, TRUE }, /* Jamo L */ 2256 { 0x0308, UCHAR_NFD_INERT, FALSE }, 2257 2258 { 0x1164, UCHAR_NFKD_INERT, TRUE }, /* Jamo V */ 2259 { 0x1d79d, UCHAR_NFKD_INERT, FALSE }, /* math compat version of xi */ 2260 2261 { 0x0021, UCHAR_NFC_INERT, TRUE }, /* ! */ 2262 { 0x0061, UCHAR_NFC_INERT, FALSE }, /* a */ 2263 { 0x00e4, UCHAR_NFC_INERT, FALSE }, /* a-umlaut */ 2264 { 0x0102, UCHAR_NFC_INERT, FALSE }, /* a-breve */ 2265 { 0xac1c, UCHAR_NFC_INERT, FALSE }, /* Hangul LV */ 2266 { 0xac1d, UCHAR_NFC_INERT, TRUE }, /* Hangul LVT */ 2267 2268 { 0x1d79d, UCHAR_NFKC_INERT, FALSE }, /* math compat version of xi */ 2269 { 0x2a6d6, UCHAR_NFKC_INERT, TRUE }, /* Han, last of CJK ext. B */ 2270 2271 { 0x00e4, UCHAR_SEGMENT_STARTER, TRUE }, 2272 { 0x0308, UCHAR_SEGMENT_STARTER, FALSE }, 2273 { 0x110a, UCHAR_SEGMENT_STARTER, TRUE }, /* Jamo L */ 2274 { 0x1164, UCHAR_SEGMENT_STARTER, FALSE },/* Jamo V */ 2275 { 0xac1c, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LV */ 2276 { 0xac1d, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LVT */ 2277 #endif 2278 2279 { 0x0044, UCHAR_HEX_DIGIT, TRUE }, 2280 { 0xff46, UCHAR_HEX_DIGIT, TRUE }, 2281 { 0x0047, UCHAR_HEX_DIGIT, FALSE }, 2282 2283 { 0x30fb, UCHAR_HYPHEN, TRUE }, 2284 { 0xfe58, UCHAR_HYPHEN, FALSE }, 2285 2286 { 0x2172, UCHAR_ID_CONTINUE, TRUE }, 2287 { 0x0307, UCHAR_ID_CONTINUE, TRUE }, 2288 { 0x005c, UCHAR_ID_CONTINUE, FALSE }, 2289 2290 { 0x2172, UCHAR_ID_START, TRUE }, 2291 { 0x007a, UCHAR_ID_START, TRUE }, 2292 { 0x0039, UCHAR_ID_START, FALSE }, 2293 2294 { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE }, 2295 { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE }, 2296 { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE }, 2297 2298 { 0x200c, UCHAR_JOIN_CONTROL, TRUE }, 2299 { 0x2029, UCHAR_JOIN_CONTROL, FALSE }, 2300 2301 { 0x1d7bc, UCHAR_LOWERCASE, TRUE }, 2302 { 0x0345, UCHAR_LOWERCASE, TRUE }, 2303 { 0x0030, UCHAR_LOWERCASE, FALSE }, 2304 2305 { 0x1d7a9, UCHAR_MATH, TRUE }, 2306 { 0x2135, UCHAR_MATH, TRUE }, 2307 { 0x0062, UCHAR_MATH, FALSE }, 2308 2309 { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE }, 2310 { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE }, 2311 { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE }, 2312 2313 { 0x0022, UCHAR_QUOTATION_MARK, TRUE }, 2314 { 0xff62, UCHAR_QUOTATION_MARK, TRUE }, 2315 { 0xd840, UCHAR_QUOTATION_MARK, FALSE }, 2316 2317 { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE }, 2318 { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE }, 2319 2320 { 0x1d44a, UCHAR_UPPERCASE, TRUE }, 2321 { 0x2162, UCHAR_UPPERCASE, TRUE }, 2322 { 0x0345, UCHAR_UPPERCASE, FALSE }, 2323 2324 { 0x0020, UCHAR_WHITE_SPACE, TRUE }, 2325 { 0x202f, UCHAR_WHITE_SPACE, TRUE }, 2326 { 0x3001, UCHAR_WHITE_SPACE, FALSE }, 2327 2328 { 0x0711, UCHAR_XID_CONTINUE, TRUE }, 2329 { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE }, 2330 { 0x007c, UCHAR_XID_CONTINUE, FALSE }, 2331 2332 { 0x16ee, UCHAR_XID_START, TRUE }, 2333 { 0x23456, UCHAR_XID_START, TRUE }, 2334 { 0x1d1aa, UCHAR_XID_START, FALSE }, 2335 2336 /* 2337 * Version break: 2338 * The following properties are only supported starting with the 2339 * Unicode version indicated in the second field. 2340 */ 2341 { -1, 0x320, 0 }, 2342 2343 { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE }, 2344 { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE }, 2345 { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE }, 2346 2347 { 0x0149, UCHAR_DEPRECATED, TRUE }, /* changed in Unicode 5.2 */ 2348 { 0x0341, UCHAR_DEPRECATED, FALSE }, /* changed in Unicode 5.2 */ 2349 { 0xe0041, UCHAR_DEPRECATED, TRUE }, /* changed from Unicode 5 to 5.1 */ 2350 { 0xe0100, UCHAR_DEPRECATED, FALSE }, 2351 2352 { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE }, 2353 { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE }, 2354 { 0xff9d, UCHAR_GRAPHEME_BASE, TRUE }, 2355 { 0xff9f, UCHAR_GRAPHEME_BASE, FALSE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */ 2356 2357 { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE }, 2358 { 0xff9d, UCHAR_GRAPHEME_EXTEND, FALSE }, 2359 { 0xff9f, UCHAR_GRAPHEME_EXTEND, TRUE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */ 2360 { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE }, 2361 2362 { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE }, 2363 { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE }, 2364 2365 { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE }, 2366 { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE }, 2367 2368 { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE }, 2369 { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE }, 2370 2371 { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE }, 2372 { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE }, 2373 2374 { 0x2e9b, UCHAR_RADICAL, TRUE }, 2375 { 0x4e00, UCHAR_RADICAL, FALSE }, 2376 2377 { 0x012f, UCHAR_SOFT_DOTTED, TRUE }, 2378 { 0x0049, UCHAR_SOFT_DOTTED, FALSE }, 2379 2380 { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE }, 2381 { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE }, 2382 2383 { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */ 2384 2385 { 0x002e, UCHAR_S_TERM, TRUE }, 2386 { 0x0061, UCHAR_S_TERM, FALSE }, 2387 2388 { 0x180c, UCHAR_VARIATION_SELECTOR, TRUE }, 2389 { 0xfe03, UCHAR_VARIATION_SELECTOR, TRUE }, 2390 { 0xe01ef, UCHAR_VARIATION_SELECTOR, TRUE }, 2391 { 0xe0200, UCHAR_VARIATION_SELECTOR, FALSE }, 2392 2393 /* enum/integer type properties */ 2394 2395 /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */ 2396 /* test default Bidi classes for unassigned code points */ 2397 { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2398 { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2399 { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2400 { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */ 2401 { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */ 2402 { 0x089f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2403 { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2404 { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2405 { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2406 { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2407 { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2408 2409 { 0x0605, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2410 { 0x061c, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2411 { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2412 { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2413 { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2414 { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2415 { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2416 { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2417 2418 { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS }, 2419 { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU }, 2420 { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS }, 2421 { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG }, 2422 { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU }, 2423 { 0x1AFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK }, 2424 { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA }, 2425 { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS }, 2426 { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK }, 2427 { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK }, 2428 { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B }, 2429 2430 /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */ 2431 { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 }, 2432 2433 { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK }, 2434 { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT }, 2435 { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE }, 2436 { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL }, 2437 { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL }, 2438 { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL }, 2439 { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL }, 2440 { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT }, 2441 { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE }, 2442 2443 { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL }, 2444 { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW }, 2445 { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS }, 2446 { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH }, 2447 { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, 2448 { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH }, 2449 { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, 2450 { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, 2451 { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, 2452 { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS }, 2453 { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL }, 2454 { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, 2455 { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, 2456 { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */ 2457 { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL }, 2458 { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS }, 2459 { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS }, 2460 2461 /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */ 2462 { 0xd7c7, UCHAR_GENERAL_CATEGORY, 0 }, 2463 { 0xd7d7, UCHAR_GENERAL_CATEGORY, U_OTHER_LETTER }, /* changed in Unicode 5.2 */ 2464 2465 { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP }, 2466 { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN }, 2467 { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH }, 2468 { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH }, 2469 { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL }, 2470 2471 { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING }, 2472 { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING }, 2473 { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING }, 2474 { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING }, 2475 { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING }, 2476 { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT }, 2477 { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT }, 2478 { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT }, 2479 2480 /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */ 2481 { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN }, 2482 { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN }, 2483 { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION }, 2484 { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION }, 2485 { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC }, 2486 { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC }, 2487 { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC }, 2488 { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC }, 2489 { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE }, 2490 { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE }, 2491 { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE }, 2492 { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION }, 2493 { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS }, 2494 { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC }, 2495 { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC }, 2496 2497 /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */ 2498 2499 /* UCHAR_SCRIPT tested in TestUScriptCodeAPI() */ 2500 2501 { 0x10ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2502 { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, 2503 { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, 2504 { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, 2505 { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */ 2506 { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */ 2507 { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, 2508 2509 { 0xa95f, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2510 { 0xa960, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */ 2511 { 0xa97c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, /* changed in Unicode 5.2 */ 2512 { 0xa97d, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2513 2514 { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, 2515 { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, 2516 { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, 2517 { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, 2518 { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */ 2519 { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */ 2520 2521 { 0xd7af, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2522 { 0xd7b0, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */ 2523 { 0xd7c6, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, /* changed in Unicode 5.2 */ 2524 { 0xd7c7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2525 2526 { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, 2527 { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, 2528 { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, 2529 { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, 2530 { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */ 2531 { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */ 2532 { 0x1200, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2533 2534 { 0xd7ca, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2535 { 0xd7cb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */ 2536 { 0xd7fb, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, /* changed in Unicode 5.2 */ 2537 { 0xd7fc, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2538 2539 { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE }, 2540 { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE }, 2541 { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE }, 2542 { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE }, 2543 2544 { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE }, 2545 { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE }, 2546 { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE }, 2547 { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE }, 2548 { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE }, 2549 2550 { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2551 2552 { -1, 0x410, 0 }, /* version break for Unicode 4.1 */ 2553 2554 { 0x00d7, UCHAR_PATTERN_SYNTAX, TRUE }, 2555 { 0xfe45, UCHAR_PATTERN_SYNTAX, TRUE }, 2556 { 0x0061, UCHAR_PATTERN_SYNTAX, FALSE }, 2557 2558 { 0x0020, UCHAR_PATTERN_WHITE_SPACE, TRUE }, 2559 { 0x0085, UCHAR_PATTERN_WHITE_SPACE, TRUE }, 2560 { 0x200f, UCHAR_PATTERN_WHITE_SPACE, TRUE }, 2561 { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, FALSE }, 2562 { 0x3000, UCHAR_PATTERN_WHITE_SPACE, FALSE }, 2563 2564 { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION }, 2565 { 0x2c8e, UCHAR_BLOCK, UBLOCK_COPTIC }, 2566 { 0xfe17, UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS }, 2567 2568 { 0x1a00, UCHAR_SCRIPT, USCRIPT_BUGINESE }, 2569 { 0x2cea, UCHAR_SCRIPT, USCRIPT_COPTIC }, 2570 { 0xa82b, UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI }, 2571 { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN }, 2572 2573 { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 }, 2574 { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 }, 2575 { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 }, 2576 { 0x115f, UCHAR_LINE_BREAK, U_LB_JL }, 2577 { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT }, 2578 { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV }, 2579 2580 { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT }, 2581 { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND }, 2582 { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL }, 2583 { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V }, 2584 2585 { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER }, 2586 { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER }, 2587 { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC }, 2588 { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM }, 2589 2590 { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER }, 2591 { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER }, 2592 { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE }, 2593 { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP }, 2594 2595 { -1, 0x520, 0 }, /* version break for Unicode 5.2 */ 2596 2597 /* unassigned code points in new default Bidi R blocks */ 2598 { 0x1ede4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2599 { 0x1efe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2600 2601 /* test some script codes >127 */ 2602 { 0xa6e6, UCHAR_SCRIPT, USCRIPT_BAMUM }, 2603 { 0xa4d0, UCHAR_SCRIPT, USCRIPT_LISU }, 2604 { 0x10a7f, UCHAR_SCRIPT, USCRIPT_OLD_SOUTH_ARABIAN }, 2605 2606 { -1, 0x600, 0 }, /* version break for Unicode 6.0 */ 2607 2608 /* value changed in Unicode 6.0 */ 2609 { 0x06C3, UCHAR_JOINING_GROUP, U_JG_TEH_MARBUTA_GOAL }, 2610 2611 { -1, 0x610, 0 }, /* version break for Unicode 6.1 */ 2612 2613 /* unassigned code points in new/changed default Bidi AL blocks */ 2614 { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2615 { 0x1eee4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2616 2617 /* undefined UProperty values */ 2618 { 0x61, 0x4a7, 0 }, 2619 { 0x234bc, 0x15ed, 0 } 2620 }; 2621 2622 UVersionInfo version; 2623 UChar32 c; 2624 int32_t i, result, uVersion; 2625 UProperty which; 2626 2627 /* what is our Unicode version? */ 2628 u_getUnicodeVersion(version); 2629 uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers */ 2630 2631 u_charAge(0x20, version); 2632 if(version[0]==0) { 2633 /* no additional properties available */ 2634 log_err("TestAdditionalProperties: no additional properties available, not tested\n"); 2635 return; 2636 } 2637 2638 /* test u_charAge() */ 2639 for(i=0; i<sizeof(charAges)/sizeof(charAges[0]); ++i) { 2640 u_charAge(charAges[i].c, version); 2641 if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) { 2642 log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n", 2643 charAges[i].c, 2644 version[0], version[1], version[2], version[3], 2645 charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]); 2646 } 2647 } 2648 2649 if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 || 2650 u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 || 2651 u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 || /* j2478 */ 2652 u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/ 2653 u_getIntPropertyMinValue(0x2345)!=0 2654 ) { 2655 log_err("error: u_getIntPropertyMinValue() wrong\n"); 2656 } 2657 if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) { 2658 log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n"); 2659 } 2660 if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) { 2661 log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n"); 2662 } 2663 if( u_getIntPropertyMaxValue((UProperty)(UCHAR_BINARY_LIMIT-1))!=1) { 2664 log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n"); 2665 } 2666 if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) { 2667 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n"); 2668 } 2669 if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) { 2670 log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n"); 2671 } 2672 if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) { 2673 log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n"); 2674 } 2675 if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) { 2676 log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n"); 2677 } 2678 if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) { 2679 log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n"); 2680 } 2681 if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) { 2682 log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n"); 2683 } 2684 if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) { 2685 log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n"); 2686 } 2687 if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) { 2688 log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n"); 2689 } 2690 if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) { 2691 log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n"); 2692 } 2693 if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) { 2694 log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n"); 2695 } 2696 /*JB#2410*/ 2697 if( u_getIntPropertyMaxValue(0x2345)!=-1) { 2698 log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n"); 2699 } 2700 if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) { 2701 log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n"); 2702 } 2703 if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) != (int32_t) (U_JG_COUNT -1)) { 2704 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n"); 2705 } 2706 if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) { 2707 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n"); 2708 } 2709 if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) { 2710 log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n"); 2711 } 2712 2713 /* test u_hasBinaryProperty() and u_getIntPropertyValue() */ 2714 for(i=0; i<sizeof(props)/sizeof(props[0]); ++i) { 2715 const char *whichName; 2716 2717 if(props[i][0]<0) { 2718 /* Unicode version break */ 2719 if(uVersion<props[i][1]) { 2720 break; /* do not test properties that are not yet supported */ 2721 } else { 2722 continue; /* skip this row */ 2723 } 2724 } 2725 2726 c=(UChar32)props[i][0]; 2727 which=(UProperty)props[i][1]; 2728 whichName=u_getPropertyName(which, U_LONG_PROPERTY_NAME); 2729 2730 if(which<UCHAR_INT_START) { 2731 result=u_hasBinaryProperty(c, which); 2732 if(result!=props[i][2]) { 2733 log_data_err("error: u_hasBinaryProperty(U+%04lx, %s)=%d is wrong (props[%d]) - (Are you missing data?)\n", 2734 c, whichName, result, i); 2735 } 2736 } 2737 2738 result=u_getIntPropertyValue(c, which); 2739 if(result!=props[i][2]) { 2740 log_data_err("error: u_getIntPropertyValue(U+%04lx, %s)=%d is wrong, should be %d (props[%d]) - (Are you missing data?)\n", 2741 c, whichName, result, props[i][2], i); 2742 } 2743 2744 /* test separate functions, too */ 2745 switch((UProperty)props[i][1]) { 2746 case UCHAR_ALPHABETIC: 2747 if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) { 2748 log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n", 2749 props[i][0], result, i); 2750 } 2751 break; 2752 case UCHAR_LOWERCASE: 2753 if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) { 2754 log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n", 2755 props[i][0], result, i); 2756 } 2757 break; 2758 case UCHAR_UPPERCASE: 2759 if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) { 2760 log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n", 2761 props[i][0], result, i); 2762 } 2763 break; 2764 case UCHAR_WHITE_SPACE: 2765 if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) { 2766 log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n", 2767 props[i][0], result, i); 2768 } 2769 break; 2770 default: 2771 break; 2772 } 2773 } 2774 } 2775 2776 static void 2777 TestNumericProperties(void) { 2778 /* see UnicodeData.txt, DerivedNumericValues.txt */ 2779 static const struct { 2780 UChar32 c; 2781 int32_t type; 2782 double numValue; 2783 } values[]={ 2784 { 0x12456, U_NT_NUMERIC, -1. }, 2785 { 0x12457, U_NT_NUMERIC, -1. }, 2786 { 0x0F33, U_NT_NUMERIC, -1./2. }, 2787 { 0x0C66, U_NT_DECIMAL, 0 }, 2788 { 0x96f6, U_NT_NUMERIC, 0 }, 2789 { 0xa833, U_NT_NUMERIC, 1./16. }, 2790 { 0x2152, U_NT_NUMERIC, 1./10. }, 2791 { 0x2151, U_NT_NUMERIC, 1./9. }, 2792 { 0x1245f, U_NT_NUMERIC, 1./8. }, 2793 { 0x2150, U_NT_NUMERIC, 1./7. }, 2794 { 0x2159, U_NT_NUMERIC, 1./6. }, 2795 { 0x09f6, U_NT_NUMERIC, 3./16. }, 2796 { 0x2155, U_NT_NUMERIC, 1./5. }, 2797 { 0x00BD, U_NT_NUMERIC, 1./2. }, 2798 { 0x0031, U_NT_DECIMAL, 1. }, 2799 { 0x4e00, U_NT_NUMERIC, 1. }, 2800 { 0x58f1, U_NT_NUMERIC, 1. }, 2801 { 0x10320, U_NT_NUMERIC, 1. }, 2802 { 0x0F2B, U_NT_NUMERIC, 3./2. }, 2803 { 0x00B2, U_NT_DIGIT, 2. }, 2804 { 0x5f10, U_NT_NUMERIC, 2. }, 2805 { 0x1813, U_NT_DECIMAL, 3. }, 2806 { 0x5f0e, U_NT_NUMERIC, 3. }, 2807 { 0x2173, U_NT_NUMERIC, 4. }, 2808 { 0x8086, U_NT_NUMERIC, 4. }, 2809 { 0x278E, U_NT_DIGIT, 5. }, 2810 { 0x1D7F2, U_NT_DECIMAL, 6. }, 2811 { 0x247A, U_NT_DIGIT, 7. }, 2812 { 0x7396, U_NT_NUMERIC, 9. }, 2813 { 0x1372, U_NT_NUMERIC, 10. }, 2814 { 0x216B, U_NT_NUMERIC, 12. }, 2815 { 0x16EE, U_NT_NUMERIC, 17. }, 2816 { 0x249A, U_NT_NUMERIC, 19. }, 2817 { 0x303A, U_NT_NUMERIC, 30. }, 2818 { 0x5345, U_NT_NUMERIC, 30. }, 2819 { 0x32B2, U_NT_NUMERIC, 37. }, 2820 { 0x1375, U_NT_NUMERIC, 40. }, 2821 { 0x10323, U_NT_NUMERIC, 50. }, 2822 { 0x0BF1, U_NT_NUMERIC, 100. }, 2823 { 0x964c, U_NT_NUMERIC, 100. }, 2824 { 0x217E, U_NT_NUMERIC, 500. }, 2825 { 0x2180, U_NT_NUMERIC, 1000. }, 2826 { 0x4edf, U_NT_NUMERIC, 1000. }, 2827 { 0x2181, U_NT_NUMERIC, 5000. }, 2828 { 0x137C, U_NT_NUMERIC, 10000. }, 2829 { 0x4e07, U_NT_NUMERIC, 10000. }, 2830 { 0x12432, U_NT_NUMERIC, 216000. }, 2831 { 0x12433, U_NT_NUMERIC, 432000. }, 2832 { 0x4ebf, U_NT_NUMERIC, 100000000. }, 2833 { 0x5146, U_NT_NUMERIC, 1000000000000. }, 2834 { -1, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2835 { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2836 { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2837 { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2838 { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2839 { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2840 { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2841 { 0x110000, U_NT_NONE, U_NO_NUMERIC_VALUE } 2842 }; 2843 2844 double nv; 2845 UChar32 c; 2846 int32_t i, type; 2847 2848 for(i=0; i<LENGTHOF(values); ++i) { 2849 c=values[i].c; 2850 type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE); 2851 nv=u_getNumericValue(c); 2852 2853 if(type!=values[i].type) { 2854 log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type); 2855 } 2856 if(0.000001 <= fabs(nv - values[i].numValue)) { 2857 log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue); 2858 } 2859 } 2860 } 2861 2862 /** 2863 * Test the property names and property value names API. 2864 */ 2865 static void 2866 TestPropertyNames(void) { 2867 int32_t p, v, choice=0, rev; 2868 UBool atLeastSomething = FALSE; 2869 2870 for (p=0; ; ++p) { 2871 UProperty propEnum = (UProperty)p; 2872 UBool sawProp = FALSE; 2873 if(p > 10 && !atLeastSomething) { 2874 log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice); 2875 return; 2876 } 2877 2878 for (choice=0; ; ++choice) { 2879 const char* name = u_getPropertyName(propEnum, (UPropertyNameChoice)choice); 2880 if (name) { 2881 if (!sawProp) 2882 log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff); 2883 log_verbose("%d=\"%s\"", choice, name); 2884 sawProp = TRUE; 2885 atLeastSomething = TRUE; 2886 2887 /* test reverse mapping */ 2888 rev = u_getPropertyEnum(name); 2889 if (rev != p) { 2890 log_err("Property round-trip failure: %d -> %s -> %d\n", 2891 p, name, rev); 2892 } 2893 } 2894 if (!name && choice>0) break; 2895 } 2896 if (sawProp) { 2897 /* looks like a valid property; check the values */ 2898 const char* pname = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME); 2899 int32_t max = 0; 2900 if (p == UCHAR_CANONICAL_COMBINING_CLASS) { 2901 max = 255; 2902 } else if (p == UCHAR_GENERAL_CATEGORY_MASK) { 2903 /* it's far too slow to iterate all the way up to 2904 the real max, U_GC_P_MASK */ 2905 max = U_GC_NL_MASK; 2906 } else if (p == UCHAR_BLOCK) { 2907 /* UBlockCodes, unlike other values, start at 1 */ 2908 max = 1; 2909 } 2910 log_verbose("\n"); 2911 for (v=-1; ; ++v) { 2912 UBool sawValue = FALSE; 2913 for (choice=0; ; ++choice) { 2914 const char* vname = u_getPropertyValueName(propEnum, v, (UPropertyNameChoice)choice); 2915 if (vname) { 2916 if (!sawValue) log_verbose(" %s, value %d:", pname, v); 2917 log_verbose("%d=\"%s\"", choice, vname); 2918 sawValue = TRUE; 2919 2920 /* test reverse mapping */ 2921 rev = u_getPropertyValueEnum(propEnum, vname); 2922 if (rev != v) { 2923 log_err("Value round-trip failure (%s): %d -> %s -> %d\n", 2924 pname, v, vname, rev); 2925 } 2926 } 2927 if (!vname && choice>0) break; 2928 } 2929 if (sawValue) { 2930 log_verbose("\n"); 2931 } 2932 if (!sawValue && v>=max) break; 2933 } 2934 } 2935 if (!sawProp) { 2936 if (p>=UCHAR_STRING_LIMIT) { 2937 break; 2938 } else if (p>=UCHAR_DOUBLE_LIMIT) { 2939 p = UCHAR_STRING_START - 1; 2940 } else if (p>=UCHAR_MASK_LIMIT) { 2941 p = UCHAR_DOUBLE_START - 1; 2942 } else if (p>=UCHAR_INT_LIMIT) { 2943 p = UCHAR_MASK_START - 1; 2944 } else if (p>=UCHAR_BINARY_LIMIT) { 2945 p = UCHAR_INT_START - 1; 2946 } 2947 } 2948 } 2949 } 2950 2951 /** 2952 * Test the property values API. See JB#2410. 2953 */ 2954 static void 2955 TestPropertyValues(void) { 2956 int32_t i, p, min, max; 2957 UErrorCode ec; 2958 2959 /* Min should be 0 for everything. */ 2960 /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */ 2961 for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) { 2962 UProperty propEnum = (UProperty)p; 2963 min = u_getIntPropertyMinValue(propEnum); 2964 if (min != 0) { 2965 if (p == UCHAR_BLOCK) { 2966 /* This is okay...for now. See JB#2487. 2967 TODO Update this for JB#2487. */ 2968 } else { 2969 const char* name; 2970 name = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME); 2971 if (name == NULL) 2972 name = "<ERROR>"; 2973 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n", 2974 name, min); 2975 } 2976 } 2977 } 2978 2979 if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 || 2980 u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) { 2981 log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n"); 2982 } 2983 2984 /* Max should be -1 for invalid properties. */ 2985 max = u_getIntPropertyMaxValue(UCHAR_INVALID_CODE); 2986 if (max != -1) { 2987 log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n", 2988 max); 2989 } 2990 2991 /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */ 2992 for (i=0; i<2; ++i) { 2993 int32_t script; 2994 const char* desc; 2995 ec = U_ZERO_ERROR; 2996 switch (i) { 2997 case 0: 2998 script = uscript_getScript(-1, &ec); 2999 desc = "uscript_getScript(-1)"; 3000 break; 3001 case 1: 3002 script = u_getIntPropertyValue(-1, UCHAR_SCRIPT); 3003 desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)"; 3004 break; 3005 default: 3006 log_err("Internal test error. Too many scripts\n"); 3007 return; 3008 } 3009 /* We don't explicitly test ec. It should be U_FAILURE but it 3010 isn't documented as such. */ 3011 if (script != (int32_t)USCRIPT_INVALID_CODE) { 3012 log_err("FAIL: %s = %d, exp. 0\n", 3013 desc, script); 3014 } 3015 } 3016 } 3017 3018 /* various tests for consistency of UCD data and API behavior */ 3019 static void 3020 TestConsistency() { 3021 char buffer[300]; 3022 USet *set1, *set2, *set3, *set4; 3023 UErrorCode errorCode; 3024 3025 UChar32 start, end; 3026 int32_t i, length; 3027 3028 U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10); 3029 U_STRING_DECL(dashPattern, "[:Dash:]", 8); 3030 U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13); 3031 U_STRING_DECL(formatPattern, "[:Cf:]", 6); 3032 U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14); 3033 3034 U_STRING_DECL(mathBlocksPattern, 3035 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]", 3036 1+32+46+46+45+43+1+1); /* +1 for NUL */ 3037 U_STRING_DECL(mathPattern, "[:Math:]", 8); 3038 U_STRING_DECL(unassignedPattern, "[:Cn:]", 6); 3039 U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14); 3040 U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20); 3041 3042 U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10); 3043 U_STRING_INIT(dashPattern, "[:Dash:]", 8); 3044 U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13); 3045 U_STRING_INIT(formatPattern, "[:Cf:]", 6); 3046 U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14); 3047 3048 U_STRING_INIT(mathBlocksPattern, 3049 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]", 3050 1+32+46+46+45+43+1+1); /* +1 for NUL */ 3051 U_STRING_INIT(mathPattern, "[:Math:]", 8); 3052 U_STRING_INIT(unassignedPattern, "[:Cn:]", 6); 3053 U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14); 3054 U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20); 3055 3056 /* 3057 * It used to be that UCD.html and its precursors said 3058 * "Those dashes used to mark connections between pieces of words, 3059 * plus the Katakana middle dot." 3060 * 3061 * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash 3062 * but not from Hyphen. 3063 * UTC 94 (2003mar) decided to leave it that way and to change UCD.html. 3064 * Therefore, do not show errors when testing the Hyphen property. 3065 */ 3066 log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n" 3067 "known to the UTC and not considered errors.\n"); 3068 3069 errorCode=U_ZERO_ERROR; 3070 set1=uset_openPattern(hyphenPattern, 10, &errorCode); 3071 set2=uset_openPattern(dashPattern, 8, &errorCode); 3072 if(U_SUCCESS(errorCode)) { 3073 /* remove the Katakana middle dot(s) from set1 */ 3074 uset_remove(set1, 0x30fb); 3075 uset_remove(set1, 0xff65); /* halfwidth variant */ 3076 showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE); 3077 } else { 3078 log_data_err("error opening [:Hyphen:] or [:Dash:] - %s (Are you missing data?)\n", u_errorName(errorCode)); 3079 } 3080 3081 /* check that Cf is neither Hyphen nor Dash nor Alphabetic */ 3082 set3=uset_openPattern(formatPattern, 6, &errorCode); 3083 set4=uset_openPattern(alphaPattern, 14, &errorCode); 3084 if(U_SUCCESS(errorCode)) { 3085 showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE); 3086 showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE); 3087 showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE); 3088 } else { 3089 log_data_err("error opening [:Cf:] or [:Alpbabetic:] - %s (Are you missing data?)\n", u_errorName(errorCode)); 3090 } 3091 3092 uset_close(set1); 3093 uset_close(set2); 3094 uset_close(set3); 3095 uset_close(set4); 3096 3097 /* 3098 * Check that each lowercase character has "small" in its name 3099 * and not "capital". 3100 * There are some such characters, some of which seem odd. 3101 * Use the verbose flag to see these notices. 3102 */ 3103 errorCode=U_ZERO_ERROR; 3104 set1=uset_openPattern(lowerPattern, 13, &errorCode); 3105 if(U_SUCCESS(errorCode)) { 3106 for(i=0;; ++i) { 3107 length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode); 3108 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) { 3109 break; /* done */ 3110 } 3111 if(U_FAILURE(errorCode)) { 3112 log_err("error iterating over [:Lowercase:] at item %d: %s\n", 3113 i, u_errorName(errorCode)); 3114 break; 3115 } 3116 if(length!=0) { 3117 break; /* done with code points, got a string or -1 */ 3118 } 3119 3120 while(start<=end) { 3121 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode); 3122 if(U_FAILURE(errorCode)) { 3123 log_data_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode)); 3124 errorCode=U_ZERO_ERROR; 3125 } 3126 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) && 3127 strstr(buffer, "SMALL CAPITAL")==NULL 3128 ) { 3129 log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer); 3130 } 3131 ++start; 3132 } 3133 } 3134 } else { 3135 log_data_err("error opening [:Lowercase:] - %s (Are you missing data?)\n", u_errorName(errorCode)); 3136 } 3137 uset_close(set1); 3138 3139 /* verify that all assigned characters in Math blocks are exactly Math characters */ 3140 errorCode=U_ZERO_ERROR; 3141 set1=uset_openPattern(mathBlocksPattern, -1, &errorCode); 3142 set2=uset_openPattern(mathPattern, 8, &errorCode); 3143 set3=uset_openPattern(unassignedPattern, 6, &errorCode); 3144 if(U_SUCCESS(errorCode)) { 3145 uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */ 3146 uset_complement(set3); /* assigned characters */ 3147 uset_retainAll(set1, set3); /* [math blocks]&[assigned] */ 3148 compareUSets(set1, set2, 3149 "[assigned Math block chars]", "[math blocks]&[:Math:]", 3150 TRUE); 3151 } else { 3152 log_data_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s (Are you missing data?)\n", u_errorName(errorCode)); 3153 } 3154 uset_close(set1); 3155 uset_close(set2); 3156 uset_close(set3); 3157 3158 /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */ 3159 errorCode=U_ZERO_ERROR; 3160 set1=uset_openPattern(unknownPattern, 14, &errorCode); 3161 set2=uset_openPattern(reservedPattern, 20, &errorCode); 3162 if(U_SUCCESS(errorCode)) { 3163 compareUSets(set1, set2, 3164 "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]", 3165 TRUE); 3166 } else { 3167 log_data_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s (Are you missing data?)\n", u_errorName(errorCode)); 3168 } 3169 uset_close(set1); 3170 uset_close(set2); 3171 } 3172 3173 /* 3174 * Starting with ICU4C 3.4, the core Unicode properties files 3175 * (uprops.icu, ucase.icu, ubidi.icu, unorm.icu) 3176 * are hardcoded in the common DLL and therefore not included 3177 * in the data package any more. 3178 * Test requiring these files are disabled so that 3179 * we need not jump through hoops (like adding snapshots of these files 3180 * to testdata). 3181 * See Jitterbug 4497. 3182 */ 3183 #define HARDCODED_DATA_4497 1 3184 3185 /* API coverage for ucase.c */ 3186 static void TestUCase() { 3187 #if !HARDCODED_DATA_4497 3188 UDataMemory *pData; 3189 UCaseProps *csp; 3190 const UCaseProps *ccsp; 3191 UErrorCode errorCode; 3192 3193 /* coverage for ucase_openBinary() */ 3194 errorCode=U_ZERO_ERROR; 3195 pData=udata_open(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, &errorCode); 3196 if(U_FAILURE(errorCode)) { 3197 log_data_err("unable to open " UCASE_DATA_NAME "." UCASE_DATA_TYPE ": %s\n", 3198 u_errorName(errorCode)); 3199 return; 3200 } 3201 3202 csp=ucase_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode); 3203 if(U_FAILURE(errorCode)) { 3204 log_err("ucase_openBinary() fails for the contents of " UCASE_DATA_NAME "." UCASE_DATA_TYPE ": %s\n", 3205 u_errorName(errorCode)); 3206 udata_close(pData); 3207 return; 3208 } 3209 3210 if(UCASE_LOWER!=ucase_getType(csp, 0xdf)) { /* verify islower(sharp s) */ 3211 log_err("ucase_openBinary() does not seem to return working UCaseProps\n"); 3212 } 3213 3214 ucase_close(csp); 3215 udata_close(pData); 3216 3217 /* coverage for ucase_getDummy() */ 3218 errorCode=U_ZERO_ERROR; 3219 ccsp=ucase_getDummy(&errorCode); 3220 if(ucase_tolower(ccsp, 0x41)!=0x41) { 3221 log_err("ucase_tolower(dummy, A)!=A\n"); 3222 } 3223 #endif 3224 } 3225 3226 /* API coverage for ubidi_props.c */ 3227 static void TestUBiDiProps() { 3228 #if !HARDCODED_DATA_4497 3229 UDataMemory *pData; 3230 UBiDiProps *bdp; 3231 const UBiDiProps *cbdp; 3232 UErrorCode errorCode; 3233 3234 /* coverage for ubidi_openBinary() */ 3235 errorCode=U_ZERO_ERROR; 3236 pData=udata_open(NULL, UBIDI_DATA_TYPE, UBIDI_DATA_NAME, &errorCode); 3237 if(U_FAILURE(errorCode)) { 3238 log_data_err("unable to open " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n", 3239 u_errorName(errorCode)); 3240 return; 3241 } 3242 3243 bdp=ubidi_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode); 3244 if(U_FAILURE(errorCode)) { 3245 log_err("ubidi_openBinary() fails for the contents of " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n", 3246 u_errorName(errorCode)); 3247 udata_close(pData); 3248 return; 3249 } 3250 3251 if(0x2215!=ubidi_getMirror(bdp, 0x29F5)) { /* verify some data */ 3252 log_err("ubidi_openBinary() does not seem to return working UBiDiProps\n"); 3253 } 3254 3255 ubidi_closeProps(bdp); 3256 udata_close(pData); 3257 3258 /* coverage for ubidi_getDummy() */ 3259 errorCode=U_ZERO_ERROR; 3260 cbdp=ubidi_getDummy(&errorCode); 3261 if(ubidi_getClass(cbdp, 0x20)!=0) { 3262 log_err("ubidi_getClass(dummy, space)!=0\n"); 3263 } 3264 #endif 3265 } 3266 3267 /* test case folding, compare return values with CaseFolding.txt ------------ */ 3268 3269 /* bit set for which case foldings for a character have been tested already */ 3270 enum { 3271 CF_SIMPLE=1, 3272 CF_FULL=2, 3273 CF_TURKIC=4, 3274 CF_ALL=7 3275 }; 3276 3277 static void 3278 testFold(UChar32 c, int which, 3279 UChar32 simple, UChar32 turkic, 3280 const UChar *full, int32_t fullLength, 3281 const UChar *turkicFull, int32_t turkicFullLength) { 3282 UChar s[2], t[32]; 3283 UChar32 c2; 3284 int32_t length, length2; 3285 3286 UErrorCode errorCode=U_ZERO_ERROR; 3287 3288 length=0; 3289 U16_APPEND_UNSAFE(s, length, c); 3290 3291 if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) { 3292 log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple); 3293 } 3294 if((which&CF_FULL)!=0) { 3295 length2=u_strFoldCase(t, LENGTHOF(t), s, length, 0, &errorCode); 3296 if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) { 3297 log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c); 3298 } 3299 } 3300 if((which&CF_TURKIC)!=0) { 3301 if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) { 3302 log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple); 3303 } 3304 3305 length2=u_strFoldCase(t, LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode); 3306 if(length2!=turkicFullLength || 0!=u_memcmp(t, turkicFull, length2)) { 3307 log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c); 3308 } 3309 } 3310 } 3311 3312 /* test that c case-folds to itself */ 3313 static void 3314 testFoldToSelf(UChar32 c, int which) { 3315 UChar s[2]; 3316 int32_t length; 3317 3318 length=0; 3319 U16_APPEND_UNSAFE(s, length, c); 3320 testFold(c, which, c, c, s, length, s, length); 3321 } 3322 3323 struct CaseFoldingData { 3324 USet *notSeen; 3325 UChar32 prev, prevSimple; 3326 UChar prevFull[32]; 3327 int32_t prevFullLength; 3328 int which; 3329 }; 3330 typedef struct CaseFoldingData CaseFoldingData; 3331 3332 static void U_CALLCONV 3333 caseFoldingLineFn(void *context, 3334 char *fields[][2], int32_t fieldCount, 3335 UErrorCode *pErrorCode) { 3336 CaseFoldingData *pData=(CaseFoldingData *)context; 3337 char *end; 3338 UChar full[32]; 3339 UChar32 c, prev, simple; 3340 int32_t count; 3341 int which; 3342 char status; 3343 3344 /* get code point */ 3345 const char *s=u_skipWhitespace(fields[0][0]); 3346 if(0==strncmp(s, "0000..10FFFF", 12)) { 3347 /* 3348 * Ignore the line 3349 * # @missing: 0000..10FFFF; C; <code point> 3350 * because maps-to-self is already our default, and this line breaks this parser. 3351 */ 3352 return; 3353 } 3354 c=(UChar32)strtoul(s, &end, 16); 3355 end=(char *)u_skipWhitespace(end); 3356 if(end<=fields[0][0] || end!=fields[0][1]) { 3357 log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]); 3358 *pErrorCode=U_PARSE_ERROR; 3359 return; 3360 } 3361 3362 /* get the status of this mapping */ 3363 status=*u_skipWhitespace(fields[1][0]); 3364 if(status!='C' && status!='S' && status!='F' && status!='T') { 3365 log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]); 3366 *pErrorCode=U_PARSE_ERROR; 3367 return; 3368 } 3369 3370 /* get the mapping */ 3371 count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode); 3372 if(U_FAILURE(*pErrorCode)) { 3373 log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]); 3374 return; 3375 } 3376 3377 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */ 3378 if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) { 3379 simple=c; 3380 } 3381 3382 if(c!=(prev=pData->prev)) { 3383 /* 3384 * Test remaining mappings for the previous code point. 3385 * If a turkic folding was not mentioned, then it should fold the same 3386 * as the regular simple case folding. 3387 */ 3388 UChar prevString[2]; 3389 int32_t length; 3390 3391 length=0; 3392 U16_APPEND_UNSAFE(prevString, length, prev); 3393 testFold(prev, (~pData->which)&CF_ALL, 3394 prev, pData->prevSimple, 3395 prevString, length, 3396 pData->prevFull, pData->prevFullLength); 3397 pData->prev=pData->prevSimple=c; 3398 length=0; 3399 U16_APPEND_UNSAFE(pData->prevFull, length, c); 3400 pData->prevFullLength=length; 3401 pData->which=0; 3402 } 3403 3404 /* 3405 * Turn the status into a bit set of case foldings to test. 3406 * Remember non-Turkic case foldings as defaults for Turkic mode. 3407 */ 3408 switch(status) { 3409 case 'C': 3410 which=CF_SIMPLE|CF_FULL; 3411 pData->prevSimple=simple; 3412 u_memcpy(pData->prevFull, full, count); 3413 pData->prevFullLength=count; 3414 break; 3415 case 'S': 3416 which=CF_SIMPLE; 3417 pData->prevSimple=simple; 3418 break; 3419 case 'F': 3420 which=CF_FULL; 3421 u_memcpy(pData->prevFull, full, count); 3422 pData->prevFullLength=count; 3423 break; 3424 case 'T': 3425 which=CF_TURKIC; 3426 break; 3427 default: 3428 which=0; 3429 break; /* won't happen because of test above */ 3430 } 3431 3432 testFold(c, which, simple, simple, full, count, full, count); 3433 3434 /* remember which case foldings of c have been tested */ 3435 pData->which|=which; 3436 3437 /* remove c from the set of ones not mentioned in CaseFolding.txt */ 3438 uset_remove(pData->notSeen, c); 3439 } 3440 3441 static void 3442 TestCaseFolding() { 3443 CaseFoldingData data={ NULL }; 3444 char *fields[3][2]; 3445 UErrorCode errorCode; 3446 3447 static char *lastLine= (char *)"10FFFF; C; 10FFFF;"; 3448 3449 errorCode=U_ZERO_ERROR; 3450 /* test BMP & plane 1 - nothing interesting above */ 3451 data.notSeen=uset_open(0, 0x1ffff); 3452 data.prevFullLength=1; /* length of full case folding of U+0000 */ 3453 3454 parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode); 3455 if(U_SUCCESS(errorCode)) { 3456 int32_t i, start, end; 3457 3458 /* add a pseudo-last line to finish testing of the actual last one */ 3459 fields[0][0]=lastLine; 3460 fields[0][1]=lastLine+6; 3461 fields[1][0]=lastLine+7; 3462 fields[1][1]=lastLine+9; 3463 fields[2][0]=lastLine+10; 3464 fields[2][1]=lastLine+17; 3465 caseFoldingLineFn(&data, fields, 3, &errorCode); 3466 3467 /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */ 3468 for(i=0; 3469 0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) && 3470 U_SUCCESS(errorCode); 3471 ++i 3472 ) { 3473 do { 3474 testFoldToSelf(start, CF_ALL); 3475 } while(++start<=end); 3476 } 3477 } 3478 3479 uset_close(data.notSeen); 3480 } 3481