1 /******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 1997-2009, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6 /******************************************************************************* 7 * 8 * File CUCDTST.C 9 * 10 * Modification History: 11 * Name Description 12 * Madhu Katragadda Ported for C API, added tests for string functions 13 ******************************************************************************** 14 */ 15 16 #include <string.h> 17 #include <math.h> 18 #include <stdlib.h> 19 20 #include "unicode/utypes.h" 21 #include "unicode/uchar.h" 22 #include "unicode/putil.h" 23 #include "unicode/ustring.h" 24 #include "unicode/uloc.h" 25 26 #include "cintltst.h" 27 #include "putilimp.h" 28 #include "uparse.h" 29 #include "ucase.h" 30 #include "ubidi_props.h" 31 #include "uprops.h" 32 #include "uset_imp.h" 33 #include "usc_impl.h" 34 #include "unormimp.h" 35 #include "udatamem.h" /* for testing ucase_openBinary() */ 36 #include "cucdapi.h" 37 38 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 39 40 /* prototypes --------------------------------------------------------------- */ 41 42 static void TestUpperLower(void); 43 static void TestLetterNumber(void); 44 static void TestMisc(void); 45 static void TestPOSIX(void); 46 static void TestControlPrint(void); 47 static void TestIdentifier(void); 48 static void TestUnicodeData(void); 49 static void TestCodeUnit(void); 50 static void TestCodePoint(void); 51 static void TestCharLength(void); 52 static void TestCharNames(void); 53 static void TestMirroring(void); 54 static void TestUScriptRunAPI(void); 55 static void TestAdditionalProperties(void); 56 static void TestNumericProperties(void); 57 static void TestPropertyNames(void); 58 static void TestPropertyValues(void); 59 static void TestConsistency(void); 60 static void TestUCase(void); 61 static void TestUBiDiProps(void); 62 static void TestCaseFolding(void); 63 64 /* internal methods used */ 65 static int32_t MakeProp(char* str); 66 static int32_t MakeDir(char* str); 67 68 /* helpers ------------------------------------------------------------------ */ 69 70 static void 71 parseUCDFile(const char *filename, 72 char *fields[][2], int32_t fieldCount, 73 UParseLineFn *lineFn, void *context, 74 UErrorCode *pErrorCode) { 75 char path[256]; 76 char backupPath[256]; 77 78 if(U_FAILURE(*pErrorCode)) { 79 return; 80 } 81 82 /* Look inside ICU_DATA first */ 83 strcpy(path, u_getDataDirectory()); 84 strcat(path, ".." U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING); 85 strcat(path, filename); 86 87 /* As a fallback, try to guess where the source data was located 88 * at the time ICU was built, and look there. 89 */ 90 strcpy(backupPath, ctest_dataSrcDir()); 91 strcat(backupPath, U_FILE_SEP_STRING); 92 strcat(backupPath, "unidata" U_FILE_SEP_STRING); 93 strcat(backupPath, filename); 94 95 u_parseDelimitedFile(path, ';', fields, fieldCount, lineFn, context, pErrorCode); 96 if(*pErrorCode==U_FILE_ACCESS_ERROR) { 97 *pErrorCode=U_ZERO_ERROR; 98 u_parseDelimitedFile(backupPath, ';', fields, fieldCount, lineFn, context, pErrorCode); 99 } 100 if(U_FAILURE(*pErrorCode)) { 101 log_err_status(*pErrorCode, "error parsing %s: %s\n", filename, u_errorName(*pErrorCode)); 102 } 103 } 104 105 /* test data ---------------------------------------------------------------- */ 106 107 static const UChar LAST_CHAR_CODE_IN_FILE = 0xFFFD; 108 static const char tagStrings[] = "MnMcMeNdNlNoZsZlZpCcCfCsCoCnLuLlLtLmLoPcPdPsPePoSmScSkSoPiPf"; 109 static const int32_t tagValues[] = 110 { 111 /* Mn */ U_NON_SPACING_MARK, 112 /* Mc */ U_COMBINING_SPACING_MARK, 113 /* Me */ U_ENCLOSING_MARK, 114 /* Nd */ U_DECIMAL_DIGIT_NUMBER, 115 /* Nl */ U_LETTER_NUMBER, 116 /* No */ U_OTHER_NUMBER, 117 /* Zs */ U_SPACE_SEPARATOR, 118 /* Zl */ U_LINE_SEPARATOR, 119 /* Zp */ U_PARAGRAPH_SEPARATOR, 120 /* Cc */ U_CONTROL_CHAR, 121 /* Cf */ U_FORMAT_CHAR, 122 /* Cs */ U_SURROGATE, 123 /* Co */ U_PRIVATE_USE_CHAR, 124 /* Cn */ U_UNASSIGNED, 125 /* Lu */ U_UPPERCASE_LETTER, 126 /* Ll */ U_LOWERCASE_LETTER, 127 /* Lt */ U_TITLECASE_LETTER, 128 /* Lm */ U_MODIFIER_LETTER, 129 /* Lo */ U_OTHER_LETTER, 130 /* Pc */ U_CONNECTOR_PUNCTUATION, 131 /* Pd */ U_DASH_PUNCTUATION, 132 /* Ps */ U_START_PUNCTUATION, 133 /* Pe */ U_END_PUNCTUATION, 134 /* Po */ U_OTHER_PUNCTUATION, 135 /* Sm */ U_MATH_SYMBOL, 136 /* Sc */ U_CURRENCY_SYMBOL, 137 /* Sk */ U_MODIFIER_SYMBOL, 138 /* So */ U_OTHER_SYMBOL, 139 /* Pi */ U_INITIAL_PUNCTUATION, 140 /* Pf */ U_FINAL_PUNCTUATION 141 }; 142 143 static const char dirStrings[][5] = { 144 "L", 145 "R", 146 "EN", 147 "ES", 148 "ET", 149 "AN", 150 "CS", 151 "B", 152 "S", 153 "WS", 154 "ON", 155 "LRE", 156 "LRO", 157 "AL", 158 "RLE", 159 "RLO", 160 "PDF", 161 "NSM", 162 "BN" 163 }; 164 165 void addUnicodeTest(TestNode** root); 166 167 void addUnicodeTest(TestNode** root) 168 { 169 addTest(root, &TestCodeUnit, "tsutil/cucdtst/TestCodeUnit"); 170 addTest(root, &TestCodePoint, "tsutil/cucdtst/TestCodePoint"); 171 addTest(root, &TestCharLength, "tsutil/cucdtst/TestCharLength"); 172 addTest(root, &TestBinaryValues, "tsutil/cucdtst/TestBinaryValues"); 173 addTest(root, &TestUnicodeData, "tsutil/cucdtst/TestUnicodeData"); 174 addTest(root, &TestAdditionalProperties, "tsutil/cucdtst/TestAdditionalProperties"); 175 addTest(root, &TestNumericProperties, "tsutil/cucdtst/TestNumericProperties"); 176 addTest(root, &TestUpperLower, "tsutil/cucdtst/TestUpperLower"); 177 addTest(root, &TestLetterNumber, "tsutil/cucdtst/TestLetterNumber"); 178 addTest(root, &TestMisc, "tsutil/cucdtst/TestMisc"); 179 addTest(root, &TestPOSIX, "tsutil/cucdtst/TestPOSIX"); 180 addTest(root, &TestControlPrint, "tsutil/cucdtst/TestControlPrint"); 181 addTest(root, &TestIdentifier, "tsutil/cucdtst/TestIdentifier"); 182 addTest(root, &TestCharNames, "tsutil/cucdtst/TestCharNames"); 183 addTest(root, &TestMirroring, "tsutil/cucdtst/TestMirroring"); 184 addTest(root, &TestUScriptCodeAPI, "tsutil/cucdtst/TestUScriptCodeAPI"); 185 addTest(root, &TestUScriptRunAPI, "tsutil/cucdtst/TestUScriptRunAPI"); 186 addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames"); 187 addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues"); 188 addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency"); 189 addTest(root, &TestUCase, "tsutil/cucdtst/TestUCase"); 190 addTest(root, &TestUBiDiProps, "tsutil/cucdtst/TestUBiDiProps"); 191 addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding"); 192 } 193 194 /*==================================================== */ 195 /* test u_toupper() and u_tolower() */ 196 /*==================================================== */ 197 static void TestUpperLower() 198 { 199 const UChar upper[] = {0x41, 0x42, 0x00b2, 0x01c4, 0x01c6, 0x01c9, 0x01c8, 0x01c9, 0x000c, 0x0000}; 200 const UChar lower[] = {0x61, 0x62, 0x00b2, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01c9, 0x000c, 0x0000}; 201 U_STRING_DECL(upperTest, "abcdefg123hij.?:klmno", 21); 202 U_STRING_DECL(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21); 203 int32_t i; 204 205 U_STRING_INIT(upperTest, "abcdefg123hij.?:klmno", 21); 206 U_STRING_INIT(lowerTest, "ABCDEFG123HIJ.?:KLMNO", 21); 207 208 /* 209 Checks LetterLike Symbols which were previously a source of confusion 210 [Bertrand A. D. 02/04/98] 211 */ 212 for (i=0x2100;i<0x2138;i++) 213 { 214 /* Unicode 5.0 adds lowercase U+214E (TURNED SMALL F) to U+2132 (TURNED CAPITAL F) */ 215 if(i!=0x2126 && i!=0x212a && i!=0x212b && i!=0x2132) 216 { 217 if (i != (int)u_tolower(i)) /* itself */ 218 log_err("Failed case conversion with itself: U+%04x\n", i); 219 if (i != (int)u_toupper(i)) 220 log_err("Failed case conversion with itself: U+%04x\n", i); 221 } 222 } 223 224 for(i=0; i < u_strlen(upper); i++){ 225 if(u_tolower(upper[i]) != lower[i]){ 226 log_err("FAILED u_tolower() for %lx Expected %lx Got %lx\n", upper[i], lower[i], u_tolower(upper[i])); 227 } 228 } 229 230 log_verbose("testing upper lower\n"); 231 for (i = 0; i < 21; i++) { 232 233 if (u_isalpha(upperTest[i]) && !u_islower(upperTest[i])) 234 { 235 log_err("Failed isLowerCase test at %c\n", upperTest[i]); 236 } 237 else if (u_isalpha(lowerTest[i]) && !u_isupper(lowerTest[i])) 238 { 239 log_err("Failed isUpperCase test at %c\n", lowerTest[i]); 240 } 241 else if (upperTest[i] != u_tolower(lowerTest[i])) 242 { 243 log_err("Failed case conversion from %c To %c :\n", lowerTest[i], upperTest[i]); 244 } 245 else if (lowerTest[i] != u_toupper(upperTest[i])) 246 { 247 log_err("Failed case conversion : %c To %c \n", upperTest[i], lowerTest[i]); 248 } 249 else if (upperTest[i] != u_tolower(upperTest[i])) 250 { 251 log_err("Failed case conversion with itself: %c\n", upperTest[i]); 252 } 253 else if (lowerTest[i] != u_toupper(lowerTest[i])) 254 { 255 log_err("Failed case conversion with itself: %c\n", lowerTest[i]); 256 } 257 } 258 log_verbose("done testing upper lower\n"); 259 260 log_verbose("testing u_istitle\n"); 261 { 262 static const UChar expected[] = { 263 0x1F88, 264 0x1F89, 265 0x1F8A, 266 0x1F8B, 267 0x1F8C, 268 0x1F8D, 269 0x1F8E, 270 0x1F8F, 271 0x1F88, 272 0x1F89, 273 0x1F8A, 274 0x1F8B, 275 0x1F8C, 276 0x1F8D, 277 0x1F8E, 278 0x1F8F, 279 0x1F98, 280 0x1F99, 281 0x1F9A, 282 0x1F9B, 283 0x1F9C, 284 0x1F9D, 285 0x1F9E, 286 0x1F9F, 287 0x1F98, 288 0x1F99, 289 0x1F9A, 290 0x1F9B, 291 0x1F9C, 292 0x1F9D, 293 0x1F9E, 294 0x1F9F, 295 0x1FA8, 296 0x1FA9, 297 0x1FAA, 298 0x1FAB, 299 0x1FAC, 300 0x1FAD, 301 0x1FAE, 302 0x1FAF, 303 0x1FA8, 304 0x1FA9, 305 0x1FAA, 306 0x1FAB, 307 0x1FAC, 308 0x1FAD, 309 0x1FAE, 310 0x1FAF, 311 0x1FBC, 312 0x1FBC, 313 0x1FCC, 314 0x1FCC, 315 0x1FFC, 316 0x1FFC, 317 }; 318 int32_t num = sizeof(expected)/sizeof(expected[0]); 319 for(i=0; i<num; i++){ 320 if(!u_istitle(expected[i])){ 321 log_err("u_istitle failed for 0x%4X. Expected TRUE, got FALSE\n",expected[i]); 322 } 323 } 324 325 } 326 } 327 328 /* compare two sets and verify that their difference or intersection is empty */ 329 static UBool 330 showADiffB(const USet *a, const USet *b, 331 const char *a_name, const char *b_name, 332 UBool expect, UBool diffIsError) { 333 USet *aa; 334 int32_t i, start, end, length; 335 UErrorCode errorCode; 336 337 /* 338 * expect: 339 * TRUE -> a-b should be empty, that is, b should contain all of a 340 * FALSE -> a&b should be empty, that is, a should contain none of b (and vice versa) 341 */ 342 if(expect ? uset_containsAll(b, a) : uset_containsNone(a, b)) { 343 return TRUE; 344 } 345 346 /* clone a to aa because a is const */ 347 aa=uset_open(1, 0); 348 if(aa==NULL) { 349 /* unusual problem - out of memory? */ 350 return FALSE; 351 } 352 uset_addAll(aa, a); 353 354 /* compute the set in question */ 355 if(expect) { 356 /* a-b */ 357 uset_removeAll(aa, b); 358 } else { 359 /* a&b */ 360 uset_retainAll(aa, b); 361 } 362 363 /* aa is not empty because of the initial tests above; show its contents */ 364 errorCode=U_ZERO_ERROR; 365 i=0; 366 for(;;) { 367 length=uset_getItem(aa, i, &start, &end, NULL, 0, &errorCode); 368 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) { 369 break; /* done */ 370 } 371 if(U_FAILURE(errorCode)) { 372 log_err("error comparing %s with %s at difference item %d: %s\n", 373 a_name, b_name, i, u_errorName(errorCode)); 374 break; 375 } 376 if(length!=0) { 377 break; /* done with code points, got a string or -1 */ 378 } 379 380 if(diffIsError) { 381 if(expect) { 382 log_err("error: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name); 383 } else { 384 log_err("error: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end); 385 } 386 } else { 387 if(expect) { 388 log_verbose("info: %s contains U+%04x..U+%04x but %s does not\n", a_name, start, end, b_name); 389 } else { 390 log_verbose("info: %s and %s both contain U+%04x..U+%04x but should not intersect\n", a_name, b_name, start, end); 391 } 392 } 393 394 ++i; 395 } 396 397 uset_close(aa); 398 return FALSE; 399 } 400 401 static UBool 402 showAMinusB(const USet *a, const USet *b, 403 const char *a_name, const char *b_name, 404 UBool diffIsError) { 405 return showADiffB(a, b, a_name, b_name, TRUE, diffIsError); 406 } 407 408 static UBool 409 showAIntersectB(const USet *a, const USet *b, 410 const char *a_name, const char *b_name, 411 UBool diffIsError) { 412 return showADiffB(a, b, a_name, b_name, FALSE, diffIsError); 413 } 414 415 static UBool 416 compareUSets(const USet *a, const USet *b, 417 const char *a_name, const char *b_name, 418 UBool diffIsError) { 419 /* 420 * Use an arithmetic & not a logical && so that both branches 421 * are always taken and all differences are shown. 422 */ 423 return 424 showAMinusB(a, b, a_name, b_name, diffIsError) & 425 showAMinusB(b, a, b_name, a_name, diffIsError); 426 } 427 428 /* test isLetter(u_isapha()) and isDigit(u_isdigit()) */ 429 static void TestLetterNumber() 430 { 431 UChar i = 0x0000; 432 433 log_verbose("Testing for isalpha\n"); 434 for (i = 0x0041; i < 0x005B; i++) { 435 if (!u_isalpha(i)) 436 { 437 log_err("Failed isLetter test at %.4X\n", i); 438 } 439 } 440 for (i = 0x0660; i < 0x066A; i++) { 441 if (u_isalpha(i)) 442 { 443 log_err("Failed isLetter test with numbers at %.4X\n", i); 444 } 445 } 446 447 log_verbose("Testing for isdigit\n"); 448 for (i = 0x0660; i < 0x066A; i++) { 449 if (!u_isdigit(i)) 450 { 451 log_verbose("Failed isNumber test at %.4X\n", i); 452 } 453 } 454 455 log_verbose("Testing for isalnum\n"); 456 for (i = 0x0041; i < 0x005B; i++) { 457 if (!u_isalnum(i)) 458 { 459 log_err("Failed isAlNum test at %.4X\n", i); 460 } 461 } 462 for (i = 0x0660; i < 0x066A; i++) { 463 if (!u_isalnum(i)) 464 { 465 log_err("Failed isAlNum test at %.4X\n", i); 466 } 467 } 468 469 { 470 /* 471 * The following checks work only starting from Unicode 4.0. 472 * Check the version number here. 473 */ 474 static UVersionInfo u401={ 4, 0, 1, 0 }; 475 UVersionInfo version; 476 u_getUnicodeVersion(version); 477 if(version[0]<4 || 0==memcmp(version, u401, 4)) { 478 return; 479 } 480 } 481 482 { 483 /* 484 * Sanity check: 485 * Verify that exactly the digit characters have decimal digit values. 486 * This assumption is used in the implementation of u_digit() 487 * (which checks nt=de) 488 * compared with the parallel java.lang.Character.digit() 489 * (which checks Nd). 490 * 491 * This was not true in Unicode 3.2 and earlier. 492 * Unicode 4.0 fixed discrepancies. 493 * Unicode 4.0.1 re-introduced problems in this area due to an 494 * unintentionally incomplete last-minute change. 495 */ 496 U_STRING_DECL(digitsPattern, "[:Nd:]", 6); 497 U_STRING_DECL(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24); 498 499 USet *digits, *decimalValues; 500 UErrorCode errorCode; 501 502 U_STRING_INIT(digitsPattern, "[:Nd:]", 6); 503 U_STRING_INIT(decimalValuesPattern, "[:Numeric_Type=Decimal:]", 24); 504 errorCode=U_ZERO_ERROR; 505 digits=uset_openPattern(digitsPattern, 6, &errorCode); 506 decimalValues=uset_openPattern(decimalValuesPattern, 24, &errorCode); 507 508 if(U_SUCCESS(errorCode)) { 509 compareUSets(digits, decimalValues, "[:Nd:]", "[:Numeric_Type=Decimal:]", TRUE); 510 } 511 512 uset_close(digits); 513 uset_close(decimalValues); 514 } 515 } 516 517 /* Tests for isDefined(u_isdefined)(, isBaseForm(u_isbase()), isSpaceChar(u_isspace()), isWhiteSpace(), u_CharDigitValue() */ 518 static void TestMisc() 519 { 520 static const UChar sampleSpaces[] = {0x0020, 0x00a0, 0x2000, 0x2001, 0x2005}; 521 static const UChar sampleNonSpaces[] = {0x61, 0x62, 0x63, 0x64, 0x74}; 522 static const UChar sampleUndefined[] = {0xfff1, 0xfff7, 0xfa6b }; 523 static const UChar sampleDefined[] = {0x523E, 0x4f88, 0xfffd}; 524 static const UChar sampleBase[] = {0x0061, 0x0031, 0x03d2}; 525 static const UChar sampleNonBase[] = {0x002B, 0x0020, 0x203B}; 526 /* static const UChar sampleChars[] = {0x000a, 0x0045, 0x4e00, 0xDC00, 0xFFE8, 0xFFF0};*/ 527 static const UChar sampleDigits[]= {0x0030, 0x0662, 0x0F23, 0x0ED5}; 528 static const UChar sampleNonDigits[] = {0x0010, 0x0041, 0x0122, 0x68FE}; 529 static const UChar sampleWhiteSpaces[] = {0x2008, 0x2009, 0x200a, 0x001c, 0x000c}; 530 static const UChar sampleNonWhiteSpaces[] = {0x61, 0x62, 0x3c, 0x28, 0x3f}; 531 532 533 static const int32_t sampleDigitValues[] = {0, 2, 3, 5}; 534 535 uint32_t mask; 536 537 int32_t i; 538 char icuVersion[U_MAX_VERSION_STRING_LENGTH]; 539 UVersionInfo realVersion; 540 541 memset(icuVersion, 0, U_MAX_VERSION_STRING_LENGTH); 542 543 log_verbose("Testing for isspace and nonspaces\n"); 544 for (i = 0; i < 5; i++) { 545 if (!(u_isspace(sampleSpaces[i])) || 546 (u_isspace(sampleNonSpaces[i]))) 547 { 548 log_err("Space char test error : %d or %d \n", (int32_t)sampleSpaces[i], (int32_t)sampleNonSpaces[i]); 549 } 550 if (!(u_isJavaSpaceChar(sampleSpaces[i])) || 551 (u_isJavaSpaceChar(sampleNonSpaces[i]))) 552 { 553 log_err("u_isJavaSpaceChar() test error : %d or %d \n", (int32_t)sampleSpaces[i], (int32_t)sampleNonSpaces[i]); 554 } 555 } 556 557 log_verbose("Testing for isspace and nonspaces\n"); 558 for (i = 0; i < 5; i++) { 559 if (!(u_isWhitespace(sampleWhiteSpaces[i])) || 560 (u_isWhitespace(sampleNonWhiteSpaces[i]))) 561 { 562 log_err("White Space char test error : %lx or %lx \n", sampleWhiteSpaces[i], sampleNonWhiteSpaces[i]); 563 } 564 } 565 566 log_verbose("Testing for isdefined\n"); 567 for (i = 0; i < 3; i++) { 568 if ((u_isdefined(sampleUndefined[i])) || 569 !(u_isdefined(sampleDefined[i]))) 570 { 571 log_err("Undefined char test error : U+%04x or U+%04x\n", (int32_t)sampleUndefined[i], (int32_t)sampleDefined[i]); 572 } 573 } 574 575 log_verbose("Testing for isbase\n"); 576 for (i = 0; i < 3; i++) { 577 if ((u_isbase(sampleNonBase[i])) || 578 !(u_isbase(sampleBase[i]))) 579 { 580 log_err("Non-baseform char test error : U+%04x or U+%04x",(int32_t)sampleNonBase[i], (int32_t)sampleBase[i]); 581 } 582 } 583 584 log_verbose("Testing for isdigit \n"); 585 for (i = 0; i < 4; i++) { 586 if ((u_isdigit(sampleDigits[i]) && 587 (u_charDigitValue(sampleDigits[i])!= sampleDigitValues[i])) || 588 (u_isdigit(sampleNonDigits[i]))) { 589 log_err("Digit char test error : %lx or %lx\n", sampleDigits[i], sampleNonDigits[i]); 590 } 591 } 592 593 /* Tests the ICU version #*/ 594 u_getVersion(realVersion); 595 u_versionToString(realVersion, icuVersion); 596 if (strncmp(icuVersion, U_ICU_VERSION, uprv_min((int32_t)strlen(icuVersion), (int32_t)strlen(U_ICU_VERSION))) != 0) 597 { 598 log_err("ICU version test failed. Header says=%s, got=%s \n", U_ICU_VERSION, icuVersion); 599 } 600 #if defined(ICU_VERSION) 601 /* test only happens where we have configure.in with VERSION - sanity check. */ 602 if(strcmp(U_ICU_VERSION, ICU_VERSION)) 603 { 604 log_err("ICU version mismatch: Header says %s, build environment says %s.\n", U_ICU_VERSION, ICU_VERSION); 605 } 606 #endif 607 608 /* test U_GC_... */ 609 if( 610 U_GET_GC_MASK(0x41)!=U_GC_LU_MASK || 611 U_GET_GC_MASK(0x662)!=U_GC_ND_MASK || 612 U_GET_GC_MASK(0xa0)!=U_GC_ZS_MASK || 613 U_GET_GC_MASK(0x28)!=U_GC_PS_MASK || 614 U_GET_GC_MASK(0x2044)!=U_GC_SM_MASK || 615 U_GET_GC_MASK(0xe0063)!=U_GC_CF_MASK 616 ) { 617 log_err("error: U_GET_GC_MASK does not work properly\n"); 618 } 619 620 mask=0; 621 mask=(mask&~U_GC_CN_MASK)|U_GC_CN_MASK; 622 623 mask=(mask&~U_GC_LU_MASK)|U_GC_LU_MASK; 624 mask=(mask&~U_GC_LL_MASK)|U_GC_LL_MASK; 625 mask=(mask&~U_GC_LT_MASK)|U_GC_LT_MASK; 626 mask=(mask&~U_GC_LM_MASK)|U_GC_LM_MASK; 627 mask=(mask&~U_GC_LO_MASK)|U_GC_LO_MASK; 628 629 mask=(mask&~U_GC_MN_MASK)|U_GC_MN_MASK; 630 mask=(mask&~U_GC_ME_MASK)|U_GC_ME_MASK; 631 mask=(mask&~U_GC_MC_MASK)|U_GC_MC_MASK; 632 633 mask=(mask&~U_GC_ND_MASK)|U_GC_ND_MASK; 634 mask=(mask&~U_GC_NL_MASK)|U_GC_NL_MASK; 635 mask=(mask&~U_GC_NO_MASK)|U_GC_NO_MASK; 636 637 mask=(mask&~U_GC_ZS_MASK)|U_GC_ZS_MASK; 638 mask=(mask&~U_GC_ZL_MASK)|U_GC_ZL_MASK; 639 mask=(mask&~U_GC_ZP_MASK)|U_GC_ZP_MASK; 640 641 mask=(mask&~U_GC_CC_MASK)|U_GC_CC_MASK; 642 mask=(mask&~U_GC_CF_MASK)|U_GC_CF_MASK; 643 mask=(mask&~U_GC_CO_MASK)|U_GC_CO_MASK; 644 mask=(mask&~U_GC_CS_MASK)|U_GC_CS_MASK; 645 646 mask=(mask&~U_GC_PD_MASK)|U_GC_PD_MASK; 647 mask=(mask&~U_GC_PS_MASK)|U_GC_PS_MASK; 648 mask=(mask&~U_GC_PE_MASK)|U_GC_PE_MASK; 649 mask=(mask&~U_GC_PC_MASK)|U_GC_PC_MASK; 650 mask=(mask&~U_GC_PO_MASK)|U_GC_PO_MASK; 651 652 mask=(mask&~U_GC_SM_MASK)|U_GC_SM_MASK; 653 mask=(mask&~U_GC_SC_MASK)|U_GC_SC_MASK; 654 mask=(mask&~U_GC_SK_MASK)|U_GC_SK_MASK; 655 mask=(mask&~U_GC_SO_MASK)|U_GC_SO_MASK; 656 657 mask=(mask&~U_GC_PI_MASK)|U_GC_PI_MASK; 658 mask=(mask&~U_GC_PF_MASK)|U_GC_PF_MASK; 659 660 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) { 661 log_err("error: problems with U_GC_XX_MASK constants\n"); 662 } 663 664 mask=0; 665 mask=(mask&~U_GC_C_MASK)|U_GC_C_MASK; 666 mask=(mask&~U_GC_L_MASK)|U_GC_L_MASK; 667 mask=(mask&~U_GC_M_MASK)|U_GC_M_MASK; 668 mask=(mask&~U_GC_N_MASK)|U_GC_N_MASK; 669 mask=(mask&~U_GC_Z_MASK)|U_GC_Z_MASK; 670 mask=(mask&~U_GC_P_MASK)|U_GC_P_MASK; 671 mask=(mask&~U_GC_S_MASK)|U_GC_S_MASK; 672 673 if(mask!=(U_CHAR_CATEGORY_COUNT<32 ? U_MASK(U_CHAR_CATEGORY_COUNT)-1: 0xffffffff)) { 674 log_err("error: problems with U_GC_Y_MASK constants\n"); 675 } 676 { 677 static const UChar32 digit[10]={ 0x0030,0x0031,0x0032,0x0033,0x0034,0x0035,0x0036,0x0037,0x0038,0x0039 }; 678 for(i=0; i<10; i++){ 679 if(digit[i]!=u_forDigit(i,10)){ 680 log_err("u_forDigit failed for %i. Expected: 0x%4X Got: 0x%4X\n",i,digit[i],u_forDigit(i,10)); 681 } 682 } 683 } 684 685 /* test u_digit() */ 686 { 687 static const struct { 688 UChar32 c; 689 int8_t radix, value; 690 } data[]={ 691 /* base 16 */ 692 { 0x0031, 16, 1 }, 693 { 0x0038, 16, 8 }, 694 { 0x0043, 16, 12 }, 695 { 0x0066, 16, 15 }, 696 { 0x00e4, 16, -1 }, 697 { 0x0662, 16, 2 }, 698 { 0x06f5, 16, 5 }, 699 { 0xff13, 16, 3 }, 700 { 0xff41, 16, 10 }, 701 702 /* base 8 */ 703 { 0x0031, 8, 1 }, 704 { 0x0038, 8, -1 }, 705 { 0x0043, 8, -1 }, 706 { 0x0066, 8, -1 }, 707 { 0x00e4, 8, -1 }, 708 { 0x0662, 8, 2 }, 709 { 0x06f5, 8, 5 }, 710 { 0xff13, 8, 3 }, 711 { 0xff41, 8, -1 }, 712 713 /* base 36 */ 714 { 0x5a, 36, 35 }, 715 { 0x7a, 36, 35 }, 716 { 0xff3a, 36, 35 }, 717 { 0xff5a, 36, 35 }, 718 719 /* wrong radix values */ 720 { 0x0031, 1, -1 }, 721 { 0xff3a, 37, -1 } 722 }; 723 724 for(i=0; i<LENGTHOF(data); ++i) { 725 if(u_digit(data[i].c, data[i].radix)!=data[i].value) { 726 log_err("u_digit(U+%04x, %d)=%d expected %d\n", 727 data[i].c, 728 data[i].radix, 729 u_digit(data[i].c, data[i].radix), 730 data[i].value); 731 } 732 } 733 } 734 } 735 736 /* test C/POSIX-style functions --------------------------------------------- */ 737 738 /* bit flags */ 739 #define ISAL 1 740 #define ISLO 2 741 #define ISUP 4 742 743 #define ISDI 8 744 #define ISXD 0x10 745 746 #define ISAN 0x20 747 748 #define ISPU 0x40 749 #define ISGR 0x80 750 #define ISPR 0x100 751 752 #define ISSP 0x200 753 #define ISBL 0x400 754 #define ISCN 0x800 755 756 /* C/POSIX-style functions, in the same order as the bit flags */ 757 typedef UBool U_EXPORT2 IsPOSIXClass(UChar32 c); 758 759 static const struct { 760 IsPOSIXClass *fn; 761 const char *name; 762 } posixClasses[]={ 763 { u_isalpha, "isalpha" }, 764 { u_islower, "islower" }, 765 { u_isupper, "isupper" }, 766 { u_isdigit, "isdigit" }, 767 { u_isxdigit, "isxdigit" }, 768 { u_isalnum, "isalnum" }, 769 { u_ispunct, "ispunct" }, 770 { u_isgraph, "isgraph" }, 771 { u_isprint, "isprint" }, 772 { u_isspace, "isspace" }, 773 { u_isblank, "isblank" }, 774 { u_iscntrl, "iscntrl" } 775 }; 776 777 static const struct { 778 UChar32 c; 779 uint32_t posixResults; 780 } posixData[]={ 781 { 0x0008, ISCN }, /* backspace */ 782 { 0x0009, ISSP|ISBL|ISCN }, /* TAB */ 783 { 0x000a, ISSP| ISCN }, /* LF */ 784 { 0x000c, ISSP| ISCN }, /* FF */ 785 { 0x000d, ISSP| ISCN }, /* CR */ 786 { 0x0020, ISPR|ISSP|ISBL }, /* space */ 787 { 0x0021, ISPU|ISGR|ISPR }, /* ! */ 788 { 0x0033, ISDI|ISXD|ISAN| ISGR|ISPR }, /* 3 */ 789 { 0x0040, ISPU|ISGR|ISPR }, /* @ */ 790 { 0x0041, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* A */ 791 { 0x007a, ISAL|ISLO| ISAN| ISGR|ISPR }, /* z */ 792 { 0x007b, ISPU|ISGR|ISPR }, /* { */ 793 { 0x0085, ISSP| ISCN }, /* NEL */ 794 { 0x00a0, ISPR|ISSP|ISBL }, /* NBSP */ 795 { 0x00a4, ISGR|ISPR }, /* currency sign */ 796 { 0x00e4, ISAL|ISLO| ISAN| ISGR|ISPR }, /* a-umlaut */ 797 { 0x0300, ISGR|ISPR }, /* combining grave */ 798 { 0x0600, ISCN }, /* arabic number sign */ 799 { 0x0627, ISAL| ISAN| ISGR|ISPR }, /* alef */ 800 { 0x0663, ISDI|ISXD|ISAN| ISGR|ISPR }, /* arabic 3 */ 801 { 0x2002, ISPR|ISSP|ISBL }, /* en space */ 802 { 0x2007, ISPR|ISSP|ISBL }, /* figure space */ 803 { 0x2009, ISPR|ISSP|ISBL }, /* thin space */ 804 { 0x200b, ISCN }, /* ZWSP */ 805 /*{ 0x200b, ISPR|ISSP },*/ /* ZWSP */ /* ZWSP became a control char in 4.0.1*/ 806 { 0x200e, ISCN }, /* LRM */ 807 { 0x2028, ISPR|ISSP| ISCN }, /* LS */ 808 { 0x2029, ISPR|ISSP| ISCN }, /* PS */ 809 { 0x20ac, ISGR|ISPR }, /* Euro */ 810 { 0xff15, ISDI|ISXD|ISAN| ISGR|ISPR }, /* fullwidth 5 */ 811 { 0xff25, ISAL| ISUP| ISXD|ISAN| ISGR|ISPR }, /* fullwidth E */ 812 { 0xff35, ISAL| ISUP| ISAN| ISGR|ISPR }, /* fullwidth U */ 813 { 0xff45, ISAL|ISLO| ISXD|ISAN| ISGR|ISPR }, /* fullwidth e */ 814 { 0xff55, ISAL|ISLO| ISAN| ISGR|ISPR } /* fullwidth u */ 815 }; 816 817 static void 818 TestPOSIX() { 819 uint32_t mask; 820 int32_t cl, i; 821 UBool expect; 822 823 mask=1; 824 for(cl=0; cl<12; ++cl) { 825 for(i=0; i<LENGTHOF(posixData); ++i) { 826 expect=(UBool)((posixData[i].posixResults&mask)!=0); 827 if(posixClasses[cl].fn(posixData[i].c)!=expect) { 828 log_err("u_%s(U+%04x)=%s is wrong\n", 829 posixClasses[cl].name, posixData[i].c, expect ? "FALSE" : "TRUE"); 830 } 831 } 832 mask<<=1; 833 } 834 } 835 836 /* Tests for isControl(u_iscntrl()) and isPrintable(u_isprint()) */ 837 static void TestControlPrint() 838 { 839 const UChar sampleControl[] = {0x1b, 0x97, 0x82, 0x2028, 0x2029, 0x200c, 0x202b}; 840 const UChar sampleNonControl[] = {0x61, 0x0031, 0x00e2}; 841 const UChar samplePrintable[] = {0x0042, 0x005f, 0x2014}; 842 const UChar sampleNonPrintable[] = {0x200c, 0x009f, 0x001b}; 843 UChar32 c; 844 int i; 845 846 log_verbose("Testing for iscontrol\n"); 847 for (i = 0; i < LENGTHOF(sampleControl); i++) { 848 if (!u_iscntrl(sampleControl[i])) 849 { 850 log_err("Control char test error : U+%04x should be control but is not\n", (int32_t)sampleControl[i]); 851 } 852 } 853 854 log_verbose("Testing for !iscontrol\n"); 855 for (i = 0; i < LENGTHOF(sampleNonControl); i++) { 856 if (u_iscntrl(sampleNonControl[i])) 857 { 858 log_err("Control char test error : U+%04x should not be control but is\n", (int32_t)sampleNonControl[i]); 859 } 860 } 861 862 log_verbose("testing for isprintable\n"); 863 for (i = 0; i < 3; i++) { 864 if (!u_isprint(samplePrintable[i])) 865 { 866 log_err("Printable char test error : U+%04x should be printable but is not\n", (int32_t)samplePrintable[i]); 867 } 868 if (u_isprint(sampleNonPrintable[i])) 869 { 870 log_err("Printable char test error : U+%04x should not be printable but is\n", (int32_t)sampleNonPrintable[i]); 871 } 872 } 873 874 /* test all ISO 8 controls */ 875 for(c=0; c<=0x9f; ++c) { 876 if(c==0x20) { 877 /* skip ASCII graphic characters and continue with DEL */ 878 c=0x7f; 879 } 880 if(!u_iscntrl(c)) { 881 log_err("error: u_iscntrl(ISO 8 control U+%04x)=FALSE\n", c); 882 } 883 if(!u_isISOControl(c)) { 884 log_err("error: u_isISOControl(ISO 8 control U+%04x)=FALSE\n", c); 885 } 886 if(u_isprint(c)) { 887 log_err("error: u_isprint(ISO 8 control U+%04x)=TRUE\n", c); 888 } 889 } 890 891 /* test all Latin-1 graphic characters */ 892 for(c=0x20; c<=0xff; ++c) { 893 if(c==0x7f) { 894 c=0xa0; 895 } else if(c==0xad) { 896 /* Unicode 4 changes 00AD Soft Hyphen to Cf (and it is in fact not printable) */ 897 ++c; 898 } 899 if(!u_isprint(c)) { 900 log_err("error: u_isprint(Latin-1 graphic character U+%04x)=FALSE\n", c); 901 } 902 } 903 } 904 905 /* u_isJavaIDStart, u_isJavaIDPart, u_isIDStart(), u_isIDPart(), u_isIDIgnorable()*/ 906 static void TestIdentifier() 907 { 908 const UChar sampleJavaIDStart[] = {0x0071, 0x00e4, 0x005f}; 909 const UChar sampleNonJavaIDStart[] = {0x0020, 0x2030, 0x0082}; 910 const UChar sampleJavaIDPart[] = {0x005f, 0x0032, 0x0045}; 911 const UChar sampleNonJavaIDPart[] = {0x2030, 0x2020, 0x0020}; 912 const UChar sampleUnicodeIDStart[] = {0x0250, 0x00e2, 0x0061}; 913 const UChar sampleNonUnicodeIDStart[] = {0x2000, 0x000a, 0x2019}; 914 const UChar sampleUnicodeIDPart[] = {0x005f, 0x0032, 0x0045}; 915 const UChar sampleNonUnicodeIDPart[] = {0x2030, 0x00a3, 0x0020}; 916 const UChar sampleIDIgnore[] = {0x0006, 0x0010, 0x206b}; 917 const UChar sampleNonIDIgnore[] = {0x0075, 0x00a3, 0x0061}; 918 919 int i; 920 921 log_verbose("Testing sampleJavaID start \n"); 922 for (i = 0; i < 3; i++) { 923 if (!(u_isJavaIDStart(sampleJavaIDStart[i])) || 924 (u_isJavaIDStart(sampleNonJavaIDStart[i]))) 925 log_err("Java ID Start char test error : %lx or %lx\n", 926 sampleJavaIDStart[i], sampleNonJavaIDStart[i]); 927 } 928 929 log_verbose("Testing sampleJavaID part \n"); 930 for (i = 0; i < 3; i++) { 931 if (!(u_isJavaIDPart(sampleJavaIDPart[i])) || 932 (u_isJavaIDPart(sampleNonJavaIDPart[i]))) 933 log_err("Java ID Part char test error : %lx or %lx\n", 934 sampleJavaIDPart[i], sampleNonJavaIDPart[i]); 935 } 936 937 log_verbose("Testing sampleUnicodeID start \n"); 938 for (i = 0; i < 3; i++) { 939 /* T_test_logln_ustr((int32_t)i); */ 940 if (!(u_isIDStart(sampleUnicodeIDStart[i])) || 941 (u_isIDStart(sampleNonUnicodeIDStart[i]))) 942 { 943 log_err("Unicode ID Start char test error : %lx or %lx\n", sampleUnicodeIDStart[i], 944 sampleNonUnicodeIDStart[i]); 945 } 946 } 947 948 log_verbose("Testing sample unicode ID part \n"); 949 for (i = 2; i < 3; i++) { /* nos *** starts with 2 instead of 0, until clarified */ 950 /* T_test_logln_ustr((int32_t)i); */ 951 if (!(u_isIDPart(sampleUnicodeIDPart[i])) || 952 (u_isIDPart(sampleNonUnicodeIDPart[i]))) 953 { 954 log_err("Unicode ID Part char test error : %lx or %lx", sampleUnicodeIDPart[i], sampleNonUnicodeIDPart[i]); 955 } 956 } 957 958 log_verbose("Testing sampleId ignore\n"); 959 for (i = 0; i < 3; i++) { 960 /*T_test_logln_ustr((int32_t)i); */ 961 if (!(u_isIDIgnorable(sampleIDIgnore[i])) || 962 (u_isIDIgnorable(sampleNonIDIgnore[i]))) 963 { 964 log_err("ID ignorable char test error : U+%04x or U+%04x\n", sampleIDIgnore[i], sampleNonIDIgnore[i]); 965 } 966 } 967 } 968 969 /* for each line of UnicodeData.txt, check some of the properties */ 970 /* 971 * ### TODO 972 * This test fails incorrectly if the First or Last code point of a repetitive area 973 * is overridden, which is allowed and is encouraged for the PUAs. 974 * Currently, this means that both area First/Last and override lines are 975 * tested against the properties from the API, 976 * and the area boundary will not match and cause an error. 977 * 978 * This function should detect area boundaries and skip them for the test of individual 979 * code points' properties. 980 * Then it should check that the areas contain all the same properties except where overridden. 981 * For this, it would have had to set a flag for which code points were listed explicitly. 982 */ 983 static void U_CALLCONV 984 unicodeDataLineFn(void *context, 985 char *fields[][2], int32_t fieldCount, 986 UErrorCode *pErrorCode) 987 { 988 char buffer[100]; 989 char *end; 990 uint32_t value; 991 UChar32 c; 992 int32_t i; 993 int8_t type; 994 995 /* get the character code, field 0 */ 996 c=strtoul(fields[0][0], &end, 16); 997 if(end<=fields[0][0] || end!=fields[0][1]) { 998 log_err("error: syntax error in field 0 at %s\n", fields[0][0]); 999 return; 1000 } 1001 if((uint32_t)c>=UCHAR_MAX_VALUE + 1) { 1002 log_err("error in UnicodeData.txt: code point %lu out of range\n", c); 1003 return; 1004 } 1005 1006 /* get general category, field 2 */ 1007 *fields[2][1]=0; 1008 type = (int8_t)tagValues[MakeProp(fields[2][0])]; 1009 if(u_charType(c)!=type) { 1010 log_err("error: u_charType(U+%04lx)==%u instead of %u\n", c, u_charType(c), type); 1011 } 1012 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) { 1013 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c); 1014 } 1015 1016 /* get canonical combining class, field 3 */ 1017 value=strtoul(fields[3][0], &end, 10); 1018 if(end<=fields[3][0] || end!=fields[3][1]) { 1019 log_err("error: syntax error in field 3 at code 0x%lx\n", c); 1020 return; 1021 } 1022 if(value>255) { 1023 log_err("error in UnicodeData.txt: combining class %lu out of range\n", value); 1024 return; 1025 } 1026 #if !UCONFIG_NO_NORMALIZATION 1027 if(value!=u_getCombiningClass(c) || value!=(uint32_t)u_getIntPropertyValue(c, UCHAR_CANONICAL_COMBINING_CLASS)) { 1028 log_err("error: u_getCombiningClass(U+%04lx)==%hu instead of %lu\n", c, u_getCombiningClass(c), value); 1029 } 1030 #endif 1031 1032 /* get BiDi category, field 4 */ 1033 *fields[4][1]=0; 1034 i=MakeDir(fields[4][0]); 1035 if(i!=u_charDirection(c) || i!=u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)) { 1036 log_err("error: u_charDirection(U+%04lx)==%u instead of %u (%s)\n", c, u_charDirection(c), MakeDir(fields[4][0]), fields[4][0]); 1037 } 1038 1039 /* get ISO Comment, field 11 */ 1040 *fields[11][1]=0; 1041 i=u_getISOComment(c, buffer, sizeof(buffer), pErrorCode); 1042 if(U_FAILURE(*pErrorCode) || 0!=strcmp(fields[11][0], buffer)) { 1043 log_err_status(*pErrorCode, "error: u_getISOComment(U+%04lx) wrong (%s): \"%s\" should be \"%s\"\n", 1044 c, u_errorName(*pErrorCode), 1045 U_FAILURE(*pErrorCode) ? buffer : "[error]", 1046 fields[11][0]); 1047 } 1048 1049 /* get uppercase mapping, field 12 */ 1050 if(fields[12][0]!=fields[12][1]) { 1051 value=strtoul(fields[12][0], &end, 16); 1052 if(end!=fields[12][1]) { 1053 log_err("error: syntax error in field 12 at code 0x%lx\n", c); 1054 return; 1055 } 1056 if((UChar32)value!=u_toupper(c)) { 1057 log_err("error: u_toupper(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_toupper(c), value); 1058 } 1059 } else { 1060 /* no case mapping: the API must map the code point to itself */ 1061 if(c!=u_toupper(c)) { 1062 log_err("error: U+%04lx does not have an uppercase mapping but u_toupper()==U+%04lx\n", c, u_toupper(c)); 1063 } 1064 } 1065 1066 /* get lowercase mapping, field 13 */ 1067 if(fields[13][0]!=fields[13][1]) { 1068 value=strtoul(fields[13][0], &end, 16); 1069 if(end!=fields[13][1]) { 1070 log_err("error: syntax error in field 13 at code 0x%lx\n", c); 1071 return; 1072 } 1073 if((UChar32)value!=u_tolower(c)) { 1074 log_err("error: u_tolower(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_tolower(c), value); 1075 } 1076 } else { 1077 /* no case mapping: the API must map the code point to itself */ 1078 if(c!=u_tolower(c)) { 1079 log_err("error: U+%04lx does not have a lowercase mapping but u_tolower()==U+%04lx\n", c, u_tolower(c)); 1080 } 1081 } 1082 1083 /* get titlecase mapping, field 14 */ 1084 if(fields[14][0]!=fields[14][1]) { 1085 value=strtoul(fields[14][0], &end, 16); 1086 if(end!=fields[14][1]) { 1087 log_err("error: syntax error in field 14 at code 0x%lx\n", c); 1088 return; 1089 } 1090 if((UChar32)value!=u_totitle(c)) { 1091 log_err("error: u_totitle(U+%04lx)==U+%04lx instead of U+%04lx\n", c, u_totitle(c), value); 1092 } 1093 } else { 1094 /* no case mapping: the API must map the code point to itself */ 1095 if(c!=u_totitle(c)) { 1096 log_err("error: U+%04lx does not have a titlecase mapping but u_totitle()==U+%04lx\n", c, u_totitle(c)); 1097 } 1098 } 1099 } 1100 1101 static UBool U_CALLCONV 1102 enumTypeRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) { 1103 static const UChar32 test[][2]={ 1104 {0x41, U_UPPERCASE_LETTER}, 1105 {0x308, U_NON_SPACING_MARK}, 1106 {0xfffe, U_GENERAL_OTHER_TYPES}, 1107 {0xe0041, U_FORMAT_CHAR}, 1108 {0xeffff, U_UNASSIGNED} 1109 }; 1110 1111 int32_t i, count; 1112 1113 if(0!=strcmp((const char *)context, "a1")) { 1114 log_err("error: u_enumCharTypes() passes on an incorrect context pointer\n"); 1115 return FALSE; 1116 } 1117 1118 count=LENGTHOF(test); 1119 for(i=0; i<count; ++i) { 1120 if(start<=test[i][0] && test[i][0]<limit) { 1121 if(type!=(UCharCategory)test[i][1]) { 1122 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld instead of U+%04lx with %ld\n", 1123 start, limit, (long)type, test[i][0], test[i][1]); 1124 } 1125 /* stop at the range that includes the last test code point (increases code coverage for enumeration) */ 1126 return i==(count-1) ? FALSE : TRUE; 1127 } 1128 } 1129 1130 if(start>test[count-1][0]) { 1131 log_err("error: u_enumCharTypes() has range [U+%04lx, U+%04lx[ with %ld after it should have stopped\n", 1132 start, limit, (long)type); 1133 return FALSE; 1134 } 1135 1136 return TRUE; 1137 } 1138 1139 static UBool U_CALLCONV 1140 enumDefaultsRange(const void *context, UChar32 start, UChar32 limit, UCharCategory type) { 1141 /* default Bidi classes for unassigned code points */ 1142 static const int32_t defaultBidi[][2]={ /* { limit, class } */ 1143 { 0x0590, U_LEFT_TO_RIGHT }, 1144 { 0x0600, U_RIGHT_TO_LEFT }, 1145 { 0x07C0, U_RIGHT_TO_LEFT_ARABIC }, 1146 { 0x0900, U_RIGHT_TO_LEFT }, 1147 { 0xFB1D, U_LEFT_TO_RIGHT }, 1148 { 0xFB50, U_RIGHT_TO_LEFT }, 1149 { 0xFE00, U_RIGHT_TO_LEFT_ARABIC }, 1150 { 0xFE70, U_LEFT_TO_RIGHT }, 1151 { 0xFF00, U_RIGHT_TO_LEFT_ARABIC }, 1152 { 0x10800, U_LEFT_TO_RIGHT }, 1153 { 0x11000, U_RIGHT_TO_LEFT }, 1154 { 0x110000, U_LEFT_TO_RIGHT } 1155 }; 1156 1157 UChar32 c; 1158 int32_t i; 1159 UCharDirection shouldBeDir; 1160 1161 /* 1162 * LineBreak.txt specifies: 1163 * # - Assigned characters that are not listed explicitly are given the value 1164 * # "AL". 1165 * # - Unassigned characters are given the value "XX". 1166 * 1167 * PUA characters are listed explicitly with "XX". 1168 * Verify that no assigned character has "XX". 1169 */ 1170 if(type!=U_UNASSIGNED && type!=U_PRIVATE_USE_CHAR) { 1171 c=start; 1172 while(c<limit) { 1173 if(0==u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) { 1174 log_err("error UCHAR_LINE_BREAK(assigned U+%04lx)=XX\n", c); 1175 } 1176 ++c; 1177 } 1178 } 1179 1180 /* 1181 * Verify default Bidi classes. 1182 * For recent Unicode versions, see UCD.html. 1183 * 1184 * For older Unicode versions: 1185 * See table 3-7 "Bidirectional Character Types" in UAX #9. 1186 * http://www.unicode.org/reports/tr9/ 1187 * 1188 * See also DerivedBidiClass.txt for Cn code points! 1189 * 1190 * Unicode 4.0.1/Public Review Issue #28 (http://www.unicode.org/review/resolved-pri.html) 1191 * changed some default values. 1192 * In particular, non-characters and unassigned Default Ignorable Code Points 1193 * change from L to BN. 1194 * 1195 * UCD.html version 4.0.1 does not yet reflect these changes. 1196 */ 1197 if(type==U_UNASSIGNED || type==U_PRIVATE_USE_CHAR) { 1198 /* enumerate the intersections of defaultBidi ranges with [start..limit[ */ 1199 c=start; 1200 for(i=0; i<LENGTHOF(defaultBidi) && c<limit; ++i) { 1201 if((int32_t)c<defaultBidi[i][0]) { 1202 while(c<limit && (int32_t)c<defaultBidi[i][0]) { 1203 if(U_IS_UNICODE_NONCHAR(c) || u_hasBinaryProperty(c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT)) { 1204 shouldBeDir=U_BOUNDARY_NEUTRAL; 1205 } else { 1206 shouldBeDir=(UCharDirection)defaultBidi[i][1]; 1207 } 1208 1209 if( u_charDirection(c)!=shouldBeDir || 1210 u_getIntPropertyValue(c, UCHAR_BIDI_CLASS)!=shouldBeDir 1211 ) { 1212 log_err("error: u_charDirection(unassigned/PUA U+%04lx)=%s should be %s\n", 1213 c, dirStrings[u_charDirection(c)], dirStrings[shouldBeDir]); 1214 } 1215 ++c; 1216 } 1217 } 1218 } 1219 } 1220 1221 return TRUE; 1222 } 1223 1224 /* tests for several properties */ 1225 static void TestUnicodeData() 1226 { 1227 UVersionInfo expectVersionArray; 1228 UVersionInfo versionArray; 1229 char *fields[15][2]; 1230 UErrorCode errorCode; 1231 UChar32 c; 1232 int8_t type; 1233 1234 u_versionFromString(expectVersionArray, U_UNICODE_VERSION); 1235 u_getUnicodeVersion(versionArray); 1236 if(memcmp(versionArray, expectVersionArray, U_MAX_VERSION_LENGTH) != 0) 1237 { 1238 log_err("Testing u_getUnicodeVersion() - expected " U_UNICODE_VERSION " got %d.%d.%d.%d\n", 1239 versionArray[0], versionArray[1], versionArray[2], versionArray[3]); 1240 } 1241 1242 #if defined(ICU_UNICODE_VERSION) 1243 /* test only happens where we have configure.in with UNICODE_VERSION - sanity check. */ 1244 if(strcmp(U_UNICODE_VERSION, ICU_UNICODE_VERSION)) 1245 { 1246 log_err("Testing configure.in's ICU_UNICODE_VERSION - expected " U_UNICODE_VERSION " got " ICU_UNICODE_VERSION "\n"); 1247 } 1248 #endif 1249 1250 if (ublock_getCode((UChar)0x0041) != UBLOCK_BASIC_LATIN || u_getIntPropertyValue(0x41, UCHAR_BLOCK)!=(int32_t)UBLOCK_BASIC_LATIN) { 1251 log_err("ublock_getCode(U+0041) property failed! Expected : %i Got: %i \n", UBLOCK_BASIC_LATIN,ublock_getCode((UChar)0x0041)); 1252 } 1253 1254 errorCode=U_ZERO_ERROR; 1255 parseUCDFile("UnicodeData.txt", fields, 15, unicodeDataLineFn, NULL, &errorCode); 1256 if(U_FAILURE(errorCode)) { 1257 return; /* if we couldn't parse UnicodeData.txt, we should return */ 1258 } 1259 1260 /* sanity check on repeated properties */ 1261 for(c=0xfffe; c<=0x10ffff;) { 1262 type=u_charType(c); 1263 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) { 1264 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c); 1265 } 1266 if(type!=U_UNASSIGNED) { 1267 log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c)); 1268 } 1269 if((c&0xffff)==0xfffe) { 1270 ++c; 1271 } else { 1272 c+=0xffff; 1273 } 1274 } 1275 1276 /* test that PUA is not "unassigned" */ 1277 for(c=0xe000; c<=0x10fffd;) { 1278 type=u_charType(c); 1279 if((uint32_t)u_getIntPropertyValue(c, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(type)) { 1280 log_err("error: (uint32_t)u_getIntPropertyValue(U+%04lx, UCHAR_GENERAL_CATEGORY_MASK)!=U_MASK(u_charType())\n", c); 1281 } 1282 if(type==U_UNASSIGNED) { 1283 log_err("error: u_charType(U+%04lx)==U_UNASSIGNED\n", c); 1284 } else if(type!=U_PRIVATE_USE_CHAR) { 1285 log_verbose("PUA override: u_charType(U+%04lx)=%d\n", c, type); 1286 } 1287 if(c==0xf8ff) { 1288 c=0xf0000; 1289 } else if(c==0xffffd) { 1290 c=0x100000; 1291 } else { 1292 ++c; 1293 } 1294 } 1295 1296 /* test u_enumCharTypes() */ 1297 u_enumCharTypes(enumTypeRange, "a1"); 1298 1299 /* check default properties */ 1300 u_enumCharTypes(enumDefaultsRange, NULL); 1301 } 1302 1303 static void TestCodeUnit(){ 1304 const UChar codeunit[]={0x0000,0xe065,0x20ac,0xd7ff,0xd800,0xd841,0xd905,0xdbff,0xdc00,0xdc02,0xddee,0xdfff,0}; 1305 1306 int32_t i; 1307 1308 for(i=0; i<(int32_t)(sizeof(codeunit)/sizeof(codeunit[0])); i++){ 1309 UChar c=codeunit[i]; 1310 if(i<4){ 1311 if(!(UTF_IS_SINGLE(c)) || (UTF_IS_LEAD(c)) || (UTF_IS_TRAIL(c)) ||(UTF_IS_SURROGATE(c))){ 1312 log_err("ERROR: U+%04x is a single", c); 1313 } 1314 1315 } 1316 if(i >= 4 && i< 8){ 1317 if(!(UTF_IS_LEAD(c)) || UTF_IS_SINGLE(c) || UTF_IS_TRAIL(c) || !(UTF_IS_SURROGATE(c))){ 1318 log_err("ERROR: U+%04x is a first surrogate", c); 1319 } 1320 } 1321 if(i >= 8 && i< 12){ 1322 if(!(UTF_IS_TRAIL(c)) || UTF_IS_SINGLE(c) || UTF_IS_LEAD(c) || !(UTF_IS_SURROGATE(c))){ 1323 log_err("ERROR: U+%04x is a second surrogate", c); 1324 } 1325 } 1326 } 1327 1328 } 1329 1330 static void TestCodePoint(){ 1331 const UChar32 codePoint[]={ 1332 /*surrogate, notvalid(codepoint), not a UnicodeChar, not Error */ 1333 0xd800, 1334 0xdbff, 1335 0xdc00, 1336 0xdfff, 1337 0xdc04, 1338 0xd821, 1339 /*not a surrogate, valid, isUnicodeChar , not Error*/ 1340 0x20ac, 1341 0xd7ff, 1342 0xe000, 1343 0xe123, 1344 0x0061, 1345 0xe065, 1346 0x20402, 1347 0x24506, 1348 0x23456, 1349 0x20402, 1350 0x10402, 1351 0x23456, 1352 /*not a surrogate, not valid, isUnicodeChar, isError */ 1353 0x0015, 1354 0x009f, 1355 /*not a surrogate, not valid, not isUnicodeChar, isError */ 1356 0xffff, 1357 0xfffe, 1358 }; 1359 int32_t i; 1360 for(i=0; i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0])); i++){ 1361 UChar32 c=codePoint[i]; 1362 if(i<6){ 1363 if(!UTF_IS_SURROGATE(c) || !U_IS_SURROGATE(c) || !U16_IS_SURROGATE(c)){ 1364 log_err("ERROR: isSurrogate() failed for U+%04x\n", c); 1365 } 1366 if(UTF_IS_VALID(c)){ 1367 log_err("ERROR: isValid() failed for U+%04x\n", c); 1368 } 1369 if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){ 1370 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c); 1371 } 1372 if(UTF_IS_ERROR(c)){ 1373 log_err("ERROR: isError() failed for U+%04x\n", c); 1374 } 1375 }else if(i >=6 && i<18){ 1376 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){ 1377 log_err("ERROR: isSurrogate() failed for U+%04x\n", c); 1378 } 1379 if(!UTF_IS_VALID(c)){ 1380 log_err("ERROR: isValid() failed for U+%04x\n", c); 1381 } 1382 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){ 1383 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c); 1384 } 1385 if(UTF_IS_ERROR(c)){ 1386 log_err("ERROR: isError() failed for U+%04x\n", c); 1387 } 1388 }else if(i >=18 && i<20){ 1389 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){ 1390 log_err("ERROR: isSurrogate() failed for U+%04x\n", c); 1391 } 1392 if(UTF_IS_VALID(c)){ 1393 log_err("ERROR: isValid() failed for U+%04x\n", c); 1394 } 1395 if(!UTF_IS_UNICODE_CHAR(c) || !U_IS_UNICODE_CHAR(c)){ 1396 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c); 1397 } 1398 if(!UTF_IS_ERROR(c)){ 1399 log_err("ERROR: isError() failed for U+%04x\n", c); 1400 } 1401 } 1402 else if(i >=18 && i<(int32_t)(sizeof(codePoint)/sizeof(codePoint[0]))){ 1403 if(UTF_IS_SURROGATE(c) || U_IS_SURROGATE(c) || U16_IS_SURROGATE(c)){ 1404 log_err("ERROR: isSurrogate() failed for U+%04x\n", c); 1405 } 1406 if(UTF_IS_VALID(c)){ 1407 log_err("ERROR: isValid() failed for U+%04x\n", c); 1408 } 1409 if(UTF_IS_UNICODE_CHAR(c) || U_IS_UNICODE_CHAR(c)){ 1410 log_err("ERROR: isUnicodeChar() failed for U+%04x\n", c); 1411 } 1412 if(!UTF_IS_ERROR(c)){ 1413 log_err("ERROR: isError() failed for U+%04x\n", c); 1414 } 1415 } 1416 } 1417 1418 if( 1419 !U_IS_BMP(0) || !U_IS_BMP(0x61) || !U_IS_BMP(0x20ac) || 1420 !U_IS_BMP(0xd9da) || !U_IS_BMP(0xdfed) || !U_IS_BMP(0xffff) || 1421 U_IS_BMP(U_SENTINEL) || U_IS_BMP(0x10000) || U_IS_BMP(0x50005) || 1422 U_IS_BMP(0x10ffff) || U_IS_BMP(0x110000) || U_IS_BMP(0x7fffffff) 1423 ) { 1424 log_err("error with U_IS_BMP()\n"); 1425 } 1426 1427 if( 1428 U_IS_SUPPLEMENTARY(0) || U_IS_SUPPLEMENTARY(0x61) || U_IS_SUPPLEMENTARY(0x20ac) || 1429 U_IS_SUPPLEMENTARY(0xd9da) || U_IS_SUPPLEMENTARY(0xdfed) || U_IS_SUPPLEMENTARY(0xffff) || 1430 U_IS_SUPPLEMENTARY(U_SENTINEL) || !U_IS_SUPPLEMENTARY(0x10000) || !U_IS_SUPPLEMENTARY(0x50005) || 1431 !U_IS_SUPPLEMENTARY(0x10ffff) || U_IS_SUPPLEMENTARY(0x110000) || U_IS_SUPPLEMENTARY(0x7fffffff) 1432 ) { 1433 log_err("error with U_IS_SUPPLEMENTARY()\n"); 1434 } 1435 } 1436 1437 static void TestCharLength() 1438 { 1439 const int32_t codepoint[]={ 1440 1, 0x0061, 1441 1, 0xe065, 1442 1, 0x20ac, 1443 2, 0x20402, 1444 2, 0x23456, 1445 2, 0x24506, 1446 2, 0x20402, 1447 2, 0x10402, 1448 1, 0xd7ff, 1449 1, 0xe000 1450 }; 1451 1452 int32_t i; 1453 UBool multiple; 1454 for(i=0; i<(int32_t)(sizeof(codepoint)/sizeof(codepoint[0])); i=(int16_t)(i+2)){ 1455 UChar32 c=codepoint[i+1]; 1456 if(UTF_CHAR_LENGTH(c) != codepoint[i] || U16_LENGTH(c) != codepoint[i]){ 1457 log_err("The no: of code units for U+%04x:- Expected: %d Got: %d\n", c, codepoint[i], UTF_CHAR_LENGTH(c)); 1458 } 1459 multiple=(UBool)(codepoint[i] == 1 ? FALSE : TRUE); 1460 if(UTF_NEED_MULTIPLE_UCHAR(c) != multiple){ 1461 log_err("ERROR: Unicode::needMultipleUChar() failed for U+%04x\n", c); 1462 } 1463 } 1464 } 1465 1466 /*internal functions ----*/ 1467 static int32_t MakeProp(char* str) 1468 { 1469 int32_t result = 0; 1470 char* matchPosition =0; 1471 1472 matchPosition = strstr(tagStrings, str); 1473 if (matchPosition == 0) 1474 { 1475 log_err("unrecognized type letter "); 1476 log_err(str); 1477 } 1478 else 1479 result = (int32_t)((matchPosition - tagStrings) / 2); 1480 return result; 1481 } 1482 1483 static int32_t MakeDir(char* str) 1484 { 1485 int32_t pos = 0; 1486 for (pos = 0; pos < 19; pos++) { 1487 if (strcmp(str, dirStrings[pos]) == 0) { 1488 return pos; 1489 } 1490 } 1491 return -1; 1492 } 1493 1494 /* test u_charName() -------------------------------------------------------- */ 1495 1496 static const struct { 1497 uint32_t code; 1498 const char *name, *oldName, *extName; 1499 } names[]={ 1500 {0x0061, "LATIN SMALL LETTER A", "", "LATIN SMALL LETTER A"}, 1501 {0x0284, "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK", "LATIN SMALL LETTER DOTLESS J BAR HOOK", "LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK" }, 1502 {0x3401, "CJK UNIFIED IDEOGRAPH-3401", "", "CJK UNIFIED IDEOGRAPH-3401" }, 1503 {0x7fed, "CJK UNIFIED IDEOGRAPH-7FED", "", "CJK UNIFIED IDEOGRAPH-7FED" }, 1504 {0xac00, "HANGUL SYLLABLE GA", "", "HANGUL SYLLABLE GA" }, 1505 {0xd7a3, "HANGUL SYLLABLE HIH", "", "HANGUL SYLLABLE HIH" }, 1506 {0xd800, "", "", "<lead surrogate-D800>" }, 1507 {0xdc00, "", "", "<trail surrogate-DC00>" }, 1508 {0xff08, "FULLWIDTH LEFT PARENTHESIS", "FULLWIDTH OPENING PARENTHESIS", "FULLWIDTH LEFT PARENTHESIS" }, 1509 {0xffe5, "FULLWIDTH YEN SIGN", "", "FULLWIDTH YEN SIGN" }, 1510 {0xffff, "", "", "<noncharacter-FFFF>" }, 1511 {0x23456, "CJK UNIFIED IDEOGRAPH-23456", "", "CJK UNIFIED IDEOGRAPH-23456" } 1512 }; 1513 1514 static UBool 1515 enumCharNamesFn(void *context, 1516 UChar32 code, UCharNameChoice nameChoice, 1517 const char *name, int32_t length) { 1518 int32_t *pCount=(int32_t *)context; 1519 int i; 1520 1521 if(length<=0 || length!=(int32_t)strlen(name)) { 1522 /* should not be called with an empty string or invalid length */ 1523 log_err("u_enumCharName(0x%lx)=%s but length=%ld\n", name, length); 1524 return TRUE; 1525 } 1526 1527 ++*pCount; 1528 for(i=0; i<sizeof(names)/sizeof(names[0]); ++i) { 1529 if(code==(UChar32)names[i].code) { 1530 switch (nameChoice) { 1531 case U_EXTENDED_CHAR_NAME: 1532 if(0!=strcmp(name, names[i].extName)) { 1533 log_err("u_enumCharName(0x%lx - Extended)=%s instead of %s\n", code, name, names[i].extName); 1534 } 1535 break; 1536 case U_UNICODE_CHAR_NAME: 1537 if(0!=strcmp(name, names[i].name)) { 1538 log_err("u_enumCharName(0x%lx)=%s instead of %s\n", code, name, names[i].name); 1539 } 1540 break; 1541 case U_UNICODE_10_CHAR_NAME: 1542 if(names[i].oldName[0]==0 || 0!=strcmp(name, names[i].oldName)) { 1543 log_err("u_enumCharName(0x%lx - 1.0)=%s instead of %s\n", code, name, names[i].oldName); 1544 } 1545 break; 1546 case U_CHAR_NAME_CHOICE_COUNT: 1547 break; 1548 } 1549 break; 1550 } 1551 } 1552 return TRUE; 1553 } 1554 1555 struct enumExtCharNamesContext { 1556 uint32_t length; 1557 int32_t last; 1558 }; 1559 1560 static UBool 1561 enumExtCharNamesFn(void *context, 1562 UChar32 code, UCharNameChoice nameChoice, 1563 const char *name, int32_t length) { 1564 struct enumExtCharNamesContext *ecncp = (struct enumExtCharNamesContext *) context; 1565 1566 if (ecncp->last != (int32_t) code - 1) { 1567 if (ecncp->last < 0) { 1568 log_err("u_enumCharName(0x%lx - Ext) after u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x%lx - Ext)\n", code, ecncp->last, ecncp->last + 1); 1569 } else { 1570 log_err("u_enumCharName(0x%lx - Ext) instead of u_enumCharName(0x0 - Ext)\n", code); 1571 } 1572 } 1573 ecncp->last = (int32_t) code; 1574 1575 if (!*name) { 1576 log_err("u_enumCharName(0x%lx - Ext) should not be an empty string\n", code); 1577 } 1578 1579 return enumCharNamesFn(&ecncp->length, code, nameChoice, name, length); 1580 } 1581 1582 /** 1583 * This can be made more efficient by moving it into putil.c and having 1584 * it directly access the ebcdic translation tables. 1585 * TODO: If we get this method in putil.c, then delete it from here. 1586 */ 1587 static UChar 1588 u_charToUChar(char c) { 1589 UChar uc; 1590 u_charsToUChars(&c, &uc, 1); 1591 return uc; 1592 } 1593 1594 static void 1595 TestCharNames() { 1596 static char name[80]; 1597 UErrorCode errorCode=U_ZERO_ERROR; 1598 struct enumExtCharNamesContext extContext; 1599 int32_t length; 1600 UChar32 c; 1601 int32_t i; 1602 1603 log_verbose("Testing uprv_getMaxCharNameLength()\n"); 1604 length=uprv_getMaxCharNameLength(); 1605 if(length==0) { 1606 /* no names data available */ 1607 return; 1608 } 1609 if(length<83) { /* Unicode 3.2 max char name length */ 1610 log_err("uprv_getMaxCharNameLength()=%d is too short"); 1611 } 1612 /* ### TODO same tests for max ISO comment length as for max name length */ 1613 1614 log_verbose("Testing u_charName()\n"); 1615 for(i=0; i<(int32_t)(sizeof(names)/sizeof(names[0])); ++i) { 1616 /* modern Unicode character name */ 1617 length=u_charName(names[i].code, U_UNICODE_CHAR_NAME, name, sizeof(name), &errorCode); 1618 if(U_FAILURE(errorCode)) { 1619 log_err("u_charName(0x%lx) error %s\n", names[i].code, u_errorName(errorCode)); 1620 return; 1621 } 1622 if(length<0 || 0!=strcmp(name, names[i].name) || length!=(uint16_t)strlen(name)) { 1623 log_err("u_charName(0x%lx) gets: %s (length %ld) instead of: %s\n", names[i].code, name, length, names[i].name); 1624 } 1625 1626 /* find the modern name */ 1627 if (*names[i].name) { 1628 c=u_charFromName(U_UNICODE_CHAR_NAME, names[i].name, &errorCode); 1629 if(U_FAILURE(errorCode)) { 1630 log_err("u_charFromName(%s) error %s\n", names[i].name, u_errorName(errorCode)); 1631 return; 1632 } 1633 if(c!=(UChar32)names[i].code) { 1634 log_err("u_charFromName(%s) gets 0x%lx instead of 0x%lx\n", names[i].name, c, names[i].code); 1635 } 1636 } 1637 1638 /* Unicode 1.0 character name */ 1639 length=u_charName(names[i].code, U_UNICODE_10_CHAR_NAME, name, sizeof(name), &errorCode); 1640 if(U_FAILURE(errorCode)) { 1641 log_err("u_charName(0x%lx - 1.0) error %s\n", names[i].code, u_errorName(errorCode)); 1642 return; 1643 } 1644 if(length<0 || (length>0 && 0!=strcmp(name, names[i].oldName)) || length!=(uint16_t)strlen(name)) { 1645 log_err("u_charName(0x%lx - 1.0) gets %s length %ld instead of nothing or %s\n", names[i].code, name, length, names[i].oldName); 1646 } 1647 1648 /* find the Unicode 1.0 name if it is stored (length>0 means that we could read it) */ 1649 if(names[i].oldName[0]!=0 /* && length>0 */) { 1650 c=u_charFromName(U_UNICODE_10_CHAR_NAME, names[i].oldName, &errorCode); 1651 if(U_FAILURE(errorCode)) { 1652 log_err("u_charFromName(%s - 1.0) error %s\n", names[i].oldName, u_errorName(errorCode)); 1653 return; 1654 } 1655 if(c!=(UChar32)names[i].code) { 1656 log_err("u_charFromName(%s - 1.0) gets 0x%lx instead of 0x%lx\n", names[i].oldName, c, names[i].code); 1657 } 1658 } 1659 } 1660 1661 /* test u_enumCharNames() */ 1662 length=0; 1663 errorCode=U_ZERO_ERROR; 1664 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumCharNamesFn, &length, U_UNICODE_CHAR_NAME, &errorCode); 1665 if(U_FAILURE(errorCode) || length<94140) { 1666 log_err("u_enumCharNames(%ld..%lx) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE, u_errorName(errorCode), length); 1667 } 1668 1669 extContext.length = 0; 1670 extContext.last = -1; 1671 errorCode=U_ZERO_ERROR; 1672 u_enumCharNames(UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, enumExtCharNamesFn, &extContext, U_EXTENDED_CHAR_NAME, &errorCode); 1673 if(U_FAILURE(errorCode) || extContext.length<UCHAR_MAX_VALUE + 1) { 1674 log_err("u_enumCharNames(%ld..0x%lx - Extended) error %s names count=%ld\n", UCHAR_MIN_VALUE, UCHAR_MAX_VALUE + 1, u_errorName(errorCode), extContext.length); 1675 } 1676 1677 /* test that u_charFromName() uppercases the input name, i.e., works with mixed-case names (new in 2.0) */ 1678 if(0x61!=u_charFromName(U_UNICODE_CHAR_NAME, "LATin smALl letTER A", &errorCode)) { 1679 log_err("u_charFromName(U_UNICODE_CHAR_NAME, \"LATin smALl letTER A\") did not find U+0061 (%s)\n", u_errorName(errorCode)); 1680 } 1681 1682 /* Test getCharNameCharacters */ 1683 if(!QUICK) { 1684 enum { BUFSIZE = 256 }; 1685 UErrorCode ec = U_ZERO_ERROR; 1686 char buf[BUFSIZE]; 1687 int32_t maxLength; 1688 UChar32 cp; 1689 UChar pat[BUFSIZE], dumbPat[BUFSIZE]; 1690 int32_t l1, l2; 1691 UBool map[256]; 1692 UBool ok; 1693 1694 USet* set = uset_open(1, 0); /* empty set */ 1695 USet* dumb = uset_open(1, 0); /* empty set */ 1696 1697 /* 1698 * uprv_getCharNameCharacters() will likely return more lowercase 1699 * letters than actual character names contain because 1700 * it includes all the characters in lowercased names of 1701 * general categories, for the full possible set of extended names. 1702 */ 1703 { 1704 USetAdder sa={ 1705 NULL, 1706 uset_add, 1707 uset_addRange, 1708 uset_addString, 1709 NULL /* don't need remove() */ 1710 }; 1711 sa.set=set; 1712 uprv_getCharNameCharacters(&sa); 1713 } 1714 1715 /* build set the dumb (but sure-fire) way */ 1716 for (i=0; i<256; ++i) { 1717 map[i] = FALSE; 1718 } 1719 1720 maxLength=0; 1721 for (cp=0; cp<0x110000; ++cp) { 1722 int32_t len = u_charName(cp, U_EXTENDED_CHAR_NAME, 1723 buf, BUFSIZE, &ec); 1724 if (U_FAILURE(ec)) { 1725 log_err("FAIL: u_charName failed when it shouldn't\n"); 1726 uset_close(set); 1727 uset_close(dumb); 1728 return; 1729 } 1730 if(len>maxLength) { 1731 maxLength=len; 1732 } 1733 1734 for (i=0; i<len; ++i) { 1735 if (!map[(uint8_t) buf[i]]) { 1736 uset_add(dumb, (UChar32)u_charToUChar(buf[i])); 1737 map[(uint8_t) buf[i]] = TRUE; 1738 } 1739 } 1740 1741 /* test for leading/trailing whitespace */ 1742 if(buf[0]==' ' || buf[0]=='\t' || buf[len-1]==' ' || buf[len-1]=='\t') { 1743 log_err("u_charName(U+%04x) returns a name with leading or trailing whitespace\n", cp); 1744 } 1745 } 1746 1747 if(map[(uint8_t)'\t']) { 1748 log_err("u_charName() returned a name with a TAB for some code point\n", cp); 1749 } 1750 1751 length=uprv_getMaxCharNameLength(); 1752 if(length!=maxLength) { 1753 log_err("uprv_getMaxCharNameLength()=%d differs from the maximum length %d of all extended names\n", 1754 length, maxLength); 1755 } 1756 1757 /* compare the sets. Where is my uset_equals?!! */ 1758 ok=TRUE; 1759 for(i=0; i<256; ++i) { 1760 if(uset_contains(set, i)!=uset_contains(dumb, i)) { 1761 if(0x61<=i && i<=0x7a /* a-z */ && uset_contains(set, i) && !uset_contains(dumb, i)) { 1762 /* ignore lowercase a-z that are in set but not in dumb */ 1763 ok=TRUE; 1764 } else { 1765 ok=FALSE; 1766 break; 1767 } 1768 } 1769 } 1770 1771 l1 = uset_toPattern(set, pat, BUFSIZE, TRUE, &ec); 1772 l2 = uset_toPattern(dumb, dumbPat, BUFSIZE, TRUE, &ec); 1773 if (U_FAILURE(ec)) { 1774 log_err("FAIL: uset_toPattern failed when it shouldn't\n"); 1775 uset_close(set); 1776 uset_close(dumb); 1777 return; 1778 } 1779 1780 if (l1 >= BUFSIZE) { 1781 l1 = BUFSIZE-1; 1782 pat[l1] = 0; 1783 } 1784 if (l2 >= BUFSIZE) { 1785 l2 = BUFSIZE-1; 1786 dumbPat[l2] = 0; 1787 } 1788 1789 if (!ok) { 1790 log_err("FAIL: uprv_getCharNameCharacters() returned %s, expected %s (too many lowercase a-z are ok)\n", 1791 aescstrdup(pat, l1), aescstrdup(dumbPat, l2)); 1792 } else if(VERBOSITY) { 1793 log_verbose("Ok: uprv_getCharNameCharacters() returned %s\n", aescstrdup(pat, l1)); 1794 } 1795 1796 uset_close(set); 1797 uset_close(dumb); 1798 } 1799 1800 /* ### TODO: test error cases and other interesting things */ 1801 } 1802 1803 /* test u_isMirrored() and u_charMirror() ----------------------------------- */ 1804 1805 static void 1806 TestMirroring() { 1807 USet *set; 1808 UErrorCode errorCode; 1809 1810 UChar32 start, end, c2, c3; 1811 int32_t i; 1812 1813 U_STRING_DECL(mirroredPattern, "[:Bidi_Mirrored:]", 17); 1814 1815 U_STRING_INIT(mirroredPattern, "[:Bidi_Mirrored:]", 17); 1816 1817 log_verbose("Testing u_isMirrored()\n"); 1818 if(!(u_isMirrored(0x28) && u_isMirrored(0xbb) && u_isMirrored(0x2045) && u_isMirrored(0x232a) && 1819 !u_isMirrored(0x27) && !u_isMirrored(0x61) && !u_isMirrored(0x284) && !u_isMirrored(0x3400) 1820 ) 1821 ) { 1822 log_err("u_isMirrored() does not work correctly\n"); 1823 } 1824 1825 log_verbose("Testing u_charMirror()\n"); 1826 if(!(u_charMirror(0x3c)==0x3e && u_charMirror(0x5d)==0x5b && u_charMirror(0x208d)==0x208e && u_charMirror(0x3017)==0x3016 && 1827 u_charMirror(0xbb)==0xab && u_charMirror(0x2215)==0x29F5 && u_charMirror(0x29F5)==0x2215 && /* large delta between the code points */ 1828 u_charMirror(0x2e)==0x2e && u_charMirror(0x6f3)==0x6f3 && u_charMirror(0x301c)==0x301c && u_charMirror(0xa4ab)==0xa4ab && 1829 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */ 1830 u_charMirror(0x2018)==0x2018 && u_charMirror(0x201b)==0x201b && u_charMirror(0x301d)==0x301d 1831 ) 1832 ) { 1833 log_err("u_charMirror() does not work correctly\n"); 1834 } 1835 1836 /* verify that Bidi_Mirroring_Glyph roundtrips */ 1837 errorCode=U_ZERO_ERROR; 1838 set=uset_openPattern(mirroredPattern, 17, &errorCode); 1839 1840 if (U_FAILURE(errorCode)) { 1841 log_data_err("uset_openPattern(mirroredPattern, 17, &errorCode) failed!\n"); 1842 } else { 1843 for(i=0; 0==uset_getItem(set, i, &start, &end, NULL, 0, &errorCode); ++i) { 1844 do { 1845 c2=u_charMirror(start); 1846 c3=u_charMirror(c2); 1847 if(c3!=start) { 1848 log_err("u_charMirror() does not roundtrip: U+%04lx->U+%04lx->U+%04lx\n", (long)start, (long)c2, (long)c3); 1849 } 1850 } while(++start<=end); 1851 } 1852 } 1853 1854 uset_close(set); 1855 } 1856 1857 1858 struct RunTestData 1859 { 1860 const char *runText; 1861 UScriptCode runCode; 1862 }; 1863 1864 typedef struct RunTestData RunTestData; 1865 1866 static void 1867 CheckScriptRuns(UScriptRun *scriptRun, int32_t *runStarts, const RunTestData *testData, int32_t nRuns, 1868 const char *prefix) 1869 { 1870 int32_t run, runStart, runLimit; 1871 UScriptCode runCode; 1872 1873 /* iterate over all the runs */ 1874 run = 0; 1875 while (uscript_nextRun(scriptRun, &runStart, &runLimit, &runCode)) { 1876 if (runStart != runStarts[run]) { 1877 log_err("%s: incorrect start offset for run %d: expected %d, got %d\n", 1878 prefix, run, runStarts[run], runStart); 1879 } 1880 1881 if (runLimit != runStarts[run + 1]) { 1882 log_err("%s: incorrect limit offset for run %d: expected %d, got %d\n", 1883 prefix, run, runStarts[run + 1], runLimit); 1884 } 1885 1886 if (runCode != testData[run].runCode) { 1887 log_err("%s: incorrect script for run %d: expected \"%s\", got \"%s\"\n", 1888 prefix, run, uscript_getName(testData[run].runCode), uscript_getName(runCode)); 1889 } 1890 1891 run += 1; 1892 1893 /* stop when we've seen all the runs we expect to see */ 1894 if (run >= nRuns) { 1895 break; 1896 } 1897 } 1898 1899 /* Complain if we didn't see then number of runs we expected */ 1900 if (run != nRuns) { 1901 log_err("%s: incorrect number of runs: expected %d, got %d\n", prefix, run, nRuns); 1902 } 1903 } 1904 1905 static void 1906 TestUScriptRunAPI() 1907 { 1908 static const RunTestData testData1[] = { 1909 {"\\u0020\\u0946\\u0939\\u093F\\u0928\\u094D\\u0926\\u0940\\u0020", USCRIPT_DEVANAGARI}, 1910 {"\\u0627\\u0644\\u0639\\u0631\\u0628\\u064A\\u0629\\u0020", USCRIPT_ARABIC}, 1911 {"\\u0420\\u0443\\u0441\\u0441\\u043A\\u0438\\u0439\\u0020", USCRIPT_CYRILLIC}, 1912 {"English (", USCRIPT_LATIN}, 1913 {"\\u0E44\\u0E17\\u0E22", USCRIPT_THAI}, 1914 {") ", USCRIPT_LATIN}, 1915 {"\\u6F22\\u5B75", USCRIPT_HAN}, 1916 {"\\u3068\\u3072\\u3089\\u304C\\u306A\\u3068", USCRIPT_HIRAGANA}, 1917 {"\\u30AB\\u30BF\\u30AB\\u30CA", USCRIPT_KATAKANA}, 1918 {"\\U00010400\\U00010401\\U00010402\\U00010403", USCRIPT_DESERET} 1919 }; 1920 1921 static const RunTestData testData2[] = { 1922 {"((((((((((abc))))))))))", USCRIPT_LATIN} 1923 }; 1924 1925 static const struct { 1926 const RunTestData *testData; 1927 int32_t nRuns; 1928 } testDataEntries[] = { 1929 {testData1, LENGTHOF(testData1)}, 1930 {testData2, LENGTHOF(testData2)} 1931 }; 1932 1933 static const int32_t nTestEntries = LENGTHOF(testDataEntries); 1934 int32_t testEntry; 1935 1936 for (testEntry = 0; testEntry < nTestEntries; testEntry += 1) { 1937 UChar testString[1024]; 1938 int32_t runStarts[256]; 1939 int32_t nTestRuns = testDataEntries[testEntry].nRuns; 1940 const RunTestData *testData = testDataEntries[testEntry].testData; 1941 1942 int32_t run, stringLimit; 1943 UScriptRun *scriptRun = NULL; 1944 UErrorCode err; 1945 1946 /* 1947 * Fill in the test string and the runStarts array. 1948 */ 1949 stringLimit = 0; 1950 for (run = 0; run < nTestRuns; run += 1) { 1951 runStarts[run] = stringLimit; 1952 stringLimit += u_unescape(testData[run].runText, &testString[stringLimit], 1024 - stringLimit); 1953 /*stringLimit -= 1;*/ 1954 } 1955 1956 /* The limit of the last run */ 1957 runStarts[nTestRuns] = stringLimit; 1958 1959 /* 1960 * Make sure that calling uscript_OpenRun with a NULL text pointer 1961 * and a non-zero text length returns the correct error. 1962 */ 1963 err = U_ZERO_ERROR; 1964 scriptRun = uscript_openRun(NULL, stringLimit, &err); 1965 1966 if (err != U_ILLEGAL_ARGUMENT_ERROR) { 1967 log_err("uscript_openRun(NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err)); 1968 } 1969 1970 if (scriptRun != NULL) { 1971 log_err("uscript_openRun(NULL, stringLimit, &err) returned a non-NULL result.\n"); 1972 uscript_closeRun(scriptRun); 1973 } 1974 1975 /* 1976 * Make sure that calling uscript_OpenRun with a non-NULL text pointer 1977 * and a zero text length returns the correct error. 1978 */ 1979 err = U_ZERO_ERROR; 1980 scriptRun = uscript_openRun(testString, 0, &err); 1981 1982 if (err != U_ILLEGAL_ARGUMENT_ERROR) { 1983 log_err("uscript_openRun(testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err)); 1984 } 1985 1986 if (scriptRun != NULL) { 1987 log_err("uscript_openRun(testString, 0, &err) returned a non-NULL result.\n"); 1988 uscript_closeRun(scriptRun); 1989 } 1990 1991 /* 1992 * Make sure that calling uscript_openRun with a NULL text pointer 1993 * and a zero text length doesn't return an error. 1994 */ 1995 err = U_ZERO_ERROR; 1996 scriptRun = uscript_openRun(NULL, 0, &err); 1997 1998 if (U_FAILURE(err)) { 1999 log_err("Got error %s from uscript_openRun(NULL, 0, &err)\n", u_errorName(err)); 2000 } 2001 2002 /* Make sure that the empty iterator doesn't find any runs */ 2003 if (uscript_nextRun(scriptRun, NULL, NULL, NULL)) { 2004 log_err("uscript_nextRun(...) returned TRUE for an empty iterator.\n"); 2005 } 2006 2007 /* 2008 * Make sure that calling uscript_setRunText with a NULL text pointer 2009 * and a non-zero text length returns the correct error. 2010 */ 2011 err = U_ZERO_ERROR; 2012 uscript_setRunText(scriptRun, NULL, stringLimit, &err); 2013 2014 if (err != U_ILLEGAL_ARGUMENT_ERROR) { 2015 log_err("uscript_setRunText(scriptRun, NULL, stringLimit, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err)); 2016 } 2017 2018 /* 2019 * Make sure that calling uscript_OpenRun with a non-NULL text pointer 2020 * and a zero text length returns the correct error. 2021 */ 2022 err = U_ZERO_ERROR; 2023 uscript_setRunText(scriptRun, testString, 0, &err); 2024 2025 if (err != U_ILLEGAL_ARGUMENT_ERROR) { 2026 log_err("uscript_setRunText(scriptRun, testString, 0, &err) returned %s instead of U_ILLEGAL_ARGUMENT_ERROR.\n", u_errorName(err)); 2027 } 2028 2029 /* 2030 * Now call uscript_setRunText on the empty iterator 2031 * and make sure that it works. 2032 */ 2033 err = U_ZERO_ERROR; 2034 uscript_setRunText(scriptRun, testString, stringLimit, &err); 2035 2036 if (U_FAILURE(err)) { 2037 log_err("Got error %s from uscript_setRunText(...)\n", u_errorName(err)); 2038 } else { 2039 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_setRunText"); 2040 } 2041 2042 uscript_closeRun(scriptRun); 2043 2044 /* 2045 * Now open an interator over the testString 2046 * using uscript_openRun and make sure that it works 2047 */ 2048 scriptRun = uscript_openRun(testString, stringLimit, &err); 2049 2050 if (U_FAILURE(err)) { 2051 log_err("Got error %s from uscript_openRun(...)\n", u_errorName(err)); 2052 } else { 2053 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_openRun"); 2054 } 2055 2056 /* Now reset the iterator, and make sure 2057 * that it still works. 2058 */ 2059 uscript_resetRun(scriptRun); 2060 2061 CheckScriptRuns(scriptRun, runStarts, testData, nTestRuns, "uscript_resetRun"); 2062 2063 /* Close the iterator */ 2064 uscript_closeRun(scriptRun); 2065 } 2066 } 2067 2068 /* test additional, non-core properties */ 2069 static void 2070 TestAdditionalProperties() { 2071 /* test data for u_charAge() */ 2072 static const struct { 2073 UChar32 c; 2074 UVersionInfo version; 2075 } charAges[]={ 2076 {0x41, { 1, 1, 0, 0 }}, 2077 {0xffff, { 1, 1, 0, 0 }}, 2078 {0x20ab, { 2, 0, 0, 0 }}, 2079 {0x2fffe, { 2, 0, 0, 0 }}, 2080 {0x20ac, { 2, 1, 0, 0 }}, 2081 {0xfb1d, { 3, 0, 0, 0 }}, 2082 {0x3f4, { 3, 1, 0, 0 }}, 2083 {0x10300, { 3, 1, 0, 0 }}, 2084 {0x220, { 3, 2, 0, 0 }}, 2085 {0xff60, { 3, 2, 0, 0 }} 2086 }; 2087 2088 /* test data for u_hasBinaryProperty() */ 2089 static const int32_t 2090 props[][3]={ /* code point, property, value */ 2091 { 0x0627, UCHAR_ALPHABETIC, TRUE }, 2092 { 0x1034a, UCHAR_ALPHABETIC, TRUE }, 2093 { 0x2028, UCHAR_ALPHABETIC, FALSE }, 2094 2095 { 0x0066, UCHAR_ASCII_HEX_DIGIT, TRUE }, 2096 { 0x0067, UCHAR_ASCII_HEX_DIGIT, FALSE }, 2097 2098 { 0x202c, UCHAR_BIDI_CONTROL, TRUE }, 2099 { 0x202f, UCHAR_BIDI_CONTROL, FALSE }, 2100 2101 { 0x003c, UCHAR_BIDI_MIRRORED, TRUE }, 2102 { 0x003d, UCHAR_BIDI_MIRRORED, FALSE }, 2103 2104 /* see Unicode Corrigendum #6 at http://www.unicode.org/versions/corrigendum6.html */ 2105 { 0x2018, UCHAR_BIDI_MIRRORED, FALSE }, 2106 { 0x201d, UCHAR_BIDI_MIRRORED, FALSE }, 2107 { 0x201f, UCHAR_BIDI_MIRRORED, FALSE }, 2108 { 0x301e, UCHAR_BIDI_MIRRORED, FALSE }, 2109 2110 { 0x058a, UCHAR_DASH, TRUE }, 2111 { 0x007e, UCHAR_DASH, FALSE }, 2112 2113 { 0x0c4d, UCHAR_DIACRITIC, TRUE }, 2114 { 0x3000, UCHAR_DIACRITIC, FALSE }, 2115 2116 { 0x0e46, UCHAR_EXTENDER, TRUE }, 2117 { 0x0020, UCHAR_EXTENDER, FALSE }, 2118 2119 #if !UCONFIG_NO_NORMALIZATION 2120 { 0xfb1d, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE }, 2121 { 0x1d15f, UCHAR_FULL_COMPOSITION_EXCLUSION, TRUE }, 2122 { 0xfb1e, UCHAR_FULL_COMPOSITION_EXCLUSION, FALSE }, 2123 2124 { 0x110a, UCHAR_NFD_INERT, TRUE }, /* Jamo L */ 2125 { 0x0308, UCHAR_NFD_INERT, FALSE }, 2126 2127 { 0x1164, UCHAR_NFKD_INERT, TRUE }, /* Jamo V */ 2128 { 0x1d79d, UCHAR_NFKD_INERT, FALSE }, /* math compat version of xi */ 2129 2130 { 0x0021, UCHAR_NFC_INERT, TRUE }, /* ! */ 2131 { 0x0061, UCHAR_NFC_INERT, FALSE }, /* a */ 2132 { 0x00e4, UCHAR_NFC_INERT, FALSE }, /* a-umlaut */ 2133 { 0x0102, UCHAR_NFC_INERT, FALSE }, /* a-breve */ 2134 { 0xac1c, UCHAR_NFC_INERT, FALSE }, /* Hangul LV */ 2135 { 0xac1d, UCHAR_NFC_INERT, TRUE }, /* Hangul LVT */ 2136 2137 { 0x1d79d, UCHAR_NFKC_INERT, FALSE }, /* math compat version of xi */ 2138 { 0x2a6d6, UCHAR_NFKC_INERT, TRUE }, /* Han, last of CJK ext. B */ 2139 2140 { 0x00e4, UCHAR_SEGMENT_STARTER, TRUE }, 2141 { 0x0308, UCHAR_SEGMENT_STARTER, FALSE }, 2142 { 0x110a, UCHAR_SEGMENT_STARTER, TRUE }, /* Jamo L */ 2143 { 0x1164, UCHAR_SEGMENT_STARTER, FALSE },/* Jamo V */ 2144 { 0xac1c, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LV */ 2145 { 0xac1d, UCHAR_SEGMENT_STARTER, TRUE }, /* Hangul LVT */ 2146 #endif 2147 2148 { 0x0044, UCHAR_HEX_DIGIT, TRUE }, 2149 { 0xff46, UCHAR_HEX_DIGIT, TRUE }, 2150 { 0x0047, UCHAR_HEX_DIGIT, FALSE }, 2151 2152 { 0x30fb, UCHAR_HYPHEN, TRUE }, 2153 { 0xfe58, UCHAR_HYPHEN, FALSE }, 2154 2155 { 0x2172, UCHAR_ID_CONTINUE, TRUE }, 2156 { 0x0307, UCHAR_ID_CONTINUE, TRUE }, 2157 { 0x005c, UCHAR_ID_CONTINUE, FALSE }, 2158 2159 { 0x2172, UCHAR_ID_START, TRUE }, 2160 { 0x007a, UCHAR_ID_START, TRUE }, 2161 { 0x0039, UCHAR_ID_START, FALSE }, 2162 2163 { 0x4db5, UCHAR_IDEOGRAPHIC, TRUE }, 2164 { 0x2f999, UCHAR_IDEOGRAPHIC, TRUE }, 2165 { 0x2f99, UCHAR_IDEOGRAPHIC, FALSE }, 2166 2167 { 0x200c, UCHAR_JOIN_CONTROL, TRUE }, 2168 { 0x2029, UCHAR_JOIN_CONTROL, FALSE }, 2169 2170 { 0x1d7bc, UCHAR_LOWERCASE, TRUE }, 2171 { 0x0345, UCHAR_LOWERCASE, TRUE }, 2172 { 0x0030, UCHAR_LOWERCASE, FALSE }, 2173 2174 { 0x1d7a9, UCHAR_MATH, TRUE }, 2175 { 0x2135, UCHAR_MATH, TRUE }, 2176 { 0x0062, UCHAR_MATH, FALSE }, 2177 2178 { 0xfde1, UCHAR_NONCHARACTER_CODE_POINT, TRUE }, 2179 { 0x10ffff, UCHAR_NONCHARACTER_CODE_POINT, TRUE }, 2180 { 0x10fffd, UCHAR_NONCHARACTER_CODE_POINT, FALSE }, 2181 2182 { 0x0022, UCHAR_QUOTATION_MARK, TRUE }, 2183 { 0xff62, UCHAR_QUOTATION_MARK, TRUE }, 2184 { 0xd840, UCHAR_QUOTATION_MARK, FALSE }, 2185 2186 { 0x061f, UCHAR_TERMINAL_PUNCTUATION, TRUE }, 2187 { 0xe003f, UCHAR_TERMINAL_PUNCTUATION, FALSE }, 2188 2189 { 0x1d44a, UCHAR_UPPERCASE, TRUE }, 2190 { 0x2162, UCHAR_UPPERCASE, TRUE }, 2191 { 0x0345, UCHAR_UPPERCASE, FALSE }, 2192 2193 { 0x0020, UCHAR_WHITE_SPACE, TRUE }, 2194 { 0x202f, UCHAR_WHITE_SPACE, TRUE }, 2195 { 0x3001, UCHAR_WHITE_SPACE, FALSE }, 2196 2197 { 0x0711, UCHAR_XID_CONTINUE, TRUE }, 2198 { 0x1d1aa, UCHAR_XID_CONTINUE, TRUE }, 2199 { 0x007c, UCHAR_XID_CONTINUE, FALSE }, 2200 2201 { 0x16ee, UCHAR_XID_START, TRUE }, 2202 { 0x23456, UCHAR_XID_START, TRUE }, 2203 { 0x1d1aa, UCHAR_XID_START, FALSE }, 2204 2205 /* 2206 * Version break: 2207 * The following properties are only supported starting with the 2208 * Unicode version indicated in the second field. 2209 */ 2210 { -1, 0x320, 0 }, 2211 2212 { 0x180c, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE }, 2213 { 0xfe02, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, TRUE }, 2214 { 0x1801, UCHAR_DEFAULT_IGNORABLE_CODE_POINT, FALSE }, 2215 2216 { 0x0341, UCHAR_DEPRECATED, TRUE }, 2217 { 0xe0041, UCHAR_DEPRECATED, TRUE }, /* changed from Unicode 5 to 5.1 */ 2218 { 0xe0100, UCHAR_DEPRECATED, FALSE }, 2219 2220 { 0x00a0, UCHAR_GRAPHEME_BASE, TRUE }, 2221 { 0x0a4d, UCHAR_GRAPHEME_BASE, FALSE }, 2222 { 0xff9d, UCHAR_GRAPHEME_BASE, TRUE }, 2223 { 0xff9f, UCHAR_GRAPHEME_BASE, FALSE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */ 2224 2225 { 0x0300, UCHAR_GRAPHEME_EXTEND, TRUE }, 2226 { 0xff9d, UCHAR_GRAPHEME_EXTEND, FALSE }, 2227 { 0xff9f, UCHAR_GRAPHEME_EXTEND, TRUE }, /* changed from Unicode 3.2 to 4 and again from 5 to 5.1 */ 2228 { 0x0603, UCHAR_GRAPHEME_EXTEND, FALSE }, 2229 2230 { 0x0a4d, UCHAR_GRAPHEME_LINK, TRUE }, 2231 { 0xff9f, UCHAR_GRAPHEME_LINK, FALSE }, 2232 2233 { 0x2ff7, UCHAR_IDS_BINARY_OPERATOR, TRUE }, 2234 { 0x2ff3, UCHAR_IDS_BINARY_OPERATOR, FALSE }, 2235 2236 { 0x2ff3, UCHAR_IDS_TRINARY_OPERATOR, TRUE }, 2237 { 0x2f03, UCHAR_IDS_TRINARY_OPERATOR, FALSE }, 2238 2239 { 0x0ec1, UCHAR_LOGICAL_ORDER_EXCEPTION, TRUE }, 2240 { 0xdcba, UCHAR_LOGICAL_ORDER_EXCEPTION, FALSE }, 2241 2242 { 0x2e9b, UCHAR_RADICAL, TRUE }, 2243 { 0x4e00, UCHAR_RADICAL, FALSE }, 2244 2245 { 0x012f, UCHAR_SOFT_DOTTED, TRUE }, 2246 { 0x0049, UCHAR_SOFT_DOTTED, FALSE }, 2247 2248 { 0xfa11, UCHAR_UNIFIED_IDEOGRAPH, TRUE }, 2249 { 0xfa12, UCHAR_UNIFIED_IDEOGRAPH, FALSE }, 2250 2251 { -1, 0x401, 0 }, /* version break for Unicode 4.0.1 */ 2252 2253 { 0x002e, UCHAR_S_TERM, TRUE }, 2254 { 0x0061, UCHAR_S_TERM, FALSE }, 2255 2256 { 0x180c, UCHAR_VARIATION_SELECTOR, TRUE }, 2257 { 0xfe03, UCHAR_VARIATION_SELECTOR, TRUE }, 2258 { 0xe01ef, UCHAR_VARIATION_SELECTOR, TRUE }, 2259 { 0xe0200, UCHAR_VARIATION_SELECTOR, FALSE }, 2260 2261 /* enum/integer type properties */ 2262 2263 /* UCHAR_BIDI_CLASS tested for assigned characters in TestUnicodeData() */ 2264 /* test default Bidi classes for unassigned code points */ 2265 { 0x0590, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2266 { 0x05cf, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2267 { 0x05ed, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2268 { 0x07f2, UCHAR_BIDI_CLASS, U_DIR_NON_SPACING_MARK }, /* Nko, new in Unicode 5.0 */ 2269 { 0x07fe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, /* unassigned R */ 2270 { 0x08ba, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2271 { 0xfb37, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2272 { 0xfb42, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2273 { 0x10806, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2274 { 0x10909, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2275 { 0x10fe4, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT }, 2276 2277 { 0x0605, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2278 { 0x061c, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2279 { 0x063f, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2280 { 0x070e, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2281 { 0x0775, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2282 { 0xfbc2, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2283 { 0xfd90, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2284 { 0xfefe, UCHAR_BIDI_CLASS, U_RIGHT_TO_LEFT_ARABIC }, 2285 2286 { 0x02AF, UCHAR_BLOCK, UBLOCK_IPA_EXTENSIONS }, 2287 { 0x0C4E, UCHAR_BLOCK, UBLOCK_TELUGU }, 2288 { 0x155A, UCHAR_BLOCK, UBLOCK_UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS }, 2289 { 0x1717, UCHAR_BLOCK, UBLOCK_TAGALOG }, 2290 { 0x1900, UCHAR_BLOCK, UBLOCK_LIMBU }, 2291 { 0x1AFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK }, 2292 { 0x3040, UCHAR_BLOCK, UBLOCK_HIRAGANA }, 2293 { 0x1D0FF, UCHAR_BLOCK, UBLOCK_BYZANTINE_MUSICAL_SYMBOLS }, 2294 { 0x50000, UCHAR_BLOCK, UBLOCK_NO_BLOCK }, 2295 { 0xEFFFF, UCHAR_BLOCK, UBLOCK_NO_BLOCK }, 2296 { 0x10D0FF, UCHAR_BLOCK, UBLOCK_SUPPLEMENTARY_PRIVATE_USE_AREA_B }, 2297 2298 /* UCHAR_CANONICAL_COMBINING_CLASS tested for assigned characters in TestUnicodeData() */ 2299 { 0xd7d7, UCHAR_CANONICAL_COMBINING_CLASS, 0 }, 2300 2301 { 0x00A0, UCHAR_DECOMPOSITION_TYPE, U_DT_NOBREAK }, 2302 { 0x00A8, UCHAR_DECOMPOSITION_TYPE, U_DT_COMPAT }, 2303 { 0x00bf, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE }, 2304 { 0x00c0, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL }, 2305 { 0x1E9B, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL }, 2306 { 0xBCDE, UCHAR_DECOMPOSITION_TYPE, U_DT_CANONICAL }, 2307 { 0xFB5D, UCHAR_DECOMPOSITION_TYPE, U_DT_MEDIAL }, 2308 { 0x1D736, UCHAR_DECOMPOSITION_TYPE, U_DT_FONT }, 2309 { 0xe0033, UCHAR_DECOMPOSITION_TYPE, U_DT_NONE }, 2310 2311 { 0x0009, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL }, 2312 { 0x0020, UCHAR_EAST_ASIAN_WIDTH, U_EA_NARROW }, 2313 { 0x00B1, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS }, 2314 { 0x20A9, UCHAR_EAST_ASIAN_WIDTH, U_EA_HALFWIDTH }, 2315 { 0x2FFB, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, 2316 { 0x3000, UCHAR_EAST_ASIAN_WIDTH, U_EA_FULLWIDTH }, 2317 { 0x35bb, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, 2318 { 0x58bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, 2319 { 0xD7A3, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, 2320 { 0xEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS }, 2321 { 0x1D198, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL }, 2322 { 0x20000, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, 2323 { 0x2F8C7, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, 2324 { 0x3a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_WIDE }, /* plane 3 got default W values in Unicode 4 */ 2325 { 0x5a5bd, UCHAR_EAST_ASIAN_WIDTH, U_EA_NEUTRAL }, 2326 { 0xFEEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS }, 2327 { 0x10EEEE, UCHAR_EAST_ASIAN_WIDTH, U_EA_AMBIGUOUS }, 2328 2329 /* UCHAR_GENERAL_CATEGORY tested for assigned characters in TestUnicodeData() */ 2330 { 0xd7d7, UCHAR_GENERAL_CATEGORY, 0 }, 2331 2332 { 0x0444, UCHAR_JOINING_GROUP, U_JG_NO_JOINING_GROUP }, 2333 { 0x0639, UCHAR_JOINING_GROUP, U_JG_AIN }, 2334 { 0x072A, UCHAR_JOINING_GROUP, U_JG_DALATH_RISH }, 2335 { 0x0647, UCHAR_JOINING_GROUP, U_JG_HEH }, 2336 { 0x06C1, UCHAR_JOINING_GROUP, U_JG_HEH_GOAL }, 2337 { 0x06C3, UCHAR_JOINING_GROUP, U_JG_HAMZA_ON_HEH_GOAL }, 2338 2339 { 0x200C, UCHAR_JOINING_TYPE, U_JT_NON_JOINING }, 2340 { 0x200D, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING }, 2341 { 0x0639, UCHAR_JOINING_TYPE, U_JT_DUAL_JOINING }, 2342 { 0x0640, UCHAR_JOINING_TYPE, U_JT_JOIN_CAUSING }, 2343 { 0x06C3, UCHAR_JOINING_TYPE, U_JT_RIGHT_JOINING }, 2344 { 0x0300, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT }, 2345 { 0x070F, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT }, 2346 { 0xe0033, UCHAR_JOINING_TYPE, U_JT_TRANSPARENT }, 2347 2348 /* TestUnicodeData() verifies that no assigned character has "XX" (unknown) */ 2349 { 0xe7e7, UCHAR_LINE_BREAK, U_LB_UNKNOWN }, 2350 { 0x10fffd, UCHAR_LINE_BREAK, U_LB_UNKNOWN }, 2351 { 0x0028, UCHAR_LINE_BREAK, U_LB_OPEN_PUNCTUATION }, 2352 { 0x232A, UCHAR_LINE_BREAK, U_LB_CLOSE_PUNCTUATION }, 2353 { 0x3401, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC }, 2354 { 0x4e02, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC }, 2355 { 0x20004, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC }, 2356 { 0xf905, UCHAR_LINE_BREAK, U_LB_IDEOGRAPHIC }, 2357 { 0xdb7e, UCHAR_LINE_BREAK, U_LB_SURROGATE }, 2358 { 0xdbfd, UCHAR_LINE_BREAK, U_LB_SURROGATE }, 2359 { 0xdffc, UCHAR_LINE_BREAK, U_LB_SURROGATE }, 2360 { 0x2762, UCHAR_LINE_BREAK, U_LB_EXCLAMATION }, 2361 { 0x002F, UCHAR_LINE_BREAK, U_LB_BREAK_SYMBOLS }, 2362 { 0x1D49C, UCHAR_LINE_BREAK, U_LB_ALPHABETIC }, 2363 { 0x1731, UCHAR_LINE_BREAK, U_LB_ALPHABETIC }, 2364 2365 /* UCHAR_NUMERIC_TYPE tested in TestNumericProperties() */ 2366 2367 /* UCHAR_SCRIPT tested in TestUScriptCodeAPI() */ 2368 2369 { 0x1100, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, 2370 { 0x1111, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, 2371 { 0x1159, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, 2372 { 0x115f, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LEADING_JAMO }, 2373 2374 { 0x1160, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, 2375 { 0x1161, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, 2376 { 0x1172, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, 2377 { 0x11a2, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_VOWEL_JAMO }, 2378 2379 { 0x11a8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, 2380 { 0x11b8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, 2381 { 0x11c8, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, 2382 { 0x11f9, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_TRAILING_JAMO }, 2383 2384 { 0x115a, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2385 { 0x115e, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2386 { 0x11a3, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2387 { 0x11a7, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2388 { 0x11fa, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2389 { 0x11ff, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2390 2391 { 0xac00, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE }, 2392 { 0xac1c, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE }, 2393 { 0xc5ec, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE }, 2394 { 0xd788, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LV_SYLLABLE }, 2395 2396 { 0xac01, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE }, 2397 { 0xac1b, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE }, 2398 { 0xac1d, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE }, 2399 { 0xc5ee, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE }, 2400 { 0xd7a3, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_LVT_SYLLABLE }, 2401 2402 { 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 }, 2403 2404 { -1, 0x410, 0 }, /* version break for Unicode 4.1 */ 2405 2406 { 0x00d7, UCHAR_PATTERN_SYNTAX, TRUE }, 2407 { 0xfe45, UCHAR_PATTERN_SYNTAX, TRUE }, 2408 { 0x0061, UCHAR_PATTERN_SYNTAX, FALSE }, 2409 2410 { 0x0020, UCHAR_PATTERN_WHITE_SPACE, TRUE }, 2411 { 0x0085, UCHAR_PATTERN_WHITE_SPACE, TRUE }, 2412 { 0x200f, UCHAR_PATTERN_WHITE_SPACE, TRUE }, 2413 { 0x00a0, UCHAR_PATTERN_WHITE_SPACE, FALSE }, 2414 { 0x3000, UCHAR_PATTERN_WHITE_SPACE, FALSE }, 2415 2416 { 0x1d200, UCHAR_BLOCK, UBLOCK_ANCIENT_GREEK_MUSICAL_NOTATION }, 2417 { 0x2c8e, UCHAR_BLOCK, UBLOCK_COPTIC }, 2418 { 0xfe17, UCHAR_BLOCK, UBLOCK_VERTICAL_FORMS }, 2419 2420 { 0x1a00, UCHAR_SCRIPT, USCRIPT_BUGINESE }, 2421 { 0x2cea, UCHAR_SCRIPT, USCRIPT_COPTIC }, 2422 { 0xa82b, UCHAR_SCRIPT, USCRIPT_SYLOTI_NAGRI }, 2423 { 0x103d0, UCHAR_SCRIPT, USCRIPT_OLD_PERSIAN }, 2424 2425 { 0xcc28, UCHAR_LINE_BREAK, U_LB_H2 }, 2426 { 0xcc29, UCHAR_LINE_BREAK, U_LB_H3 }, 2427 { 0xac03, UCHAR_LINE_BREAK, U_LB_H3 }, 2428 { 0x115f, UCHAR_LINE_BREAK, U_LB_JL }, 2429 { 0x11aa, UCHAR_LINE_BREAK, U_LB_JT }, 2430 { 0x11a1, UCHAR_LINE_BREAK, U_LB_JV }, 2431 2432 { 0xb2c9, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_LVT }, 2433 { 0x036f, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_EXTEND }, 2434 { 0x0000, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_CONTROL }, 2435 { 0x1160, UCHAR_GRAPHEME_CLUSTER_BREAK, U_GCB_V }, 2436 2437 { 0x05f4, UCHAR_WORD_BREAK, U_WB_MIDLETTER }, 2438 { 0x4ef0, UCHAR_WORD_BREAK, U_WB_OTHER }, 2439 { 0x19d9, UCHAR_WORD_BREAK, U_WB_NUMERIC }, 2440 { 0x2044, UCHAR_WORD_BREAK, U_WB_MIDNUM }, 2441 2442 { 0xfffd, UCHAR_SENTENCE_BREAK, U_SB_OTHER }, 2443 { 0x1ffc, UCHAR_SENTENCE_BREAK, U_SB_UPPER }, 2444 { 0xff63, UCHAR_SENTENCE_BREAK, U_SB_CLOSE }, 2445 { 0x2028, UCHAR_SENTENCE_BREAK, U_SB_SEP }, 2446 2447 /* undefined UProperty values */ 2448 { 0x61, 0x4a7, 0 }, 2449 { 0x234bc, 0x15ed, 0 } 2450 }; 2451 2452 UVersionInfo version; 2453 UChar32 c; 2454 int32_t i, result, uVersion; 2455 UProperty which; 2456 2457 /* what is our Unicode version? */ 2458 u_getUnicodeVersion(version); 2459 uVersion=((int32_t)version[0]<<8)|(version[1]<<4)|version[2]; /* major/minor/update version numbers */ 2460 2461 u_charAge(0x20, version); 2462 if(version[0]==0) { 2463 /* no additional properties available */ 2464 log_err("TestAdditionalProperties: no additional properties available, not tested\n"); 2465 return; 2466 } 2467 2468 /* test u_charAge() */ 2469 for(i=0; i<sizeof(charAges)/sizeof(charAges[0]); ++i) { 2470 u_charAge(charAges[i].c, version); 2471 if(0!=memcmp(version, charAges[i].version, sizeof(UVersionInfo))) { 2472 log_err("error: u_charAge(U+%04lx)={ %u, %u, %u, %u } instead of { %u, %u, %u, %u }\n", 2473 charAges[i].c, 2474 version[0], version[1], version[2], version[3], 2475 charAges[i].version[0], charAges[i].version[1], charAges[i].version[2], charAges[i].version[3]); 2476 } 2477 } 2478 2479 if( u_getIntPropertyMinValue(UCHAR_DASH)!=0 || 2480 u_getIntPropertyMinValue(UCHAR_BIDI_CLASS)!=0 || 2481 u_getIntPropertyMinValue(UCHAR_BLOCK)!=0 || /* j2478 */ 2482 u_getIntPropertyMinValue(UCHAR_SCRIPT)!=0 || /*JB#2410*/ 2483 u_getIntPropertyMinValue(0x2345)!=0 2484 ) { 2485 log_err("error: u_getIntPropertyMinValue() wrong\n"); 2486 } 2487 if( u_getIntPropertyMaxValue(UCHAR_DASH)!=1) { 2488 log_err("error: u_getIntPropertyMaxValue(UCHAR_DASH) wrong\n"); 2489 } 2490 if( u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE)!=1) { 2491 log_err("error: u_getIntPropertyMaxValue(UCHAR_ID_CONTINUE) wrong\n"); 2492 } 2493 if( u_getIntPropertyMaxValue((UProperty)(UCHAR_BINARY_LIMIT-1))!=1) { 2494 log_err("error: u_getIntPropertyMaxValue(UCHAR_BINARY_LIMIT-1) wrong\n"); 2495 } 2496 if( u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS)!=(int32_t)U_CHAR_DIRECTION_COUNT-1 ) { 2497 log_err("error: u_getIntPropertyMaxValue(UCHAR_BIDI_CLASS) wrong\n"); 2498 } 2499 if( u_getIntPropertyMaxValue(UCHAR_BLOCK)!=(int32_t)UBLOCK_COUNT-1 ) { 2500 log_err("error: u_getIntPropertyMaxValue(UCHAR_BLOCK) wrong\n"); 2501 } 2502 if(u_getIntPropertyMaxValue(UCHAR_LINE_BREAK)!=(int32_t)U_LB_COUNT-1) { 2503 log_err("error: u_getIntPropertyMaxValue(UCHAR_LINE_BREAK) wrong\n"); 2504 } 2505 if(u_getIntPropertyMaxValue(UCHAR_SCRIPT)!=(int32_t)USCRIPT_CODE_LIMIT-1) { 2506 log_err("error: u_getIntPropertyMaxValue(UCHAR_SCRIPT) wrong\n"); 2507 } 2508 if(u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE)!=(int32_t)U_NT_COUNT-1) { 2509 log_err("error: u_getIntPropertyMaxValue(UCHAR_NUMERIC_TYPE) wrong\n"); 2510 } 2511 if(u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY)!=(int32_t)U_CHAR_CATEGORY_COUNT-1) { 2512 log_err("error: u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY) wrong\n"); 2513 } 2514 if(u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE)!=(int32_t)U_HST_COUNT-1) { 2515 log_err("error: u_getIntPropertyMaxValue(UCHAR_HANGUL_SYLLABLE_TYPE) wrong\n"); 2516 } 2517 if(u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK)!=(int32_t)U_GCB_COUNT-1) { 2518 log_err("error: u_getIntPropertyMaxValue(UCHAR_GRAPHEME_CLUSTER_BREAK) wrong\n"); 2519 } 2520 if(u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK)!=(int32_t)U_SB_COUNT-1) { 2521 log_err("error: u_getIntPropertyMaxValue(UCHAR_SENTENCE_BREAK) wrong\n"); 2522 } 2523 if(u_getIntPropertyMaxValue(UCHAR_WORD_BREAK)!=(int32_t)U_WB_COUNT-1) { 2524 log_err("error: u_getIntPropertyMaxValue(UCHAR_WORD_BREAK) wrong\n"); 2525 } 2526 /*JB#2410*/ 2527 if( u_getIntPropertyMaxValue(0x2345)!=-1) { 2528 log_err("error: u_getIntPropertyMaxValue(0x2345) wrong\n"); 2529 } 2530 if( u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) != (int32_t) (U_DT_COUNT - 1)) { 2531 log_err("error: u_getIntPropertyMaxValue(UCHAR_DECOMPOSITION_TYPE) wrong\n"); 2532 } 2533 if( u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) != (int32_t) (U_JG_COUNT -1)) { 2534 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_GROUP) wrong\n"); 2535 } 2536 if( u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) != (int32_t) (U_JT_COUNT -1)) { 2537 log_err("error: u_getIntPropertyMaxValue(UCHAR_JOINING_TYPE) wrong\n"); 2538 } 2539 if( u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) != (int32_t) (U_EA_COUNT -1)) { 2540 log_err("error: u_getIntPropertyMaxValue(UCHAR_EAST_ASIAN_WIDTH) wrong\n"); 2541 } 2542 2543 /* test u_hasBinaryProperty() and u_getIntPropertyValue() */ 2544 for(i=0; i<sizeof(props)/sizeof(props[0]); ++i) { 2545 if(props[i][0]<0) { 2546 /* Unicode version break */ 2547 if(uVersion<props[i][1]) { 2548 break; /* do not test properties that are not yet supported */ 2549 } else { 2550 continue; /* skip this row */ 2551 } 2552 } 2553 2554 c=(UChar32)props[i][0]; 2555 which=(UProperty)props[i][1]; 2556 2557 if(which<UCHAR_INT_START) { 2558 result=u_hasBinaryProperty(c, which); 2559 if(result!=props[i][2]) { 2560 log_err("error: u_hasBinaryProperty(U+%04lx, %d)=%d is wrong (props[%d])\n", 2561 c, which, result, i); 2562 } 2563 } 2564 2565 result=u_getIntPropertyValue(c, which); 2566 if(result!=props[i][2]) { 2567 log_err("error: u_getIntPropertyValue(U+%04lx, 0x1000+%d)=%d is wrong, should be %d (props[%d])\n", 2568 c, (int32_t)which-0x1000, result, props[i][2], i); 2569 } 2570 2571 /* test separate functions, too */ 2572 switch((UProperty)props[i][1]) { 2573 case UCHAR_ALPHABETIC: 2574 if(u_isUAlphabetic((UChar32)props[i][0])!=(UBool)props[i][2]) { 2575 log_err("error: u_isUAlphabetic(U+%04lx)=%d is wrong (props[%d])\n", 2576 props[i][0], result, i); 2577 } 2578 break; 2579 case UCHAR_LOWERCASE: 2580 if(u_isULowercase((UChar32)props[i][0])!=(UBool)props[i][2]) { 2581 log_err("error: u_isULowercase(U+%04lx)=%d is wrong (props[%d])\n", 2582 props[i][0], result, i); 2583 } 2584 break; 2585 case UCHAR_UPPERCASE: 2586 if(u_isUUppercase((UChar32)props[i][0])!=(UBool)props[i][2]) { 2587 log_err("error: u_isUUppercase(U+%04lx)=%d is wrong (props[%d])\n", 2588 props[i][0], result, i); 2589 } 2590 break; 2591 case UCHAR_WHITE_SPACE: 2592 if(u_isUWhiteSpace((UChar32)props[i][0])!=(UBool)props[i][2]) { 2593 log_err("error: u_isUWhiteSpace(U+%04lx)=%d is wrong (props[%d])\n", 2594 props[i][0], result, i); 2595 } 2596 break; 2597 default: 2598 break; 2599 } 2600 } 2601 } 2602 2603 static void 2604 TestNumericProperties(void) { 2605 /* see UnicodeData.txt, DerivedNumericValues.txt */ 2606 static const struct { 2607 UChar32 c; 2608 int32_t type; 2609 double numValue; 2610 } values[]={ 2611 { 0x0F33, U_NT_NUMERIC, -1./2. }, 2612 { 0x0C66, U_NT_DECIMAL, 0 }, 2613 { 0x96f6, U_NT_NUMERIC, 0 }, 2614 { 0x2159, U_NT_NUMERIC, 1./6. }, 2615 { 0x00BD, U_NT_NUMERIC, 1./2. }, 2616 { 0x0031, U_NT_DECIMAL, 1. }, 2617 { 0x4e00, U_NT_NUMERIC, 1. }, 2618 { 0x58f1, U_NT_NUMERIC, 1. }, 2619 { 0x10320, U_NT_NUMERIC, 1. }, 2620 { 0x0F2B, U_NT_NUMERIC, 3./2. }, 2621 { 0x00B2, U_NT_DIGIT, 2. }, 2622 { 0x5f10, U_NT_NUMERIC, 2. }, 2623 { 0x1813, U_NT_DECIMAL, 3. }, 2624 { 0x5f0e, U_NT_NUMERIC, 3. }, 2625 { 0x2173, U_NT_NUMERIC, 4. }, 2626 { 0x8086, U_NT_NUMERIC, 4. }, 2627 { 0x278E, U_NT_DIGIT, 5. }, 2628 { 0x1D7F2, U_NT_DECIMAL, 6. }, 2629 { 0x247A, U_NT_DIGIT, 7. }, 2630 { 0x7396, U_NT_NUMERIC, 9. }, 2631 { 0x1372, U_NT_NUMERIC, 10. }, 2632 { 0x216B, U_NT_NUMERIC, 12. }, 2633 { 0x16EE, U_NT_NUMERIC, 17. }, 2634 { 0x249A, U_NT_NUMERIC, 19. }, 2635 { 0x303A, U_NT_NUMERIC, 30. }, 2636 { 0x5345, U_NT_NUMERIC, 30. }, 2637 { 0x32B2, U_NT_NUMERIC, 37. }, 2638 { 0x1375, U_NT_NUMERIC, 40. }, 2639 { 0x10323, U_NT_NUMERIC, 50. }, 2640 { 0x0BF1, U_NT_NUMERIC, 100. }, 2641 { 0x964c, U_NT_NUMERIC, 100. }, 2642 { 0x217E, U_NT_NUMERIC, 500. }, 2643 { 0x2180, U_NT_NUMERIC, 1000. }, 2644 { 0x4edf, U_NT_NUMERIC, 1000. }, 2645 { 0x2181, U_NT_NUMERIC, 5000. }, 2646 { 0x137C, U_NT_NUMERIC, 10000. }, 2647 { 0x4e07, U_NT_NUMERIC, 10000. }, 2648 { 0x4ebf, U_NT_NUMERIC, 100000000. }, 2649 { 0x5146, U_NT_NUMERIC, 1000000000000. }, 2650 { 0x61, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2651 { 0x3000, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2652 { 0xfffe, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2653 { 0x10301, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2654 { 0xe0033, U_NT_NONE, U_NO_NUMERIC_VALUE }, 2655 { 0x10ffff, U_NT_NONE, U_NO_NUMERIC_VALUE } 2656 }; 2657 2658 double nv; 2659 UChar32 c; 2660 int32_t i, type; 2661 2662 for(i=0; i<LENGTHOF(values); ++i) { 2663 c=values[i].c; 2664 type=u_getIntPropertyValue(c, UCHAR_NUMERIC_TYPE); 2665 nv=u_getNumericValue(c); 2666 2667 if(type!=values[i].type) { 2668 log_err("UCHAR_NUMERIC_TYPE(U+%04lx)=%d should be %d\n", c, type, values[i].type); 2669 } 2670 if(0.000001 <= fabs(nv - values[i].numValue)) { 2671 log_err("u_getNumericValue(U+%04lx)=%g should be %g\n", c, nv, values[i].numValue); 2672 } 2673 } 2674 } 2675 2676 /** 2677 * Test the property names and property value names API. 2678 */ 2679 static void 2680 TestPropertyNames(void) { 2681 int32_t p, v, choice=0, rev; 2682 UBool atLeastSomething = FALSE; 2683 2684 for (p=0; ; ++p) { 2685 UProperty propEnum = (UProperty)p; 2686 UBool sawProp = FALSE; 2687 if(p > 10 && !atLeastSomething) { 2688 log_data_err("Never got anything after 10 tries.\nYour data is probably fried. Quitting this test\n", p, choice); 2689 return; 2690 } 2691 2692 for (choice=0; ; ++choice) { 2693 const char* name = u_getPropertyName(propEnum, (UPropertyNameChoice)choice); 2694 if (name) { 2695 if (!sawProp) 2696 log_verbose("prop 0x%04x+%2d:", p&~0xfff, p&0xfff); 2697 log_verbose("%d=\"%s\"", choice, name); 2698 sawProp = TRUE; 2699 atLeastSomething = TRUE; 2700 2701 /* test reverse mapping */ 2702 rev = u_getPropertyEnum(name); 2703 if (rev != p) { 2704 log_err("Property round-trip failure: %d -> %s -> %d\n", 2705 p, name, rev); 2706 } 2707 } 2708 if (!name && choice>0) break; 2709 } 2710 if (sawProp) { 2711 /* looks like a valid property; check the values */ 2712 const char* pname = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME); 2713 int32_t max = 0; 2714 if (p == UCHAR_CANONICAL_COMBINING_CLASS) { 2715 max = 255; 2716 } else if (p == UCHAR_GENERAL_CATEGORY_MASK) { 2717 /* it's far too slow to iterate all the way up to 2718 the real max, U_GC_P_MASK */ 2719 max = U_GC_NL_MASK; 2720 } else if (p == UCHAR_BLOCK) { 2721 /* UBlockCodes, unlike other values, start at 1 */ 2722 max = 1; 2723 } 2724 log_verbose("\n"); 2725 for (v=-1; ; ++v) { 2726 UBool sawValue = FALSE; 2727 for (choice=0; ; ++choice) { 2728 const char* vname = u_getPropertyValueName(propEnum, v, (UPropertyNameChoice)choice); 2729 if (vname) { 2730 if (!sawValue) log_verbose(" %s, value %d:", pname, v); 2731 log_verbose("%d=\"%s\"", choice, vname); 2732 sawValue = TRUE; 2733 2734 /* test reverse mapping */ 2735 rev = u_getPropertyValueEnum(propEnum, vname); 2736 if (rev != v) { 2737 log_err("Value round-trip failure (%s): %d -> %s -> %d\n", 2738 pname, v, vname, rev); 2739 } 2740 } 2741 if (!vname && choice>0) break; 2742 } 2743 if (sawValue) { 2744 log_verbose("\n"); 2745 } 2746 if (!sawValue && v>=max) break; 2747 } 2748 } 2749 if (!sawProp) { 2750 if (p>=UCHAR_STRING_LIMIT) { 2751 break; 2752 } else if (p>=UCHAR_DOUBLE_LIMIT) { 2753 p = UCHAR_STRING_START - 1; 2754 } else if (p>=UCHAR_MASK_LIMIT) { 2755 p = UCHAR_DOUBLE_START - 1; 2756 } else if (p>=UCHAR_INT_LIMIT) { 2757 p = UCHAR_MASK_START - 1; 2758 } else if (p>=UCHAR_BINARY_LIMIT) { 2759 p = UCHAR_INT_START - 1; 2760 } 2761 } 2762 } 2763 } 2764 2765 /** 2766 * Test the property values API. See JB#2410. 2767 */ 2768 static void 2769 TestPropertyValues(void) { 2770 int32_t i, p, min, max; 2771 UErrorCode ec; 2772 2773 /* Min should be 0 for everything. */ 2774 /* Until JB#2478 is fixed, the one exception is UCHAR_BLOCK. */ 2775 for (p=UCHAR_INT_START; p<UCHAR_INT_LIMIT; ++p) { 2776 UProperty propEnum = (UProperty)p; 2777 min = u_getIntPropertyMinValue(propEnum); 2778 if (min != 0) { 2779 if (p == UCHAR_BLOCK) { 2780 /* This is okay...for now. See JB#2487. 2781 TODO Update this for JB#2487. */ 2782 } else { 2783 const char* name; 2784 name = u_getPropertyName(propEnum, U_LONG_PROPERTY_NAME); 2785 if (name == NULL) 2786 name = "<ERROR>"; 2787 log_err("FAIL: u_getIntPropertyMinValue(%s) = %d, exp. 0\n", 2788 name, min); 2789 } 2790 } 2791 } 2792 2793 if( u_getIntPropertyMinValue(UCHAR_GENERAL_CATEGORY_MASK)!=0 || 2794 u_getIntPropertyMaxValue(UCHAR_GENERAL_CATEGORY_MASK)!=-1) { 2795 log_err("error: u_getIntPropertyMin/MaxValue(UCHAR_GENERAL_CATEGORY_MASK) is wrong\n"); 2796 } 2797 2798 /* Max should be -1 for invalid properties. */ 2799 max = u_getIntPropertyMaxValue(UCHAR_INVALID_CODE); 2800 if (max != -1) { 2801 log_err("FAIL: u_getIntPropertyMaxValue(-1) = %d, exp. -1\n", 2802 max); 2803 } 2804 2805 /* Script should return USCRIPT_INVALID_CODE for an invalid code point. */ 2806 for (i=0; i<2; ++i) { 2807 int32_t script; 2808 const char* desc; 2809 ec = U_ZERO_ERROR; 2810 switch (i) { 2811 case 0: 2812 script = uscript_getScript(-1, &ec); 2813 desc = "uscript_getScript(-1)"; 2814 break; 2815 case 1: 2816 script = u_getIntPropertyValue(-1, UCHAR_SCRIPT); 2817 desc = "u_getIntPropertyValue(-1, UCHAR_SCRIPT)"; 2818 break; 2819 default: 2820 log_err("Internal test error. Too many scripts\n"); 2821 return; 2822 } 2823 /* We don't explicitly test ec. It should be U_FAILURE but it 2824 isn't documented as such. */ 2825 if (script != (int32_t)USCRIPT_INVALID_CODE) { 2826 log_err("FAIL: %s = %d, exp. 0\n", 2827 desc, script); 2828 } 2829 } 2830 } 2831 2832 /* add characters from a serialized set to a normal one */ 2833 static void 2834 _setAddSerialized(USet *set, const USerializedSet *sset) { 2835 UChar32 start, end; 2836 int32_t i, count; 2837 2838 count=uset_getSerializedRangeCount(sset); 2839 for(i=0; i<count; ++i) { 2840 uset_getSerializedRange(sset, i, &start, &end); 2841 uset_addRange(set, start, end); 2842 } 2843 } 2844 2845 /* various tests for consistency of UCD data and API behavior */ 2846 static void 2847 TestConsistency() { 2848 #if !UCONFIG_NO_NORMALIZATION 2849 UChar buffer16[300]; 2850 #endif 2851 char buffer[300]; 2852 USet *set1, *set2, *set3, *set4; 2853 UErrorCode errorCode; 2854 2855 #if !UCONFIG_NO_NORMALIZATION 2856 USerializedSet sset; 2857 #endif 2858 UChar32 start, end; 2859 int32_t i, length; 2860 2861 U_STRING_DECL(hyphenPattern, "[:Hyphen:]", 10); 2862 U_STRING_DECL(dashPattern, "[:Dash:]", 8); 2863 U_STRING_DECL(lowerPattern, "[:Lowercase:]", 13); 2864 U_STRING_DECL(formatPattern, "[:Cf:]", 6); 2865 U_STRING_DECL(alphaPattern, "[:Alphabetic:]", 14); 2866 2867 U_STRING_DECL(mathBlocksPattern, 2868 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]", 2869 1+32+46+46+45+43+1+1); /* +1 for NUL */ 2870 U_STRING_DECL(mathPattern, "[:Math:]", 8); 2871 U_STRING_DECL(unassignedPattern, "[:Cn:]", 6); 2872 U_STRING_DECL(unknownPattern, "[:sc=Unknown:]", 14); 2873 U_STRING_DECL(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20); 2874 2875 U_STRING_INIT(hyphenPattern, "[:Hyphen:]", 10); 2876 U_STRING_INIT(dashPattern, "[:Dash:]", 8); 2877 U_STRING_INIT(lowerPattern, "[:Lowercase:]", 13); 2878 U_STRING_INIT(formatPattern, "[:Cf:]", 6); 2879 U_STRING_INIT(alphaPattern, "[:Alphabetic:]", 14); 2880 2881 U_STRING_INIT(mathBlocksPattern, 2882 "[[:block=Mathematical Operators:][:block=Miscellaneous Mathematical Symbols-A:][:block=Miscellaneous Mathematical Symbols-B:][:block=Supplemental Mathematical Operators:][:block=Mathematical Alphanumeric Symbols:]]", 2883 1+32+46+46+45+43+1+1); /* +1 for NUL */ 2884 U_STRING_INIT(mathPattern, "[:Math:]", 8); 2885 U_STRING_INIT(unassignedPattern, "[:Cn:]", 6); 2886 U_STRING_INIT(unknownPattern, "[:sc=Unknown:]", 14); 2887 U_STRING_INIT(reservedPattern, "[[:Cn:][:Co:][:Cs:]]", 20); 2888 2889 /* 2890 * It used to be that UCD.html and its precursors said 2891 * "Those dashes used to mark connections between pieces of words, 2892 * plus the Katakana middle dot." 2893 * 2894 * Unicode 4 changed 00AD Soft Hyphen to Cf and removed it from Dash 2895 * but not from Hyphen. 2896 * UTC 94 (2003mar) decided to leave it that way and to changed UCD.html. 2897 * Therefore, do not show errors when testing the Hyphen property. 2898 */ 2899 log_verbose("Starting with Unicode 4, inconsistencies with [:Hyphen:] are\n" 2900 "known to the UTC and not considered errors.\n"); 2901 2902 errorCode=U_ZERO_ERROR; 2903 set1=uset_openPattern(hyphenPattern, 10, &errorCode); 2904 set2=uset_openPattern(dashPattern, 8, &errorCode); 2905 if(U_SUCCESS(errorCode)) { 2906 /* remove the Katakana middle dot(s) from set1 */ 2907 uset_remove(set1, 0x30fb); 2908 uset_remove(set1, 0xff65); /* halfwidth variant */ 2909 showAMinusB(set1, set2, "[:Hyphen:]", "[:Dash:]", FALSE); 2910 } else { 2911 log_data_err("error opening [:Hyphen:] or [:Dash:] - %s (Are you missing data?)\n", u_errorName(errorCode)); 2912 } 2913 2914 /* check that Cf is neither Hyphen nor Dash nor Alphabetic */ 2915 set3=uset_openPattern(formatPattern, 6, &errorCode); 2916 set4=uset_openPattern(alphaPattern, 14, &errorCode); 2917 if(U_SUCCESS(errorCode)) { 2918 showAIntersectB(set3, set1, "[:Cf:]", "[:Hyphen:]", FALSE); 2919 showAIntersectB(set3, set2, "[:Cf:]", "[:Dash:]", TRUE); 2920 showAIntersectB(set3, set4, "[:Cf:]", "[:Alphabetic:]", TRUE); 2921 } else { 2922 log_data_err("error opening [:Cf:] or [:Alpbabetic:] - %s (Are you missing data?)\n", u_errorName(errorCode)); 2923 } 2924 2925 uset_close(set1); 2926 uset_close(set2); 2927 uset_close(set3); 2928 uset_close(set4); 2929 2930 /* 2931 * Check that each lowercase character has "small" in its name 2932 * and not "capital". 2933 * There are some such characters, some of which seem odd. 2934 * Use the verbose flag to see these notices. 2935 */ 2936 errorCode=U_ZERO_ERROR; 2937 set1=uset_openPattern(lowerPattern, 13, &errorCode); 2938 if(U_SUCCESS(errorCode)) { 2939 for(i=0;; ++i) { 2940 length=uset_getItem(set1, i, &start, &end, NULL, 0, &errorCode); 2941 if(errorCode==U_INDEX_OUTOFBOUNDS_ERROR) { 2942 break; /* done */ 2943 } 2944 if(U_FAILURE(errorCode)) { 2945 log_err("error iterating over [:Lowercase:] at item %d: %s\n", 2946 i, u_errorName(errorCode)); 2947 break; 2948 } 2949 if(length!=0) { 2950 break; /* done with code points, got a string or -1 */ 2951 } 2952 2953 while(start<=end) { 2954 length=u_charName(start, U_UNICODE_CHAR_NAME, buffer, sizeof(buffer), &errorCode); 2955 if(U_FAILURE(errorCode)) { 2956 log_err("error getting the name of U+%04x - %s\n", start, u_errorName(errorCode)); 2957 errorCode=U_ZERO_ERROR; 2958 continue; 2959 } 2960 if( (strstr(buffer, "SMALL")==NULL || strstr(buffer, "CAPITAL")!=NULL) && 2961 strstr(buffer, "SMALL CAPITAL")==NULL 2962 ) { 2963 log_verbose("info: [:Lowercase:] contains U+%04x whose name does not suggest lowercase: %s\n", start, buffer); 2964 } 2965 ++start; 2966 } 2967 } 2968 } else { 2969 log_data_err("error opening [:Lowercase:] - %s (Are you missing data?)\n", u_errorName(errorCode)); 2970 } 2971 uset_close(set1); 2972 2973 #if !UCONFIG_NO_NORMALIZATION 2974 2975 /* 2976 * Test for an example that unorm_getCanonStartSet() delivers 2977 * all characters that compose from the input one, 2978 * even in multiple steps. 2979 * For example, the set for "I" (0049) should contain both 2980 * I-diaeresis (00CF) and I-diaeresis-acute (1E2E). 2981 * In general, the set for the middle such character should be a subset 2982 * of the set for the first. 2983 */ 2984 set1=uset_open(1, 0); 2985 set2=uset_open(1, 0); 2986 2987 if (unorm_getCanonStartSet(0x49, &sset)) { 2988 _setAddSerialized(set1, &sset); 2989 2990 /* enumerate all characters that are plausible to be latin letters */ 2991 for(start=0xa0; start<0x2000; ++start) { 2992 if(unorm_getDecomposition(start, FALSE, buffer16, LENGTHOF(buffer16))>1 && buffer16[0]==0x49) { 2993 uset_add(set2, start); 2994 } 2995 } 2996 2997 compareUSets(set1, set2, 2998 "[canon start set of 0049]", "[all c with canon decomp with 0049]", 2999 TRUE); 3000 } else { 3001 log_err("error calling unorm_getCanonStartSet()\n"); 3002 } 3003 3004 uset_close(set1); 3005 uset_close(set2); 3006 3007 #endif 3008 3009 /* verify that all assigned characters in Math blocks are exactly Math characters */ 3010 errorCode=U_ZERO_ERROR; 3011 set1=uset_openPattern(mathBlocksPattern, -1, &errorCode); 3012 set2=uset_openPattern(mathPattern, 8, &errorCode); 3013 set3=uset_openPattern(unassignedPattern, 6, &errorCode); 3014 if(U_SUCCESS(errorCode)) { 3015 uset_retainAll(set2, set1); /* [math blocks]&[:Math:] */ 3016 uset_complement(set3); /* assigned characters */ 3017 uset_retainAll(set1, set3); /* [math blocks]&[assigned] */ 3018 compareUSets(set1, set2, 3019 "[assigned Math block chars]", "[math blocks]&[:Math:]", 3020 TRUE); 3021 } else { 3022 log_data_err("error opening [math blocks] or [:Math:] or [:Cn:] - %s (Are you missing data?)\n", u_errorName(errorCode)); 3023 } 3024 uset_close(set1); 3025 uset_close(set2); 3026 uset_close(set3); 3027 3028 /* new in Unicode 5.0: exactly all unassigned+PUA+surrogate code points have script=Unknown */ 3029 errorCode=U_ZERO_ERROR; 3030 set1=uset_openPattern(unknownPattern, 14, &errorCode); 3031 set2=uset_openPattern(reservedPattern, 20, &errorCode); 3032 if(U_SUCCESS(errorCode)) { 3033 compareUSets(set1, set2, 3034 "[:sc=Unknown:]", "[[:Cn:][:Co:][:Cs:]]", 3035 TRUE); 3036 } else { 3037 log_data_err("error opening [:sc=Unknown:] or [[:Cn:][:Co:][:Cs:]] - %s (Are you missing data?)\n", u_errorName(errorCode)); 3038 } 3039 uset_close(set1); 3040 uset_close(set2); 3041 } 3042 3043 /* 3044 * Starting with ICU4C 3.4, the core Unicode properties files 3045 * (uprops.icu, ucase.icu, ubidi.icu, unorm.icu) 3046 * are hardcoded in the common DLL and therefore not included 3047 * in the data package any more. 3048 * Test requiring these files are disabled so that 3049 * we need not jump through hoops (like adding snapshots of these files 3050 * to testdata). 3051 * See Jitterbug 4497. 3052 */ 3053 #define HARDCODED_DATA_4497 1 3054 3055 /* API coverage for ucase.c */ 3056 static void TestUCase() { 3057 #if !HARDCODED_DATA_4497 3058 UDataMemory *pData; 3059 UCaseProps *csp; 3060 const UCaseProps *ccsp; 3061 UErrorCode errorCode; 3062 3063 /* coverage for ucase_openBinary() */ 3064 errorCode=U_ZERO_ERROR; 3065 pData=udata_open(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, &errorCode); 3066 if(U_FAILURE(errorCode)) { 3067 log_data_err("unable to open " UCASE_DATA_NAME "." UCASE_DATA_TYPE ": %s\n", 3068 u_errorName(errorCode)); 3069 return; 3070 } 3071 3072 csp=ucase_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode); 3073 if(U_FAILURE(errorCode)) { 3074 log_err("ucase_openBinary() fails for the contents of " UCASE_DATA_NAME "." UCASE_DATA_TYPE ": %s\n", 3075 u_errorName(errorCode)); 3076 udata_close(pData); 3077 return; 3078 } 3079 3080 if(UCASE_LOWER!=ucase_getType(csp, 0xdf)) { /* verify islower(sharp s) */ 3081 log_err("ucase_openBinary() does not seem to return working UCaseProps\n"); 3082 } 3083 3084 ucase_close(csp); 3085 udata_close(pData); 3086 3087 /* coverage for ucase_getDummy() */ 3088 errorCode=U_ZERO_ERROR; 3089 ccsp=ucase_getDummy(&errorCode); 3090 if(ucase_tolower(ccsp, 0x41)!=0x41) { 3091 log_err("ucase_tolower(dummy, A)!=A\n"); 3092 } 3093 #endif 3094 } 3095 3096 /* API coverage for ubidi_props.c */ 3097 static void TestUBiDiProps() { 3098 #if !HARDCODED_DATA_4497 3099 UDataMemory *pData; 3100 UBiDiProps *bdp; 3101 const UBiDiProps *cbdp; 3102 UErrorCode errorCode; 3103 3104 /* coverage for ubidi_openBinary() */ 3105 errorCode=U_ZERO_ERROR; 3106 pData=udata_open(NULL, UBIDI_DATA_TYPE, UBIDI_DATA_NAME, &errorCode); 3107 if(U_FAILURE(errorCode)) { 3108 log_data_err("unable to open " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n", 3109 u_errorName(errorCode)); 3110 return; 3111 } 3112 3113 bdp=ubidi_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode); 3114 if(U_FAILURE(errorCode)) { 3115 log_err("ubidi_openBinary() fails for the contents of " UBIDI_DATA_NAME "." UBIDI_DATA_TYPE ": %s\n", 3116 u_errorName(errorCode)); 3117 udata_close(pData); 3118 return; 3119 } 3120 3121 if(0x2215!=ubidi_getMirror(bdp, 0x29F5)) { /* verify some data */ 3122 log_err("ubidi_openBinary() does not seem to return working UBiDiProps\n"); 3123 } 3124 3125 ubidi_closeProps(bdp); 3126 udata_close(pData); 3127 3128 /* coverage for ubidi_getDummy() */ 3129 errorCode=U_ZERO_ERROR; 3130 cbdp=ubidi_getDummy(&errorCode); 3131 if(ubidi_getClass(cbdp, 0x20)!=0) { 3132 log_err("ubidi_getClass(dummy, space)!=0\n"); 3133 } 3134 #endif 3135 } 3136 3137 /* test case folding, compare return values with CaseFolding.txt ------------ */ 3138 3139 /* bit set for which case foldings for a character have been tested already */ 3140 enum { 3141 CF_SIMPLE=1, 3142 CF_FULL=2, 3143 CF_TURKIC=4, 3144 CF_ALL=7 3145 }; 3146 3147 static void 3148 testFold(UChar32 c, int which, 3149 UChar32 simple, UChar32 turkic, 3150 const UChar *full, int32_t fullLength, 3151 const UChar *turkicFull, int32_t turkicFullLength) { 3152 UChar s[2], t[32]; 3153 UChar32 c2; 3154 int32_t length, length2; 3155 3156 UErrorCode errorCode=U_ZERO_ERROR; 3157 3158 length=0; 3159 U16_APPEND_UNSAFE(s, length, c); 3160 3161 if((which&CF_SIMPLE)!=0 && (c2=u_foldCase(c, 0))!=simple) { 3162 log_err("u_foldCase(U+%04lx, default)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple); 3163 } 3164 if((which&CF_FULL)!=0) { 3165 length2=u_strFoldCase(t, LENGTHOF(t), s, length, 0, &errorCode); 3166 if(length2!=fullLength || 0!=u_memcmp(t, full, fullLength)) { 3167 log_err("u_strFoldCase(U+%04lx, default) does not fold properly\n", (long)c); 3168 } 3169 } 3170 if((which&CF_TURKIC)!=0) { 3171 if((c2=u_foldCase(c, U_FOLD_CASE_EXCLUDE_SPECIAL_I))!=turkic) { 3172 log_err("u_foldCase(U+%04lx, turkic)=U+%04lx != U+%04lx\n", (long)c, (long)c2, (long)simple); 3173 } 3174 3175 length2=u_strFoldCase(t, LENGTHOF(t), s, length, U_FOLD_CASE_EXCLUDE_SPECIAL_I, &errorCode); 3176 if(length2!=turkicFullLength || 0!=u_memcmp(t, turkicFull, length2)) { 3177 log_err("u_strFoldCase(U+%04lx, turkic) does not fold properly\n", (long)c); 3178 } 3179 } 3180 } 3181 3182 /* test that c case-folds to itself */ 3183 static void 3184 testFoldToSelf(UChar32 c, int which) { 3185 UChar s[2]; 3186 int32_t length; 3187 3188 length=0; 3189 U16_APPEND_UNSAFE(s, length, c); 3190 testFold(c, which, c, c, s, length, s, length); 3191 } 3192 3193 struct CaseFoldingData { 3194 USet *notSeen; 3195 UChar32 prev, prevSimple; 3196 UChar prevFull[32]; 3197 int32_t prevFullLength; 3198 int which; 3199 }; 3200 typedef struct CaseFoldingData CaseFoldingData; 3201 3202 static void U_CALLCONV 3203 caseFoldingLineFn(void *context, 3204 char *fields[][2], int32_t fieldCount, 3205 UErrorCode *pErrorCode) { 3206 CaseFoldingData *pData=(CaseFoldingData *)context; 3207 char *end; 3208 UChar full[32]; 3209 UChar32 c, prev, simple; 3210 int32_t count; 3211 int which; 3212 char status; 3213 3214 /* get code point */ 3215 c=(UChar32)strtoul(u_skipWhitespace(fields[0][0]), &end, 16); 3216 end=(char *)u_skipWhitespace(end); 3217 if(end<=fields[0][0] || end!=fields[0][1]) { 3218 log_err("syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]); 3219 *pErrorCode=U_PARSE_ERROR; 3220 return; 3221 } 3222 3223 /* get the status of this mapping */ 3224 status=*u_skipWhitespace(fields[1][0]); 3225 if(status!='C' && status!='S' && status!='F' && status!='T') { 3226 log_err("unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]); 3227 *pErrorCode=U_PARSE_ERROR; 3228 return; 3229 } 3230 3231 /* get the mapping */ 3232 count=u_parseString(fields[2][0], full, 32, (uint32_t *)&simple, pErrorCode); 3233 if(U_FAILURE(*pErrorCode)) { 3234 log_err("error parsing CaseFolding.txt mapping at %s\n", fields[0][0]); 3235 return; 3236 } 3237 3238 /* there is a simple mapping only if there is exactly one code point (count is in UChars) */ 3239 if(count==0 || count>2 || (count==2 && U16_IS_SINGLE(full[1]))) { 3240 simple=c; 3241 } 3242 3243 if(c!=(prev=pData->prev)) { 3244 /* 3245 * Test remaining mappings for the previous code point. 3246 * If a turkic folding was not mentioned, then it should fold the same 3247 * as the regular simple case folding. 3248 */ 3249 UChar s[2]; 3250 int32_t length; 3251 3252 length=0; 3253 U16_APPEND_UNSAFE(s, length, prev); 3254 testFold(prev, (~pData->which)&CF_ALL, 3255 prev, pData->prevSimple, 3256 s, length, 3257 pData->prevFull, pData->prevFullLength); 3258 pData->prev=pData->prevSimple=c; 3259 length=0; 3260 U16_APPEND_UNSAFE(pData->prevFull, length, c); 3261 pData->prevFullLength=length; 3262 pData->which=0; 3263 } 3264 3265 /* 3266 * Turn the status into a bit set of case foldings to test. 3267 * Remember non-Turkic case foldings as defaults for Turkic mode. 3268 */ 3269 switch(status) { 3270 case 'C': 3271 which=CF_SIMPLE|CF_FULL; 3272 pData->prevSimple=simple; 3273 u_memcpy(pData->prevFull, full, count); 3274 pData->prevFullLength=count; 3275 break; 3276 case 'S': 3277 which=CF_SIMPLE; 3278 pData->prevSimple=simple; 3279 break; 3280 case 'F': 3281 which=CF_FULL; 3282 u_memcpy(pData->prevFull, full, count); 3283 pData->prevFullLength=count; 3284 break; 3285 case 'T': 3286 which=CF_TURKIC; 3287 break; 3288 default: 3289 which=0; 3290 break; /* won't happen because of test above */ 3291 } 3292 3293 testFold(c, which, simple, simple, full, count, full, count); 3294 3295 /* remember which case foldings of c have been tested */ 3296 pData->which|=which; 3297 3298 /* remove c from the set of ones not mentioned in CaseFolding.txt */ 3299 uset_remove(pData->notSeen, c); 3300 } 3301 3302 static void 3303 TestCaseFolding() { 3304 CaseFoldingData data={ NULL }; 3305 char *fields[3][2]; 3306 UErrorCode errorCode; 3307 3308 static char *lastLine= (char *)"10FFFF; C; 10FFFF;"; 3309 3310 errorCode=U_ZERO_ERROR; 3311 /* test BMP & plane 1 - nothing interesting above */ 3312 data.notSeen=uset_open(0, 0x1ffff); 3313 data.prevFullLength=1; /* length of full case folding of U+0000 */ 3314 3315 parseUCDFile("CaseFolding.txt", fields, 3, caseFoldingLineFn, &data, &errorCode); 3316 if(U_SUCCESS(errorCode)) { 3317 int32_t i, start, end; 3318 3319 /* add a pseudo-last line to finish testing of the actual last one */ 3320 fields[0][0]=lastLine; 3321 fields[0][1]=lastLine+6; 3322 fields[1][0]=lastLine+7; 3323 fields[1][1]=lastLine+9; 3324 fields[2][0]=lastLine+10; 3325 fields[2][1]=lastLine+17; 3326 caseFoldingLineFn(&data, fields, 3, &errorCode); 3327 3328 /* verify that all code points that are not mentioned in CaseFolding.txt fold to themselves */ 3329 for(i=0; 3330 0==uset_getItem(data.notSeen, i, &start, &end, NULL, 0, &errorCode) && 3331 U_SUCCESS(errorCode); 3332 ++i 3333 ) { 3334 do { 3335 testFoldToSelf(start, CF_ALL); 3336 } while(++start<=end); 3337 } 3338 } 3339 3340 uset_close(data.notSeen); 3341 } 3342